# Standalone script that converts model predictions into hard and soft
# hallucination labels and calculates the IoU and correlation scores, using
# the settings of our best model in SemEval 2025 Task 3.
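#
# Illustrative usage (the script and file names are examples, not part of the
# task release):
#
#   python predict_spans.py my-finetuned-qa-model mushroom_val.jsonl predictions.jsonl
#
# The evaluation file is expected to be JSON Lines with the fields read below:
# "id", "model_input" (the question) and "model_output_text" (the generated
# answer used as context), plus optional gold annotations "hard_labels"
# (a list of [start, end] character offsets) and "soft_labels" (a list of
# {"start", "end", "prob"} dictionaries).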
import argparse
import collections

import jsonlines
import numpy as np
from datasets import load_dataset
from scipy.stats import spearmanr
from tqdm.auto import tqdm
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

def add_answers_column(example):
    """Converts gold hard label spans into a SQuAD-style "answers" column."""
    starts, texts = [], []
    for hard_label in example["hard_labels"]:
        starts.append(hard_label[0])
        texts.append(example["context"][hard_label[0]:hard_label[1]])
    example["answers"] = {"answer_start": starts, "text": texts}
    return example

def to_dataset(file_path):
    """Loads the JSON Lines evaluation file as a Hugging Face dataset."""
    mushroom = load_dataset("json", data_files=file_path)["train"]
    mushroom = mushroom.rename_column("model_output_text", "context")
    mushroom = mushroom.rename_column("model_input", "question")
    if "hard_labels" in mushroom.column_names:
        mushroom = mushroom.map(add_answers_column)
    else:
        print("No hard labels found in the evaluation data: only generating predictions.")

    return mushroom

def preprocess_examples(examples, tokenizer):
    """
    Tokenizes question/context pairs into overlapping features and masks the
    offset mapping of every token that is not part of the context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs


def score_iou(ref_dict, pred_dict):
    """
    Computes intersection-over-union between reference and predicted hard
    labels, for a single datapoint.

    Arguments:
        ref_dict (dict): a gold reference datapoint,
        pred_dict (dict): a model's prediction

    Returns:
        float: The IoU, or 1.0 if neither the reference nor the prediction contains hallucinations
    """
    # ensure the prediction is correctly matched to its reference
    assert ref_dict['id'] == pred_dict['id']
    # convert annotations to sets of indices
    ref_indices = {idx for span in ref_dict['hard_labels'] for idx in range(*span)}
    pred_indices = {idx for span in pred_dict['hard_labels'] for idx in range(*span)}
    # avoid division by zero
    if not pred_indices and not ref_indices: return 1.
    # otherwise compute & return IoU
    return len(ref_indices & pred_indices) / len(ref_indices | pred_indices)

def score_cor(ref_dict, pred_dict):
    """computes Spearman correlation between predicted and reference soft labels, for a single datapoint.
    inputs:
    - ref_dict: a gold reference datapoint,
    - pred_dict: a model's prediction
    returns:
    the Spearman correlation, or a binarized exact match (0.0 or 1.0) if the reference or prediction contains no variation
    """
    # ensure the prediction is correctly matched to its reference
    assert ref_dict['id'] == pred_dict['id']
    # convert annotations to vectors of observations
    ref_vec = [0.] * ref_dict['text_len']
    pred_vec = [0.] * ref_dict['text_len']
    for span in ref_dict['soft_labels']:
        for idx in range(span['start'], span['end']):
            ref_vec[idx] = span['prob']
    for span in pred_dict['soft_labels']:
        for idx in range(span['start'], span['end']):
            pred_vec[idx] = span['prob']
    # constant series (i.e., no hallucination) => cor is undef
    if len({round(flt, 8) for flt in pred_vec}) == 1 or len({round(flt, 8) for flt in ref_vec}) == 1 :
        return float(len({round(flt, 8) for flt in ref_vec}) == len({round(flt, 8) for flt in pred_vec}))
    # otherwise compute Spearman's rho
    return spearmanr(ref_vec, pred_vec).correlation

def infer_soft_labels(hard_labels):
    """reformat hard labels into soft labels with prob 1"""
    return [
        {
            'start': start,
            'end': end,
            'prob': 1.0,
        }
        for start, end in hard_labels
    ]

def find_possible_spans(answers, example):
    """
    Creates and filters possible hallucination spans.

    Arguments:
        answers (list): List of dictionaries with candidate spans as text and
                        their logit scores.
        example (dict): The instance being predicted; its context is used to
                        map each predicted span back to start and end indices.

    Returns:
        tuple: A list of [start, end] hard labels and the matching soft
               labels with probability 1.0.
    """
    best_answer = max(answers, key=lambda x: x["logit_score"])
    threshold = best_answer["logit_score"] * 0.8
    hard_labels = []
    for answer in answers:
        if answer["logit_score"] > threshold:
            start_index = example["context"].index(answer["text"])
            end_index = start_index + len(answer["text"])
            hard_labels.append([start_index, end_index])
    soft_labels = infer_soft_labels(hard_labels)
    return hard_labels, soft_labels

def compute_metrics(start_logits, end_logits, features, examples, predictions_file):
    """
    Function to process predictions, create spans and if possible,
    calculates IoU

    Arguments:
        args (ArgumentParser): Arguments supplied by user.
        start_logits (list): Logits of all start positions.
        end_logits (list): Logits of all end positions.
        features (Dataset): Dataset containing features of questions and context.
        examples (Dataset): Dataset containing examples with hard labels.

    Returns:
        None
    """
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Consider only the 20 highest-scoring start and end positions
            start_indexes = np.argsort(start_logit)[-1: -20 - 1: -1].tolist()
            end_indexes = np.argsort(end_logit)[-1: -20 - 1: -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a negative length or more than 30 tokens
                    if (
                            end_index < start_index
                            or end_index - start_index + 1 > 30
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0]: offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            hard_labels, soft_labels = find_possible_spans(answers, example)
            predicted_answers.append(
                {"id": example_id, "hard_labels": hard_labels, "soft_labels": soft_labels}
            )
        else:
            predicted_answers.append({"id": example_id, "hard_labels": [], "soft_labels": []})

    with jsonlines.open(predictions_file, mode="w") as writer:
        writer.write_all(predicted_answers)

    if "answers" in examples.column_names:
        true_answers = [{"id": ex["id"], "hard_labels": ex["hard_labels"], "soft_labels": ex["soft_labels"],
                         "text_len": len(ex["context"])} for ex in examples]
        ious = np.array([score_iou(r, d) for r, d in zip(true_answers, predicted_answers)])
        cors = np.array([score_cor(r, d) for r, d in zip(true_answers, predicted_answers)])

        print(f"IOU: {ious.mean():.8f}, COR: {cors.mean():.8f}")
    else:
        print("Evaluation data contained no answers. No scores to show.")

def main(model_path, evaluation_file_path, output_file):
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_path
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_path
    )
    # Initialize Trainer
    args = TrainingArguments(
        output_dir="output_dir",
        per_device_eval_batch_size=16,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
    )

    mushroom_dataset = to_dataset(evaluation_file_path)
    features = mushroom_dataset.map(
        preprocess_examples,
        batched=True,
        remove_columns=mushroom_dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer}
    )

    predictions, _, _ = trainer.predict(features)
    start_logits, end_logits = predictions
    compute_metrics(start_logits, end_logits, features, mushroom_dataset, output_file)


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    p.add_argument('model_name', type=str)
    p.add_argument('evaluation_file_path', type=str)
    p.add_argument('output_file', type=str)
    a = p.parse_args()
    main(a.model_name, a.evaluation_file_path, a.output_file)