# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Natural Instruction V2 Dataset."""
import json
import os
import random
import datasets
logger = datasets.logging.get_logger(__name__)
_CITATION = """
@article{wang2022benchmarking,
  title={Benchmarking Generalization via In-Context Instructions on 1,600+ Language Tasks},
  author={Wang, Yizhong and Mishra, Swaroop and Alipoormolabashi, Pegah and Kordi, Yeganeh and others},
  journal={arXiv preprint arXiv:2204.07705},
  year={2022}
}
"""

_DESCRIPTION = """
Natural-Instructions v2 is a benchmark of 1,600+ diverse language tasks and their expert-written instructions.
It covers 70+ distinct task types, such as tagging, in-filling, and rewriting.
These tasks were contributed by NLP practitioners in the community and refined
through an iterative peer-review process to ensure their quality.
"""

_URL = "https://instructions.apps.allenai.org/"
_VERSION = "2.6"
_RELEASE_URL = (
    f"https://api.github.com/repos/allenai/natural-instructions/zipball/v{_VERSION}"
)
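
# The pinned release archive is expected to extract to the following layout
# (inferred from the loader below; the name of the top-level directory varies
# with the pinned commit):
#
#   <data_dir>/
#     splits/default/train_tasks.txt   # one task name per line
#     splits/default/test_tasks.txt
#     tasks/<task_name>.json           # task metadata plus its "Instances"
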
class NIConfig(datasets.BuilderConfig):
    """BuilderConfig for Natural Instructions V2."""

    def __init__(
        self,
        split_subdir="splits/default/",
        task_subdir="tasks/",
        max_num_instances_per_task: int = 100,
        max_num_instances_per_eval_task: int = 100,
        seed=42,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.split_subdir: str = split_subdir
        self.task_subdir: str = task_subdir
        self.seed: int = seed
        self.max_num_instances_per_task: int = max_num_instances_per_task
        # Fall back to the per-task training cap when no eval cap is given
        # (note that `or` also replaces an explicit 0 or None).
        self.max_num_instances_per_eval_task: int = (
            max_num_instances_per_eval_task or max_num_instances_per_task
        )
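
# The config knobs above can be overridden per call to `load_dataset`, e.g.
# (a sketch; the first argument is wherever this script lives on disk):
#
#   load_dataset("sni_dataset.py", max_num_instances_per_task=64, seed=0)
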
class NaturalInstructions(datasets.GeneratorBasedBuilder):
    """NaturalInstructions Dataset."""

    VERSION = datasets.Version(_VERSION + ".0")
    BUILDER_CONFIG_CLASS = NIConfig
    BUILDER_CONFIGS = [
        NIConfig(
            name="default",
            description="Default config for NaturalInstructions V2",
        )
    ]
    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        return datasets.DatasetInfo(
            version=self.VERSION,
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),  # instance_id
                    "Task": datasets.Value("string"),
                    "Contributors": datasets.Value("string"),
                    "Source": [datasets.Value("string")],
                    "URL": [datasets.Value("string")],
                    "Categories": [datasets.Value("string")],
                    "Reasoning": [datasets.Value("string")],
                    "Definition": [datasets.Value("string")],
                    "Positive Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Negative Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Input_language": [datasets.Value("string")],
                    "Output_language": [datasets.Value("string")],
                    "Instruction_language": [datasets.Value("string")],
                    "Domains": [datasets.Value("string")],
                    "Instance": {
                        "id": datasets.Value("string"),
                        "input": datasets.Value("string"),
                        "output": [datasets.Value("string")],
                    },
                }
            ),
            license="",
            homepage=_URL,
            citation=_CITATION,
        )
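
    # For reference, each example yielded by this builder has the shape
    # (values illustrative only):
    #   {"id": "...", "Task": "<task_num>_<source>_<category>", ...,
    #    "Instance": {"id": "...", "input": "...", "output": ["..."]}}
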
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        if self.config.data_dir is None:
            dl_path = dl_manager.download_and_extract(_RELEASE_URL)
            # The GitHub zipball extracts to a single top-level directory
            # (repo name plus commit hash), so take the first entry.
            self.config.data_dir = os.path.join(dl_path, os.listdir(dl_path)[0])
        split_dir = os.path.join(self.config.data_dir, self.config.split_subdir)
        task_dir = os.path.join(self.config.data_dir, self.config.task_subdir)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "path": os.path.join(split_dir, "train_tasks.txt"),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": self.config.max_num_instances_per_task,
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "path": os.path.join(split_dir, "test_tasks.txt"),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": self.config.max_num_instances_per_eval_task,
                    "split": datasets.Split.TEST,
                },
            ),
        ]
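
    # Note: passing `data_dir=...` through `load_dataset` skips the download
    # in `_split_generators` and reads an existing natural-instructions
    # checkout directly, e.g. (a sketch):
    #   load_dataset("sni_dataset.py", data_dir="/path/to/natural-instructions")
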
    def _generate_examples(
        self, path=None, task_dir=None, max_num_instances_per_task=None, split=None
    ):
        """Yields examples."""
        logger.info(f"Reading {split} tasks from {path}")
        with open(path, encoding="utf-8") as split_f:
            for line in split_f:
                task_name = line.strip()
                task_path = os.path.join(task_dir, task_name + ".json")
                with open(task_path, encoding="utf-8") as task_f:
                    task_data = json.loads(task_f.read())
                # Rename the task to "<task_num>_<source>_<category>".
                task_name = (
                    task_name.split("_")[0]
                    + "_"
                    + "_".join(task_data["Source"]).lower()
                    + "_"
                    + "_".join(task_data["Categories"][0].lower().split())
                )
                task_data["Task"] = task_name
                if "Instruction Source" in task_data:
                    task_data.pop("Instruction Source")
                all_instances = task_data.pop("Instances")
                if split == datasets.Split.TEST:
                    # For test tasks, the first 100 instances in the file are a
                    # label-balanced sample selected for efficient evaluation
                    # and placed first for reproducibility, so take them here.
                    instances = all_instances[:100]
                else:
                    instances = all_instances
                if (
                    max_num_instances_per_task is not None
                    and max_num_instances_per_task >= 0
                ):
                    # Subsample deterministically with the configured seed.
                    random.Random(self.config.seed).shuffle(instances)
                    instances = instances[:max_num_instances_per_task]
                for idx, instance in enumerate(instances):
                    example = task_data.copy()
                    example["id"] = instance["id"]
                    example["Instance"] = instance
                    yield f"{task_name}_{idx}", example