# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Natural Instruction V2 Dataset."""

import json
import os
import random

import datasets

logger = datasets.logging.get_logger(__name__)

_CITATION = """
@article{wang2022benchmarking,
  title={Benchmarking Generalization via In-Context Instructions on 1,600+ Language Tasks},
  author={Wang, Yizhong and Mishra, Swaroop and Alipoormolabashi, Pegah and Kordi, Yeganeh and others},
  journal={arXiv preprint arXiv:2204.07705},
  year={2022}
}
"""

_DESCRIPTION = """
Natural-Instructions v2 is a benchmark of 1,600+ diverse language tasks and their expert-written instructions. It covers 70+ distinct task types, such as tagging, in-filling, and rewriting. These tasks are collected with contributions of NLP practitioners in the community and through an iterative peer review process to ensure their quality. 
"""

_URL = "https://instructions.apps.allenai.org/"
_VERSION = "2.6"
_RELEASE_URL = (
    f"https://api.github.com/repos/allenai/natural-instructions/zipball/v{_VERSION}"
)


class NIConfig(datasets.BuilderConfig):
    """BuilderConfig for Natural Instructions V2.

    Args:
        split_subdir: Directory (relative to ``data_dir``) holding the
            ``train_tasks.txt`` / ``test_tasks.txt`` split files.
        task_subdir: Directory (relative to ``data_dir``) holding the
            per-task ``*.json`` files.
        max_num_instances_per_task: Cap on instances sampled per training
            task (negative/None disables the cap downstream).
        max_num_instances_per_eval_task: Cap on instances per eval task;
            falls back to ``max_num_instances_per_task`` when falsy.
        seed: Seed for the per-task instance shuffle, making sampling
            reproducible.
    """

    def __init__(
        self,
        split_subdir="splits/default/",
        task_subdir="tasks/",
        max_num_instances_per_task: int = 100,
        max_num_instances_per_eval_task: int = 100,
        seed=42,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.split_subdir: str = split_subdir
        self.task_subdir: str = task_subdir
        self.seed: int = seed
        self.max_num_instances_per_task: int = max_num_instances_per_task
        # Falsy eval cap (0/None) falls back to the training-task cap.
        self.max_num_instances_per_eval_task: int = (
            max_num_instances_per_eval_task or max_num_instances_per_task
        )


class NaturalInstructions(datasets.GeneratorBasedBuilder):
    """NaturalInstructions Dataset."""

    VERSION = datasets.Version(_VERSION + ".0")
    BUILDER_CONFIG_CLASS = NIConfig
    BUILDER_CONFIGS = [
        NIConfig(
            name="default",
            description="Default config for NaturalInstructions V2",
        )
    ]
    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        """Return the DatasetInfo describing the feature schema of each example."""
        return datasets.DatasetInfo(
            version=self.VERSION,
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),  # instance_id
                    "Task": datasets.Value("string"),
                    "Contributors": datasets.Value("string"),
                    "Source": [datasets.Value("string")],
                    "URL": [datasets.Value("string")],
                    "Categories": [datasets.Value("string")],
                    "Reasoning": [datasets.Value("string")],
                    "Definition": [datasets.Value("string")],
                    "Positive Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Negative Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Input_language": [datasets.Value("string")],
                    "Output_language": [datasets.Value("string")],
                    "Instruction_language": [datasets.Value("string")],
                    "Domains": [datasets.Value("string")],
                    "Instance": {
                        "id": datasets.Value("string"),
                        "input": datasets.Value("string"),
                        "output": [datasets.Value("string")],
                    },
                }
            ),
            license="",
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        Downloads and extracts the release zipball when no ``data_dir`` is
        given, then wires each split to its task-name list file.
        """
        if self.config.data_dir is None:
            dl_path = dl_manager.download_and_extract(_RELEASE_URL)
            # The zipball extracts to a single versioned subdirectory; sort
            # so the choice does not depend on filesystem listing order.
            self.config.data_dir = os.path.join(
                dl_path, sorted(os.listdir(dl_path))[0]
            )

        split_dir = os.path.join(self.config.data_dir, self.config.split_subdir)
        task_dir = os.path.join(self.config.data_dir, self.config.task_subdir)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "path": os.path.join(split_dir, "train_tasks.txt"),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": self.config.max_num_instances_per_task,
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "path": os.path.join(split_dir, "test_tasks.txt"),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": self.config.max_num_instances_per_eval_task,
                    "split": datasets.Split.TEST,
                },
            ),
        ]

    def _generate_examples(
        self, path=None, task_dir=None, max_num_instances_per_task=None, split=None
    ):
        """Yields examples.

        Args:
            path: Text file listing one task name per line.
            task_dir: Directory containing ``<task_name>.json`` files.
            max_num_instances_per_task: If non-negative, shuffle (seeded)
                and truncate each task's instances to this count.
            split: The ``datasets.Split`` being generated; TEST uses only
                the first 100 instances of each task.

        Yields:
            ``(key, example)`` pairs where ``key`` is ``"{task_name}_{idx}"``.
        """
        logger.info("Reading %s tasks from %s", split, path)
        with open(path, encoding="utf-8") as split_f:
            for line in split_f:
                task_name = line.strip()
                task_path = os.path.join(task_dir, task_name + ".json")
                with open(task_path, encoding="utf-8") as task_f:
                    # Parse directly from the file object; no need to
                    # materialize the raw string first.
                    task_data = json.load(task_f)
                # rename task name to task_num + source + category
                task_name = (
                    task_name.split("_")[0]
                    + "_"
                    + "_".join(task_data["Source"]).lower()
                    + "_"
                    + "_".join(task_data["Categories"][0].lower().split())
                )
                task_data["Task"] = task_name
                if "Instruction Source" in task_data:
                    task_data.pop("Instruction Source")
                all_instances = task_data.pop("Instances")
                if split == datasets.Split.TEST:
                    # for testing tasks, 100 instances are selected for
                    # efficient evaluation and they are label-balanced.
                    # we put them in the first for reproducibility.
                    # so, we use them here
                    instances = all_instances[:100]
                else:
                    instances = all_instances
                if (
                    max_num_instances_per_task is not None
                    and max_num_instances_per_task >= 0
                ):
                    # Seeded per-task shuffle keeps sampling reproducible
                    # across runs for the same config seed.
                    random.Random(self.config.seed).shuffle(instances)
                    instances = instances[:max_num_instances_per_task]
                for idx, instance in enumerate(instances):
                    example = task_data.copy()
                    example["id"] = instance["id"]
                    example["Instance"] = instance
                    yield f"{task_name}_{idx}", example