Spaces:
Sleeping
Sleeping
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Natural Instruction V2 Dataset."""
import json | |
import os | |
import random | |
import datasets | |
# Module-level logger, obtained through the logging helpers of the `datasets` package.
logger = datasets.logging.get_logger(__name__)
# BibTeX entry exposed through `DatasetInfo.citation`.
_CITATION = """
@article{wang2022benchmarking,
title={Benchmarking Generalization via In-Context Instructions on 1,600+ Language Tasks},
author={Wang, Yizhong and Mishra, Swaroop and Alipoormolabashi, Pegah and Kordi, Yeganeh and others},
journal={arXiv preprint arXiv:2204.07705},
year={2022}
}
"""

# Human-readable summary exposed through `DatasetInfo.description`.
_DESCRIPTION = """
Natural-Instructions v2 is a benchmark of 1,600+ diverse language tasks and their expert-written instructions.
It covers 70+ distinct task types, such as tagging, in-filling, and rewriting.
These tasks are collected with contributions of NLP practitioners in the community and
through an iterative peer review process to ensure their quality.
"""

# Project homepage exposed through `DatasetInfo.homepage`.
_URL = "https://instructions.apps.allenai.org/"

# Release tag of the natural-instructions repository that gets downloaded.
_VERSION = "2.6"

# GitHub zipball of the tagged release; fetched when no local data_dir is given.
_RELEASE_URL = (
    f"https://api.github.com/repos/allenai/natural-instructions/zipball/v{_VERSION}"
)
class NIConfig(datasets.BuilderConfig):
    """Builder configuration for the Natural Instructions V2 dataset.

    Args:
        split_subdir: Sub-directory of the data directory that holds the
            split files (``train_tasks.txt`` / ``test_tasks.txt``).
        task_subdir: Sub-directory of the data directory that holds the
            per-task ``<task_name>.json`` files.
        max_num_instances_per_task: Cap on the number of instances sampled
            from each training task. ``None`` or a negative value disables
            the cap; ``0`` yields no instances.
        max_num_instances_per_eval_task: Same cap for test tasks. ``None``
            falls back to ``max_num_instances_per_task``.
        seed: Seed of the random generator used to shuffle the instances of
            a task before the cap is applied.
        *args: Forwarded to ``datasets.BuilderConfig``.
        **kwargs: Forwarded to ``datasets.BuilderConfig``.

    Note:
        The dataset-specific parameters come first in the signature, so
        positional arguments bind to them before reaching ``*args``; pass
        ``BuilderConfig`` options such as ``name`` by keyword.
    """

    def __init__(
        self,
        split_subdir="splits/default/",
        task_subdir="tasks/",
        max_num_instances_per_task: int = 100,
        max_num_instances_per_eval_task: int = 100,
        seed=42,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.split_subdir: str = split_subdir
        self.task_subdir: str = task_subdir
        self.seed: int = seed
        self.max_num_instances_per_task: int = max_num_instances_per_task
        # Only an unset (None) eval cap inherits the training cap. A plain
        # truthiness test would also discard an explicit 0, which is a
        # meaningful cap for the example generator.
        if max_num_instances_per_eval_task is None:
            max_num_instances_per_eval_task = max_num_instances_per_task
        self.max_num_instances_per_eval_task: int = max_num_instances_per_eval_task
class NaturalInstructions(datasets.GeneratorBasedBuilder):
    """Dataset builder for Natural Instructions V2.

    Each generated example carries the metadata of one task (definition,
    positive/negative examples, languages, ...) together with exactly one
    of that task's instances under the ``"Instance"`` key.
    """

    VERSION = datasets.Version(_VERSION + ".0")
    BUILDER_CONFIG_CLASS = NIConfig
    BUILDER_CONFIGS = [
        NIConfig(
            name="default",
            description="Default config for NaturalInstructions V2",
        )
    ]
    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        """Return the ``DatasetInfo`` describing the example schema and metadata."""
        return datasets.DatasetInfo(
            version=self.VERSION,
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),  # instance_id
                    "Task": datasets.Value("string"),
                    "Contributors": datasets.Value("string"),
                    "Source": [datasets.Value("string")],
                    "URL": [datasets.Value("string")],
                    "Categories": [datasets.Value("string")],
                    "Reasoning": [datasets.Value("string")],
                    "Definition": [datasets.Value("string")],
                    "Positive Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Negative Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Input_language": [datasets.Value("string")],
                    "Output_language": [datasets.Value("string")],
                    "Instruction_language": [datasets.Value("string")],
                    "Domains": [datasets.Value("string")],
                    "Instance": {
                        "id": datasets.Value("string"),
                        "input": datasets.Value("string"),
                        "output": [datasets.Value("string")],
                    },
                }
            ),
            license="",
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        When the config has no ``data_dir``, the tagged release archive is
        downloaded and extracted, and ``self.config.data_dir`` is set to the
        first entry of the extraction directory.

        Args:
            dl_manager: Download manager used to fetch and extract the
                release archive.

        Returns:
            A list with the TRAIN and TEST ``datasets.SplitGenerator`` objects.
        """
        if self.config.data_dir is None:
            dl_path = dl_manager.download_and_extract(_RELEASE_URL)
            # get the extracted directory
            self.config.data_dir = os.path.join(dl_path, os.listdir(dl_path)[0])

        split_dir = os.path.join(self.config.data_dir, self.config.split_subdir)
        task_dir = os.path.join(self.config.data_dir, self.config.task_subdir)

        # (split, file listing the split's tasks, per-task instance cap)
        split_specs = (
            (
                datasets.Split.TRAIN,
                "train_tasks.txt",
                self.config.max_num_instances_per_task,
            ),
            (
                datasets.Split.TEST,
                "test_tasks.txt",
                self.config.max_num_instances_per_eval_task,
            ),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "path": os.path.join(split_dir, task_list_file),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": instance_cap,
                    "split": split,
                },
            )
            for split, task_list_file, instance_cap in split_specs
        ]

    def _generate_examples(
        self, path=None, task_dir=None, max_num_instances_per_task=None, split=None
    ):
        """Yields examples.

        Args:
            path: Text file listing one task name per line.
            task_dir: Directory containing the ``<task_name>.json`` files.
            max_num_instances_per_task: Cap on the instances kept per task.
                ``None`` or a negative value keeps all of them.
            split: The split being generated; for ``datasets.Split.TEST``
                only the first 100 instances of each task are considered.

        Yields:
            ``(key, example)`` pairs, where ``key`` is
            ``"<renamed task name>_<index>"`` and ``example`` is the task
            metadata plus one instance.
        """
        logger.info(f"Reading {split} tasks from {path}")
        with open(path, encoding="utf-8") as split_f:
            for line in split_f:
                task_file_stem = line.strip()
                if not task_file_stem:
                    # A blank line (e.g. a trailing newline) would resolve to
                    # "<task_dir>/.json" and fail to open.
                    continue

                task_path = os.path.join(task_dir, task_file_stem + ".json")
                # Parse eagerly so the task file is closed before yielding.
                with open(task_path, encoding="utf-8") as task_f:
                    task_data = json.load(task_f)

                # rename task name to task_num + source + category
                task_num = task_file_stem.split("_")[0]
                source_part = "_".join(task_data["Source"]).lower()
                category_part = "_".join(task_data["Categories"][0].lower().split())
                task_name = f"{task_num}_{source_part}_{category_part}"
                task_data["Task"] = task_name

                # This key is not part of the declared features.
                task_data.pop("Instruction Source", None)
                all_instances = task_data.pop("Instances")

                if split == datasets.Split.TEST:
                    # for testing tasks, 100 instances are selected for efficient
                    # evaluation and they are label-balanced.
                    # we put them in the first for reproducibility.
                    # so, we use them here
                    instances = all_instances[:100]
                else:
                    instances = all_instances

                if (
                    max_num_instances_per_task is not None
                    and max_num_instances_per_task >= 0
                ):
                    # A fresh generator per task keeps the sampled subset
                    # independent of how many tasks were processed before.
                    random.Random(self.config.seed).shuffle(instances)
                    instances = instances[:max_num_instances_per_task]

                for idx, instance in enumerate(instances):
                    example = task_data.copy()
                    example["id"] = instance["id"]
                    example["Instance"] = instance
                    yield f"{task_name}_{idx}", example