# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Natural Instruction V2 Dataset."""
import json
import os
import random
import datasets
logger = datasets.logging.get_logger(__name__)
_CITATION = """
@article{wang2022benchmarking,
  title={Benchmarking Generalization via In-Context Instructions on 1,600+ Language Tasks},
  author={Wang, Yizhong and Mishra, Swaroop and Alipoormolabashi, Pegah and Kordi, Yeganeh and others},
  journal={arXiv preprint arXiv:2204.07705},
  year={2022}
}
"""

_DESCRIPTION = """
Natural-Instructions v2 is a benchmark of 1,600+ diverse language tasks and their expert-written instructions.
It covers 70+ distinct task types, such as tagging, in-filling, and rewriting.
These tasks were contributed by NLP practitioners in the community and refined
through an iterative peer-review process to ensure their quality.
"""

_URL = "https://instructions.apps.allenai.org/"
_VERSION = "2.6"
_RELEASE_URL = (
    f"https://api.github.com/repos/allenai/natural-instructions/zipball/v{_VERSION}"
)
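
# The pinned release archive is expected to extract to the following layout
# (inferred from the loader below; the name of the top-level directory varies
# with the pinned commit):
#
#   <data_dir>/
#     splits/default/train_tasks.txt   # one task name per line
#     splits/default/test_tasks.txt
#     tasks/<task_name>.json           # task metadata plus its "Instances"
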
class NIConfig(datasets.BuilderConfig):
    """BuilderConfig for Natural Instructions V2."""

    def __init__(
        self,
        split_subdir="splits/default/",
        task_subdir="tasks/",
        max_num_instances_per_task: int = 100,
        max_num_instances_per_eval_task: int = 100,
        seed=42,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.split_subdir: str = split_subdir
        self.task_subdir: str = task_subdir
        self.seed: int = seed
        self.max_num_instances_per_task: int = max_num_instances_per_task
        # Fall back to the per-task training cap when no eval cap is given
        # (note that `or` also replaces an explicit 0 or None).
        self.max_num_instances_per_eval_task: int = (
            max_num_instances_per_eval_task or max_num_instances_per_task
        )
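
# The config knobs above can be overridden per call to `load_dataset`, e.g.
# (a sketch; the first argument is wherever this script lives on disk):
#
#   load_dataset("sni_dataset.py", max_num_instances_per_task=64, seed=0)
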
class NaturalInstructions(datasets.GeneratorBasedBuilder):
    """NaturalInstructions Dataset."""

    VERSION = datasets.Version(_VERSION + ".0")
    BUILDER_CONFIG_CLASS = NIConfig
    BUILDER_CONFIGS = [
        NIConfig(
            name="default",
            description="Default config for NaturalInstructions V2",
        )
    ]
    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        return datasets.DatasetInfo(
            version=self.VERSION,
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),  # instance_id
                    "Task": datasets.Value("string"),
                    "Contributors": datasets.Value("string"),
                    "Source": [datasets.Value("string")],
                    "URL": [datasets.Value("string")],
                    "Categories": [datasets.Value("string")],
                    "Reasoning": [datasets.Value("string")],
                    "Definition": [datasets.Value("string")],
                    "Positive Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Negative Examples": [
                        {
                            "input": datasets.Value("string"),
                            "output": datasets.Value("string"),
                            "explanation": datasets.Value("string"),
                        }
                    ],
                    "Input_language": [datasets.Value("string")],
                    "Output_language": [datasets.Value("string")],
                    "Instruction_language": [datasets.Value("string")],
                    "Domains": [datasets.Value("string")],
                    "Instance": {
                        "id": datasets.Value("string"),
                        "input": datasets.Value("string"),
                        "output": [datasets.Value("string")],
                    },
                }
            ),
            license="",
            homepage=_URL,
            citation=_CITATION,
        )
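
    # For reference, each example yielded by this builder has the shape
    # (values illustrative only):
    #   {"id": "...", "Task": "<task_num>_<source>_<category>", ...,
    #    "Instance": {"id": "...", "input": "...", "output": ["..."]}}
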
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        if self.config.data_dir is None:
            dl_path = dl_manager.download_and_extract(_RELEASE_URL)
            # The GitHub zipball extracts to a single top-level directory
            # (repo name plus commit hash), so take the first entry.
            self.config.data_dir = os.path.join(dl_path, os.listdir(dl_path)[0])
        split_dir = os.path.join(self.config.data_dir, self.config.split_subdir)
        task_dir = os.path.join(self.config.data_dir, self.config.task_subdir)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "path": os.path.join(split_dir, "train_tasks.txt"),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": self.config.max_num_instances_per_task,
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "path": os.path.join(split_dir, "test_tasks.txt"),
                    "task_dir": task_dir,
                    "max_num_instances_per_task": self.config.max_num_instances_per_eval_task,
                    "split": datasets.Split.TEST,
                },
            ),
        ]
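
    # Note: passing `data_dir=...` through `load_dataset` skips the download
    # in `_split_generators` and reads an existing natural-instructions
    # checkout directly, e.g. (a sketch):
    #   load_dataset("sni_dataset.py", data_dir="/path/to/natural-instructions")
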
    def _generate_examples(
        self, path=None, task_dir=None, max_num_instances_per_task=None, split=None
    ):
        """Yields examples."""
        logger.info(f"Reading {split} tasks from {path}")
        with open(path, encoding="utf-8") as split_f:
            for line in split_f:
                task_name = line.strip()
                task_path = os.path.join(task_dir, task_name + ".json")
                with open(task_path, encoding="utf-8") as task_f:
                    task_data = json.loads(task_f.read())
                # Rename the task to "<task_num>_<source>_<category>".
                task_name = (
                    task_name.split("_")[0]
                    + "_"
                    + "_".join(task_data["Source"]).lower()
                    + "_"
                    + "_".join(task_data["Categories"][0].lower().split())
                )
                task_data["Task"] = task_name
                if "Instruction Source" in task_data:
                    task_data.pop("Instruction Source")
                all_instances = task_data.pop("Instances")
                if split == datasets.Split.TEST:
                    # For test tasks, the first 100 instances in the file are a
                    # label-balanced sample selected for efficient evaluation
                    # and placed first for reproducibility, so take them here.
                    instances = all_instances[:100]
                else:
                    instances = all_instances
                if (
                    max_num_instances_per_task is not None
                    and max_num_instances_per_task >= 0
                ):
                    # Subsample deterministically with the configured seed.
                    random.Random(self.config.seed).shuffle(instances)
                    instances = instances[:max_num_instances_per_task]
                for idx, instance in enumerate(instances):
                    example = task_data.copy()
                    example["id"] = instance["id"]
                    example["Instance"] = instance
                    yield f"{task_name}_{idx}", example