# src/data/acl_anthology_crawler.py (ScientificArgumentRecommender)
# Origin: https://github.com/ArneBinder/pie-document-level/pull/397 (commit ced4316)
import pyrootutils
root = pyrootutils.setup_root(
search_from=__file__,
indicator=[".project-root"],
pythonpath=True,
dotenv=True,
)
import os
from argparse import ArgumentParser, RawTextHelpFormatter
from dataclasses import dataclass, field
from pathlib import Path
from acl_anthology import Anthology
from tqdm import tqdm
from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers
from src.utils.pdf_utils.process_pdf import (
FulltextExtractor,
GrobidFulltextExtractor,
PDFDownloader,
)
HELP_MSG = """
Generate paper json files from an ACL Anthology collection, with fulltext extraction.
Iterate over entries in the ACL Anthology metadata, and for each entry:
1. extract relevant paper info from the xml entry
2. download pdf file
3. extract fulltext
4. format a json file and save
pre-requisites:
- Install the requirements: pip install acl-anthology-py>=0.4.3 bs4 jsonschema
- Get the meta data from ACL Anthology: git clone [email protected]:acl-org/acl-anthology.git
- Start Grobid Docker container: docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
"""
@dataclass
class XML2Jsons:
base_output_dir: Path
pdf_output_dir: Path
xml2raw_papers: XML2RawPapers
pdf_downloader: PDFDownloader = field(default_factory=PDFDownloader)
fulltext_extractor: FulltextExtractor = field(default_factory=GrobidFulltextExtractor)
show_progress: bool = True
@classmethod
def from_cli(cls) -> "XML2Jsons":
parser = ArgumentParser(description=HELP_MSG, formatter_class=RawTextHelpFormatter)
parser.add_argument(
"--base-output-dir", type=str, help="Directory to save all the paper json files"
)
parser.add_argument(
"--pdf-output-dir", type=str, help="Directory to save all the downloaded pdf files"
)
parser.add_argument(
"--anthology-data-dir",
type=str,
help="Path to ACL Anthology metadata directory, e.g., /path/to/acl-anthology-repo/data. "
"You can obtain the data via: git clone [email protected]:acl-org/acl-anthology.git",
)
parser.add_argument(
"--collection-id-filters",
nargs="+",
type=str,
default=None,
help="If provided, only papers from the collections whose id (Anthology ID) contains the "
"specified strings will be processed.",
)
parser.add_argument(
"--venue-id-whitelist",
nargs="+",
type=str,
default=None,
help="If provided, only papers from the specified venues will be processed. See here for "
"the list of venues: https://aclanthology.org/venues",
)
args = parser.parse_args()
return cls(
base_output_dir=Path(args.base_output_dir),
pdf_output_dir=Path(args.pdf_output_dir),
xml2raw_papers=XML2RawPapers(
anthology=Anthology(datadir=args.anthology_data_dir),
collection_id_filters=args.collection_id_filters,
venue_id_whitelist=args.venue_id_whitelist,
),
)
def run(self):
os.makedirs(self.pdf_output_dir, exist_ok=True)
papers = self.xml2raw_papers()
if self.show_progress:
papers = tqdm(list(papers), desc="extracting fulltext")
for paper in papers:
volume_dir = self.base_output_dir / paper.volume_id
if paper.url is not None:
pdf_save_path = self.pdf_downloader.download(
paper.url, opath=self.pdf_output_dir / f"{paper.name}.pdf"
)
fulltext_extraction_output = self.fulltext_extractor(pdf_save_path)
if fulltext_extraction_output:
plain_text, extraction_data = fulltext_extraction_output
paper.fulltext = extraction_data.get("sections")
if not paper.abstract:
paper.abstract = extraction_data.get("abstract")
paper.save(str(volume_dir))
if __name__ == "__main__":
xml2jsons = XML2Jsons.from_cli()
xml2jsons.run()