# src/data/acl_anthology_crawler.py (ScientificArgumentRecommender)
# Origin: https://github.com/ArneBinder/pie-document-level/pull/397 (commit ced4316)
import pyrootutils
root = pyrootutils.setup_root(
search_from=__file__,
indicator=[".project-root"],
pythonpath=True,
dotenv=True,
)
import os
from argparse import ArgumentParser, RawTextHelpFormatter
from dataclasses import dataclass, field
from pathlib import Path
from acl_anthology import Anthology
from tqdm import tqdm
from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers
from src.utils.pdf_utils.process_pdf import (
FulltextExtractor,
GrobidFulltextExtractor,
PDFDownloader,
)
HELP_MSG = """
Generate paper json files from an ACL Anthology collection, with fulltext extraction.
Iterate over entries in the ACL Anthology metadata, and for each entry:
1. extract relevant paper info from the xml entry
2. download pdf file
3. extract fulltext
4. format a json file and save
pre-requisites:
- Install the requirements: pip install acl-anthology-py>=0.4.3 bs4 jsonschema
- Get the meta data from ACL Anthology: git clone [email protected]:acl-org/acl-anthology.git
- Start Grobid Docker container: docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
"""
@dataclass
class XML2Jsons:
base_output_dir: Path
pdf_output_dir: Path
xml2raw_papers: XML2RawPapers
pdf_downloader: PDFDownloader = field(default_factory=PDFDownloader)
fulltext_extractor: FulltextExtractor = field(default_factory=GrobidFulltextExtractor)
show_progress: bool = True
@classmethod
def from_cli(cls) -> "XML2Jsons":
parser = ArgumentParser(description=HELP_MSG, formatter_class=RawTextHelpFormatter)
parser.add_argument(
"--base-output-dir", type=str, help="Directory to save all the paper json files"
)
parser.add_argument(
"--pdf-output-dir", type=str, help="Directory to save all the downloaded pdf files"
)
parser.add_argument(
"--anthology-data-dir",
type=str,
help="Path to ACL Anthology metadata directory, e.g., /path/to/acl-anthology-repo/data. "
"You can obtain the data via: git clone [email protected]:acl-org/acl-anthology.git",
)
parser.add_argument(
"--collection-id-filters",
nargs="+",
type=str,
default=None,
help="If provided, only papers from the collections whose id (Anthology ID) contains the "
"specified strings will be processed.",
)
parser.add_argument(
"--venue-id-whitelist",
nargs="+",
type=str,
default=None,
help="If provided, only papers from the specified venues will be processed. See here for "
"the list of venues: https://aclanthology.org/venues",
)
args = parser.parse_args()
return cls(
base_output_dir=Path(args.base_output_dir),
pdf_output_dir=Path(args.pdf_output_dir),
xml2raw_papers=XML2RawPapers(
anthology=Anthology(datadir=args.anthology_data_dir),
collection_id_filters=args.collection_id_filters,
venue_id_whitelist=args.venue_id_whitelist,
),
)
def run(self):
os.makedirs(self.pdf_output_dir, exist_ok=True)
papers = self.xml2raw_papers()
if self.show_progress:
papers = tqdm(list(papers), desc="extracting fulltext")
for paper in papers:
volume_dir = self.base_output_dir / paper.volume_id
if paper.url is not None:
pdf_save_path = self.pdf_downloader.download(
paper.url, opath=self.pdf_output_dir / f"{paper.name}.pdf"
)
fulltext_extraction_output = self.fulltext_extractor(pdf_save_path)
if fulltext_extraction_output:
plain_text, extraction_data = fulltext_extraction_output
paper.fulltext = extraction_data.get("sections")
if not paper.abstract:
paper.abstract = extraction_data.get("abstract")
paper.save(str(volume_dir))
if __name__ == "__main__":
xml2jsons = XML2Jsons.from_cli()
xml2jsons.run()