|
import pyrootutils |
|
|
|
# Resolve the project root by searching upwards from this file for the
# ".project-root" marker. This call is placed ahead of the `src.*` imports
# below; `pythonpath=True` and `dotenv=True` are forwarded to pyrootutils
# (presumably to make the root importable and to load a `.env` file -- see
# the pyrootutils documentation).
root = pyrootutils.setup_root(
    search_from=__file__,
    indicator=[".project-root"],
    pythonpath=True,
    dotenv=True,
)
|
|
|
import os |
|
from argparse import ArgumentParser, RawTextHelpFormatter |
|
from dataclasses import dataclass, field |
|
from pathlib import Path |
|
|
|
from acl_anthology import Anthology |
|
from tqdm import tqdm |
|
|
|
from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers |
|
from src.utils.pdf_utils.process_pdf import ( |
|
FulltextExtractor, |
|
GrobidFulltextExtractor, |
|
PDFDownloader, |
|
) |
|
|
|
# Description text for the command line interface. It is passed to
# ArgumentParser together with RawTextHelpFormatter, so the line breaks
# below are shown to the user exactly as written.
HELP_MSG = """
Generate paper json files from an ACL Anthology collection, with fulltext extraction.

Iterate over entries in the ACL Anthology metadata, and for each entry:
1. extract relevant paper info from the xml entry
2. download pdf file
3. extract fulltext
4. format a json file and save

pre-requisites:
- Install the requirements: pip install "acl-anthology-py>=0.4.3" bs4 jsonschema
- Get the meta data from ACL Anthology: git clone git@github.com:acl-org/acl-anthology.git
- Start Grobid Docker container: docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
"""
|
|
|
|
|
@dataclass
class XML2Jsons:
    """Create per-paper output files from ACL Anthology metadata, enriched with fulltext.

    For each paper produced by ``xml2raw_papers`` that has a URL, the PDF is
    downloaded, handed to ``fulltext_extractor`` and, if the extraction
    returns a result, the paper is saved into a per-volume directory below
    ``base_output_dir``.
    """

    # Root of the output tree; papers are saved to <base_output_dir>/<volume_id>.
    base_output_dir: Path
    # Directory that receives the downloaded PDF files (created by `run`).
    pdf_output_dir: Path

    # Callable returning an iterable of the paper entries to process.
    xml2raw_papers: XML2RawPapers
    # Downloads a PDF from a URL to a target path and returns the path to use.
    pdf_downloader: PDFDownloader = field(default_factory=PDFDownloader)
    # Callable taking the PDF path; a falsy result means "nothing extracted".
    fulltext_extractor: FulltextExtractor = field(default_factory=GrobidFulltextExtractor)
    # If True, iterate with a tqdm progress bar.
    show_progress: bool = True

    @classmethod
    def from_cli(cls) -> "XML2Jsons":
        """Build an instance from command line arguments.

        The three directory options are mandatory: argparse reports a missing
        one with a usage error instead of failing later on a ``None`` path.

        Returns:
            A configured ``XML2Jsons`` using the default PDF downloader and
            fulltext extractor.
        """
        parser = ArgumentParser(description=HELP_MSG, formatter_class=RawTextHelpFormatter)
        parser.add_argument(
            "--base-output-dir",
            type=Path,
            required=True,
            help="Directory to save all the paper json files",
        )
        parser.add_argument(
            "--pdf-output-dir",
            type=Path,
            required=True,
            help="Directory to save all the downloaded pdf files",
        )
        parser.add_argument(
            "--anthology-data-dir",
            type=str,
            required=True,
            help="Path to ACL Anthology metadata directory, e.g., /path/to/acl-anthology-repo/data. "
            "You can obtain the data via: git clone git@github.com:acl-org/acl-anthology.git",
        )
        parser.add_argument(
            "--collection-id-filters",
            nargs="+",
            type=str,
            default=None,
            help="If provided, only papers from the collections whose id (Anthology ID) contains the "
            "specified strings will be processed.",
        )
        parser.add_argument(
            "--venue-id-whitelist",
            nargs="+",
            type=str,
            default=None,
            help="If provided, only papers from the specified venues will be processed. See here for "
            "the list of venues: https://aclanthology.org/venues",
        )
        args = parser.parse_args()

        return cls(
            base_output_dir=args.base_output_dir,
            pdf_output_dir=args.pdf_output_dir,
            xml2raw_papers=XML2RawPapers(
                anthology=Anthology(datadir=args.anthology_data_dir),
                collection_id_filters=args.collection_id_filters,
                venue_id_whitelist=args.venue_id_whitelist,
            ),
        )

    def run(self) -> None:
        """Download, extract and save every paper that has a URL.

        Papers without a URL, and papers for which the extractor returns a
        falsy result, are skipped.
        """
        os.makedirs(self.pdf_output_dir, exist_ok=True)
        papers = self.xml2raw_papers()
        if self.show_progress:
            # Materialize first so that tqdm receives a sized sequence.
            papers = tqdm(list(papers), desc="extracting fulltext")

        for paper in papers:
            volume_dir = self.base_output_dir / paper.volume_id
            if paper.url is None:
                continue

            pdf_save_path = self.pdf_downloader.download(
                paper.url, opath=self.pdf_output_dir / f"{paper.name}.pdf"
            )
            fulltext_extraction_output = self.fulltext_extractor(pdf_save_path)
            if not fulltext_extraction_output:
                continue

            # The first element of the pair (the plain text) is not used here.
            _, extraction_data = fulltext_extraction_output
            paper.fulltext = extraction_data.get("sections")
            if not paper.abstract:
                # Fall back to the extracted abstract only when the metadata has none.
                paper.abstract = extraction_data.get("abstract")
            # NOTE(review): saving only after a successful extraction is assumed
            # from the statement order -- confirm against the intended behavior.
            paper.save(str(volume_dir))
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: configure the converter from the command line
    # arguments and process all selected papers.
    XML2Jsons.from_cli().run()
|
|