fix: upload https://github.com/ArneBinder/pie-document-level/pull/397
Browse files- configs/argumentation_model/_joint.yaml +4 -0
- configs/argumentation_model/_pipelined.yaml +17 -0
- configs/argumentation_model/joint.yaml +10 -0
- configs/argumentation_model/joint_hps.yaml +7 -0
- configs/argumentation_model/pipelined.yaml +8 -0
- configs/argumentation_model/pipelined_deprecated.yaml +9 -0
- configs/argumentation_model/pipelined_hps.yaml +8 -0
- configs/argumentation_model/pipelined_new.yaml +14 -0
- configs/demo.yaml +17 -7
- configs/pdf_fulltext_extractor/grobid_local.yaml +18 -0
- configs/pdf_fulltext_extractor/none.yaml +0 -0
- configs/retriever/related_span_retriever_with_relations_from_other_docs.yaml +3 -1
configs/argumentation_model/_joint.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_target_: pytorch_ie.auto.AutoPipeline.from_pretrained
|
2 |
+
pretrained_model_name_or_path: ???
|
3 |
+
# this batch_size works well (fastest) on a single RTX2080Ti (11GB), see https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2613232344
|
4 |
+
batch_size: 1
|
configs/argumentation_model/_pipelined.yaml
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_target_: src.pipeline.NerRePipeline
|
2 |
+
ner_model_path: ???
|
3 |
+
re_model_path: ???
|
4 |
+
entity_layer: labeled_spans
|
5 |
+
relation_layer: binary_relations
|
6 |
+
# the following settings work well on a single RTX2080Ti (11GB)
|
7 |
+
ner_pipeline:
|
8 |
+
batch_size: 256
|
9 |
+
re_pipeline:
|
10 |
+
batch_size: 64
|
11 |
+
# convert the RE model to half precision for mixed precision inference (speedup approx. 4x)
|
12 |
+
half_precision_model: true
|
13 |
+
taskmodule_kwargs:
|
14 |
+
# don't show statistics after encoding
|
15 |
+
collect_statistics: false
|
16 |
+
# don't show pipeline steps
|
17 |
+
verbose: false
|
configs/argumentation_model/joint.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defaults:
|
2 |
+
- _joint
|
3 |
+
|
4 |
+
# best model based on the validation set (see https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2613232344 for details)
|
5 |
+
# i.e. models from https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2578422544, but with last checkpoint (instead of best validation checkpoint)
|
6 |
+
# model_name_or_path: models/dataset-sciarg/task-ner_re/v0.4/2025-01-09_01-50-53
|
7 |
+
# ckpt_path: logs/training/multiruns/dataset-sciarg/task-ner_re/v0.4/2025-01-09_01-50-52/2/checkpoints/last.ckpt
|
8 |
+
# w&b run (for the loaded checkpoint): [icy-glitter-5](https://wandb.ai/arne/dataset-sciarg-task-ner_re-v0.4-training/runs/it5toj6w)
|
9 |
+
pretrained_model_name_or_path: "ArneBinder/sam-pointer-bart-base-v0.4"
|
10 |
+
revision: "0445c69bafa31f8153aaeafc1767fad84919926a"
|
configs/argumentation_model/joint_hps.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defaults:
|
2 |
+
- _joint
|
3 |
+
|
4 |
+
# from: hparams_search for all datasets
|
5 |
+
# see https://github.com/ArneBinder/pie-document-level/pull/381#issuecomment-2682711151
|
6 |
+
# THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
|
7 |
+
pretrained_model_name_or_path: models/dataset-sciarg/task-ner_re/2025-02-23_05-16-45
|
configs/argumentation_model/pipelined.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defaults:
|
2 |
+
- _pipelined
|
3 |
+
|
4 |
+
# from: train pipeline models with bigger train set,
|
5 |
+
# see https://github.com/ArneBinder/pie-document-level/issues/355#issuecomment-2612958658
|
6 |
+
# THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
|
7 |
+
ner_model_path: models/dataset-sciarg/task-adus/v0.4/2025-01-20_05-50-00
|
8 |
+
re_model_path: models/dataset-sciarg/task-relations/v0.4/2025-01-22_20-36-23
|
configs/argumentation_model/pipelined_deprecated.yaml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defaults:
|
2 |
+
- _pipelined
|
3 |
+
|
4 |
+
# from: train pipeline models with bigger train set, but with strange choice of models,
|
5 |
+
# see edit history of https://github.com/ArneBinder/pie-document-level/issues/355#issuecomment-2612958658
|
6 |
+
# NOTE: these were originally in the pipelined.yaml
|
7 |
+
# THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
|
8 |
+
ner_model_path: models/dataset-sciarg/task-adus/v0.4/2025-01-20_09-09-11
|
9 |
+
re_model_path: models/dataset-sciarg/task-relations/v0.4/2025-01-22_12-44-51
|
configs/argumentation_model/pipelined_hps.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defaults:
|
2 |
+
- _pipelined
|
3 |
+
|
4 |
+
# from: hparams_search for all datasets,
|
5 |
+
# see https://github.com/ArneBinder/pie-document-level/pull/381#issuecomment-2684865102
|
6 |
+
# THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
|
7 |
+
ner_model_path: models/dataset-sciarg/task-adur/2025-02-26_07-14-59
|
8 |
+
re_model_path: models/dataset-sciarg/task-are/2025-02-20_18-09-25
|
configs/argumentation_model/pipelined_new.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defaults:
|
2 |
+
- _pipelined
|
3 |
+
|
4 |
+
# from: Update scientific ARE experiment configs,
|
5 |
+
# see https://github.com/ArneBinder/pie-document-level/pull/379#issuecomment-2651669398
|
6 |
+
# i.e. the models are now on Hugging Face
|
7 |
+
# ner_model_path: models/dataset-sciarg/task-adur/2025-02-09_23-08-37
|
8 |
+
# re_model_path: models/dataset-sciarg/task-are/2025-02-10_19-24-52
|
9 |
+
ner_model_path: ArneBinder/sam-adur-sciarg
|
10 |
+
ner_pipeline:
|
11 |
+
revision: bcbef4e585a5f637009ff702661cf824abede6b0
|
12 |
+
re_model_path: ArneBinder/sam-are-sciarg
|
13 |
+
re_pipeline:
|
14 |
+
revision: 93024388330c58daf20963c2020e08f54553e74c
|
configs/demo.yaml
CHANGED
@@ -1,8 +1,12 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
#
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
|
6 |
# Whether to handle segmented entities in the document. If True, labeled_spans are converted
|
7 |
# to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
|
8 |
# This requires the networkx package to be installed.
|
@@ -10,9 +14,9 @@ handle_parts_of_same: true
|
|
10 |
# Split the document text into sections that are processed separately.
|
11 |
default_split_regex: "\n\n\n+"
|
12 |
|
13 |
-
# retriever details
|
14 |
-
default_retriever_config_path: "configs/retriever/related_span_retriever_with_relations_from_other_docs.yaml"
|
15 |
default_min_similarity: 0.95
|
|
|
16 |
|
17 |
# data import details
|
18 |
default_arxiv_id: "1706.03762"
|
@@ -21,6 +25,12 @@ default_load_pie_dataset_kwargs:
|
|
21 |
name: "resolve_parts_of_same"
|
22 |
split: "train"
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
# for better readability in the UI
|
25 |
render_mode_captions:
|
26 |
displacy: "displaCy + highlighted arguments"
|
|
|
1 |
+
defaults:
|
2 |
+
- _self_
|
3 |
+
# default retriever, see subfolder retriever for more details
|
4 |
+
- retriever: related_span_retriever_with_relations_from_other_docs
|
5 |
+
# default argumentation model, see subfolder argumentation_model for more details
|
6 |
+
- argumentation_model: pipelined_new
|
7 |
+
# since this requires a running GROBID server, we disable it by default
|
8 |
+
- pdf_fulltext_extractor: none
|
9 |
+
|
10 |
# Whether to handle segmented entities in the document. If True, labeled_spans are converted
|
11 |
# to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
|
12 |
# This requires the networkx package to be installed.
|
|
|
14 |
# Split the document text into sections that are processed separately.
|
15 |
default_split_regex: "\n\n\n+"
|
16 |
|
17 |
+
# retriever details (query parameters)
|
|
|
18 |
default_min_similarity: 0.95
|
19 |
+
default_top_k: 10
|
20 |
|
21 |
# data import details
|
22 |
default_arxiv_id: "1706.03762"
|
|
|
25 |
name: "resolve_parts_of_same"
|
26 |
split: "train"
|
27 |
|
28 |
+
# set to the data directory of https://github.com/acl-org/acl-anthology
|
29 |
+
# to enable ACL venue PDF import (this also requires a valid pdf_fulltext_extractor)
|
30 |
+
# acl_anthology_data_dir: "../acl-anthology/data"
|
31 |
+
# temporary directory to store downloaded PDFs
|
32 |
+
acl_anthology_pdf_dir: "data/acl-anthology/pdf"
|
33 |
+
|
34 |
# for better readability in the UI
|
35 |
render_mode_captions:
|
36 |
displacy: "displaCy + highlighted arguments"
|
configs/pdf_fulltext_extractor/grobid_local.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This requires a running GROBID server. To start the server via Docker, run:
|
2 |
+
# docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
|
3 |
+
|
4 |
+
_target_: src.utils.pdf_utils.process_pdf.GrobidFulltextExtractor
|
5 |
+
section_seperator: "\n\n\n"
|
6 |
+
paragraph_seperator: "\n\n"
|
7 |
+
grobid_config:
|
8 |
+
grobid_server: localhost
|
9 |
+
grobid_port: 8070
|
10 |
+
batch_size: 1000
|
11 |
+
sleep_time: 5
|
12 |
+
generateIDs: false
|
13 |
+
consolidate_header: false
|
14 |
+
consolidate_citations: false
|
15 |
+
include_raw_citations: true
|
16 |
+
include_raw_affiliations: false
|
17 |
+
max_workers: 2
|
18 |
+
verbose: false
|
configs/pdf_fulltext_extractor/none.yaml
ADDED
File without changes
|
configs/retriever/related_span_retriever_with_relations_from_other_docs.yaml
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
_target_: src.langchain_modules.DocumentAwareSpanRetrieverWithRelations
|
|
|
|
|
2 |
reversed_relations_suffix: _reversed
|
3 |
relation_labels:
|
4 |
- supports_reversed
|
5 |
-
-
|
6 |
retrieve_from_same_document: false
|
7 |
retrieve_from_different_documents: true
|
8 |
pie_document_type:
|
|
|
1 |
_target_: src.langchain_modules.DocumentAwareSpanRetrieverWithRelations
|
2 |
+
symmetric_relations:
|
3 |
+
- contradicts
|
4 |
reversed_relations_suffix: _reversed
|
5 |
relation_labels:
|
6 |
- supports_reversed
|
7 |
+
- contradicts
|
8 |
retrieve_from_same_document: false
|
9 |
retrieve_from_different_documents: true
|
10 |
pie_document_type:
|