ArneBinder committed on
Commit
f848af8
·
verified ·
1 Parent(s): ced4316

fix: upload https://github.com/ArneBinder/pie-document-level/pull/397

Browse files
configs/argumentation_model/_joint.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _target_: pytorch_ie.auto.AutoPipeline.from_pretrained
2
+ pretrained_model_name_or_path: ???
3
+ # this is the batch_size that works best (fastest) on a single RTX2080Ti (11GB) (see https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2613232344)
4
+ batch_size: 1
configs/argumentation_model/_pipelined.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.pipeline.NerRePipeline
2
+ ner_model_path: ???
3
+ re_model_path: ???
4
+ entity_layer: labeled_spans
5
+ relation_layer: binary_relations
6
+ # these settings work well on a single RTX2080Ti (11GB)
7
+ ner_pipeline:
8
+ batch_size: 256
9
+ re_pipeline:
10
+ batch_size: 64
11
+ # convert the RE model to half precision for mixed precision inference (speedup approx. 4x)
12
+ half_precision_model: true
13
+ taskmodule_kwargs:
14
+ # don't show statistics after encoding
15
+ collect_statistics: false
16
+ # don't show pipeline steps
17
+ verbose: false
configs/argumentation_model/joint.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _joint
3
+
4
+ # best model based on the validation set (see https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2613232344 for details)
5
+ # i.e. models from https://github.com/ArneBinder/pie-document-level/issues/334#issuecomment-2578422544, but with last checkpoint (instead of best validation checkpoint)
6
+ # model_name_or_path: models/dataset-sciarg/task-ner_re/v0.4/2025-01-09_01-50-53
7
+ # ckpt_path: logs/training/multiruns/dataset-sciarg/task-ner_re/v0.4/2025-01-09_01-50-52/2/checkpoints/last.ckpt
8
+ # w&b run (for the loaded checkpoint): [icy-glitter-5](https://wandb.ai/arne/dataset-sciarg-task-ner_re-v0.4-training/runs/it5toj6w)
9
+ pretrained_model_name_or_path: "ArneBinder/sam-pointer-bart-base-v0.4"
10
+ revision: "0445c69bafa31f8153aaeafc1767fad84919926a"
configs/argumentation_model/joint_hps.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _joint
3
+
4
+ # from: hparams_search for all datasets
5
+ # see https://github.com/ArneBinder/pie-document-level/pull/381#issuecomment-2682711151
6
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
7
+ pretrained_model_name_or_path: models/dataset-sciarg/task-ner_re/2025-02-23_05-16-45
configs/argumentation_model/pipelined.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _pipelined
3
+
4
+ # from: train pipeline models with bigger train set,
5
+ # see https://github.com/ArneBinder/pie-document-level/issues/355#issuecomment-2612958658
6
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
7
+ ner_model_path: models/dataset-sciarg/task-adus/v0.4/2025-01-20_05-50-00
8
+ re_model_path: models/dataset-sciarg/task-relations/v0.4/2025-01-22_20-36-23
configs/argumentation_model/pipelined_deprecated.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _pipelined
3
+
4
+ # from: train pipeline models with bigger train set, but with strange choice of models,
5
+ # see edit history of https://github.com/ArneBinder/pie-document-level/issues/355#issuecomment-2612958658
6
+ # NOTE: these were originally in the pipelined.yaml
7
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
8
+ ner_model_path: models/dataset-sciarg/task-adus/v0.4/2025-01-20_09-09-11
9
+ re_model_path: models/dataset-sciarg/task-relations/v0.4/2025-01-22_12-44-51
configs/argumentation_model/pipelined_hps.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _pipelined
3
+
4
+ # from: hparams_search for all datasets,
5
+ # see https://github.com/ArneBinder/pie-document-level/pull/381#issuecomment-2684865102
6
+ # THESE ARE LOCAL PATHS, NOT HUGGINGFACE MODELS!
7
+ ner_model_path: models/dataset-sciarg/task-adur/2025-02-26_07-14-59
8
+ re_model_path: models/dataset-sciarg/task-are/2025-02-20_18-09-25
configs/argumentation_model/pipelined_new.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _pipelined
3
+
4
+ # from: Update scientific ARE experiment configs,
5
+ # see https://github.com/ArneBinder/pie-document-level/pull/379#issuecomment-2651669398
6
+ # i.e. the models are now on Hugging Face
7
+ # ner_model_path: models/dataset-sciarg/task-adur/2025-02-09_23-08-37
8
+ # re_model_path: models/dataset-sciarg/task-are/2025-02-10_19-24-52
9
+ ner_model_path: ArneBinder/sam-adur-sciarg
10
+ ner_pipeline:
11
+ revision: bcbef4e585a5f637009ff702661cf824abede6b0
12
+ re_model_path: ArneBinder/sam-are-sciarg
13
+ re_pipeline:
14
+ revision: 93024388330c58daf20963c2020e08f54553e74c
configs/demo.yaml CHANGED
@@ -1,8 +1,12 @@
1
- # model details
2
- # this is models/dataset-sciarg/task-ner_re/v0.3/2024-11-02_13-23-14
3
- # w&b run: [clear-grass-46](https://wandb.ai/arne/dataset-sciarg-task-ner_re-v0.3-training/runs/3cgqcalc)
4
- default_model_name: "ArneBinder/sam-pointer-bart-base-v0.3.1"
5
- default_model_revision: "d090d5385380692933e8a3bc466236e3a905492d"
 
 
 
 
6
  # Whether to handle segmented entities in the document. If True, labeled_spans are converted
7
  # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
8
  # This requires the networkx package to be installed.
@@ -10,9 +14,9 @@ handle_parts_of_same: true
10
  # Split the document text into sections that are processed separately.
11
  default_split_regex: "\n\n\n+"
12
 
13
- # retriever details
14
- default_retriever_config_path: "configs/retriever/related_span_retriever_with_relations_from_other_docs.yaml"
15
  default_min_similarity: 0.95
 
16
 
17
  # data import details
18
  default_arxiv_id: "1706.03762"
@@ -21,6 +25,12 @@ default_load_pie_dataset_kwargs:
21
  name: "resolve_parts_of_same"
22
  split: "train"
23
 
 
 
 
 
 
 
24
  # for better readability in the UI
25
  render_mode_captions:
26
  displacy: "displaCy + highlighted arguments"
 
1
+ defaults:
2
+ - _self_
3
+ # default retriever, see subfolder retriever for more details
4
+ - retriever: related_span_retriever_with_relations_from_other_docs
5
+ # default argumentation model, see subfolder argumentation_model for more details
6
+ - argumentation_model: pipelined_new
7
+ # since this requires a running GROBID server, we disable it by default
8
+ - pdf_fulltext_extractor: none
9
+
10
  # Whether to handle segmented entities in the document. If True, labeled_spans are converted
11
  # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
12
  # This requires the networkx package to be installed.
 
14
  # Split the document text into sections that are processed separately.
15
  default_split_regex: "\n\n\n+"
16
 
17
+ # retriever details (query parameters)
 
18
  default_min_similarity: 0.95
19
+ default_top_k: 10
20
 
21
  # data import details
22
  default_arxiv_id: "1706.03762"
 
25
  name: "resolve_parts_of_same"
26
  split: "train"
27
 
28
+ # set to the data directory of https://github.com/acl-org/acl-anthology
29
+ # to enable ACL venue PDF import (this also requires a valid pdf_fulltext_extractor)
30
+ # acl_anthology_data_dir: ../acl-anthology/data
31
+ # temporary directory to store downloaded PDFs
32
+ acl_anthology_pdf_dir: "data/acl-anthology/pdf"
33
+
34
  # for better readability in the UI
35
  render_mode_captions:
36
  displacy: "displaCy + highlighted arguments"
configs/pdf_fulltext_extractor/grobid_local.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This requires a running GROBID server. To start the server via Docker, run:
2
+ # docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
3
+
4
+ _target_: src.utils.pdf_utils.process_pdf.GrobidFulltextExtractor
5
+ section_seperator: "\n\n\n"
6
+ paragraph_seperator: "\n\n"
7
+ grobid_config:
8
+ grobid_server: localhost
9
+ grobid_port: 8070
10
+ batch_size: 1000
11
+ sleep_time: 5
12
+ generateIDs: false
13
+ consolidate_header: false
14
+ consolidate_citations: false
15
+ include_raw_citations: true
16
+ include_raw_affiliations: false
17
+ max_workers: 2
18
+ verbose: false
configs/pdf_fulltext_extractor/none.yaml ADDED
File without changes
configs/retriever/related_span_retriever_with_relations_from_other_docs.yaml CHANGED
@@ -1,8 +1,10 @@
1
  _target_: src.langchain_modules.DocumentAwareSpanRetrieverWithRelations
 
 
2
  reversed_relations_suffix: _reversed
3
  relation_labels:
4
  - supports_reversed
5
- - contradicts_reversed
6
  retrieve_from_same_document: false
7
  retrieve_from_different_documents: true
8
  pie_document_type:
 
1
  _target_: src.langchain_modules.DocumentAwareSpanRetrieverWithRelations
2
+ symmetric_relations:
3
+ - contradicts
4
  reversed_relations_suffix: _reversed
5
  relation_labels:
6
  - supports_reversed
7
+ - contradicts
8
  retrieve_from_same_document: false
9
  retrieve_from_different_documents: true
10
  pie_document_type: