Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- .gitattributes +16 -0
- data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json +39 -0
- data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json +44 -0
- data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json +53 -0
- data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json +67 -0
- data/datasets/lilac/squad_v2/config.yml +44 -0
- data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json +35 -0
- data/datasets/lilac/squad_v2/context/gte-small/spans.pkl +3 -0
- data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/context/pii/signal_manifest.json +45 -0
- data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json +59 -0
- data/datasets/lilac/squad_v2/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/manifest.json +41 -0
- data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/question/pii/signal_manifest.json +45 -0
- data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json +59 -0
.gitattributes
CHANGED
@@ -52,3 +52,19 @@ data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000
|
|
52 |
data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
53 |
data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
54 |
data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
53 |
data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
54 |
data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
55 |
+
data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
56 |
+
data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
57 |
+
data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
58 |
+
data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
59 |
+
data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
60 |
+
data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
61 |
+
data/datasets/lilac/squad_v2/context/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
62 |
+
data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
63 |
+
data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
64 |
+
data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
65 |
+
data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
66 |
+
data/datasets/lilac/squad_v2/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
67 |
+
data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
68 |
+
data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
69 |
+
data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
70 |
+
data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c03103ba132a7209461f86bd1045431d06db431930344e4bdf97236347cc2164
|
3 |
+
size 4738120
|
data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(answers.text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"answers": {
|
12 |
+
"fields": {
|
13 |
+
"text": {
|
14 |
+
"repeated_field": {
|
15 |
+
"fields": {
|
16 |
+
"lang_detection": {
|
17 |
+
"dtype": "string",
|
18 |
+
"signal": {
|
19 |
+
"split_by_paragraph": false,
|
20 |
+
"signal_name": "lang_detection"
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"signal": {
|
31 |
+
"split_by_paragraph": false,
|
32 |
+
"signal_name": "lang_detection"
|
33 |
+
},
|
34 |
+
"enriched_path": [
|
35 |
+
"answers",
|
36 |
+
"text",
|
37 |
+
"*"
|
38 |
+
]
|
39 |
+
}
|
data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf4ae7259d126104da2aea0e1fad0c7cd83033f7774f0d44a2436f7c891fde34
|
3 |
+
size 5224344
|
data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(answers.text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"answers": {
|
12 |
+
"fields": {
|
13 |
+
"text": {
|
14 |
+
"repeated_field": {
|
15 |
+
"fields": {
|
16 |
+
"near_dup": {
|
17 |
+
"fields": {
|
18 |
+
"cluster_id": {
|
19 |
+
"dtype": "uint32",
|
20 |
+
"categorical": true
|
21 |
+
}
|
22 |
+
},
|
23 |
+
"signal": {
|
24 |
+
"threshold": 0.85,
|
25 |
+
"signal_name": "near_dup"
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
},
|
35 |
+
"signal": {
|
36 |
+
"threshold": 0.85,
|
37 |
+
"signal_name": "near_dup"
|
38 |
+
},
|
39 |
+
"enriched_path": [
|
40 |
+
"answers",
|
41 |
+
"text",
|
42 |
+
"*"
|
43 |
+
]
|
44 |
+
}
|
data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:adf2c35877ae9957a987049c40a9a1b2edbe4b2d93b1da86bfeb739fae240040
|
3 |
+
size 4841393
|
data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(answers.text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"answers": {
|
12 |
+
"fields": {
|
13 |
+
"text": {
|
14 |
+
"repeated_field": {
|
15 |
+
"fields": {
|
16 |
+
"pii": {
|
17 |
+
"fields": {
|
18 |
+
"emails": {
|
19 |
+
"repeated_field": {
|
20 |
+
"dtype": "string_span"
|
21 |
+
}
|
22 |
+
},
|
23 |
+
"ip_addresses": {
|
24 |
+
"repeated_field": {
|
25 |
+
"dtype": "string_span"
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"secrets": {
|
29 |
+
"repeated_field": {
|
30 |
+
"dtype": "string_span"
|
31 |
+
}
|
32 |
+
}
|
33 |
+
},
|
34 |
+
"signal": {
|
35 |
+
"signal_name": "pii"
|
36 |
+
}
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
}
|
42 |
+
}
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "pii"
|
47 |
+
},
|
48 |
+
"enriched_path": [
|
49 |
+
"answers",
|
50 |
+
"text",
|
51 |
+
"*"
|
52 |
+
]
|
53 |
+
}
|
data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16b5ddbc51455341a26121c5427bd0f32639515dad34d77561402df81d8ab903
|
3 |
+
size 5100206
|
data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(answers.text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"answers": {
|
12 |
+
"fields": {
|
13 |
+
"text": {
|
14 |
+
"repeated_field": {
|
15 |
+
"fields": {
|
16 |
+
"text_statistics": {
|
17 |
+
"fields": {
|
18 |
+
"num_characters": {
|
19 |
+
"dtype": "int32"
|
20 |
+
},
|
21 |
+
"readability": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"log(type_token_ratio)": {
|
25 |
+
"dtype": "float32"
|
26 |
+
},
|
27 |
+
"frac_non_ascii": {
|
28 |
+
"dtype": "float32",
|
29 |
+
"bins": [
|
30 |
+
[
|
31 |
+
"Low",
|
32 |
+
null,
|
33 |
+
0.15
|
34 |
+
],
|
35 |
+
[
|
36 |
+
"Medium",
|
37 |
+
0.15,
|
38 |
+
0.3
|
39 |
+
],
|
40 |
+
[
|
41 |
+
"High",
|
42 |
+
0.3,
|
43 |
+
null
|
44 |
+
]
|
45 |
+
]
|
46 |
+
}
|
47 |
+
},
|
48 |
+
"signal": {
|
49 |
+
"signal_name": "text_statistics"
|
50 |
+
}
|
51 |
+
}
|
52 |
+
}
|
53 |
+
}
|
54 |
+
}
|
55 |
+
}
|
56 |
+
}
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"signal": {
|
60 |
+
"signal_name": "text_statistics"
|
61 |
+
},
|
62 |
+
"enriched_path": [
|
63 |
+
"answers",
|
64 |
+
"text",
|
65 |
+
"*"
|
66 |
+
]
|
67 |
+
}
|
data/datasets/lilac/squad_v2/config.yml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- {embedding: gte-small, path: context}
|
3 |
+
name: squad_v2
|
4 |
+
namespace: local
|
5 |
+
settings:
|
6 |
+
preferred_embedding: gte-small
|
7 |
+
ui:
|
8 |
+
media_paths:
|
9 |
+
- context
|
10 |
+
- question
|
11 |
+
- [answers, text, '*']
|
12 |
+
signals:
|
13 |
+
- path: context
|
14 |
+
signal: {signal_name: text_statistics}
|
15 |
+
- path: context
|
16 |
+
signal: {signal_name: pii}
|
17 |
+
- path: context
|
18 |
+
signal: {signal_name: near_dup}
|
19 |
+
- path: question
|
20 |
+
signal: {signal_name: spacy_ner}
|
21 |
+
- path: question
|
22 |
+
signal: {signal_name: pii}
|
23 |
+
- path: [answers, text, '*']
|
24 |
+
signal: {signal_name: pii}
|
25 |
+
- path: [answers, text, '*']
|
26 |
+
signal: {signal_name: spacy_ner}
|
27 |
+
- path: [answers, text, '*']
|
28 |
+
signal: {signal_name: near_dup}
|
29 |
+
- path: context
|
30 |
+
signal: {signal_name: lang_detection}
|
31 |
+
- path: [answers, text, '*']
|
32 |
+
signal: {signal_name: lang_detection}
|
33 |
+
- path: question
|
34 |
+
signal: {signal_name: near_dup}
|
35 |
+
- path: question
|
36 |
+
signal: {signal_name: lang_detection}
|
37 |
+
- path: [answers, text, '*']
|
38 |
+
signal: {signal_name: text_statistics}
|
39 |
+
- path: question
|
40 |
+
signal: {signal_name: text_statistics}
|
41 |
+
- path: context
|
42 |
+
signal: {signal_name: spacy_ner}
|
43 |
+
source: {dataset_name: squad_v2, source_name: huggingface}
|
44 |
+
tags: [machine-learning]
|
data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a940b774cac49c315e321d8ed7705687499d9d53857ef8824f690b3a8d40c226
|
3 |
+
size 601394376
|
data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc41116b96f4e1fa547697ce62afe0fe7aba054a8d694b308e1e0270474801da
|
3 |
+
size 10694495
|
data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(context)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"context": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"context"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/lilac/squad_v2/context/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d69a524ee48c0c218eeb901ae265ae74b12511fee17fe31ae1627c0122e25f04
|
3 |
+
size 8815907
|
data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd10704958387059935d9e22f0415677a6daf3105105af8314314ce3c3114274
|
3 |
+
size 4682949
|
data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(context)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"context": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"context"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e0d3a359a05aa7c073900e4973569f808afa26c7bf0328c31e553efcc14bea90
|
3 |
+
size 4962702
|
data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(context)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"context": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"context"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0265330a9d7ff27498f4f0e9ddce89a027203d11941d6bc8f8d4334872346d9c
|
3 |
+
size 4685328
|
data/datasets/lilac/squad_v2/context/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(context)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"context": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"context"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1915970f9199dc82019cbb6089c85df3ddfd189848e0f34f549e34b617cd0f8
|
3 |
+
size 5165481
|
data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(context)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"context": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"context"
|
58 |
+
]
|
59 |
+
}
|
data/datasets/lilac/squad_v2/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a9f54db85b8bacd3ea30ecf70410441e300c783e621767c1d9746d6474852ceb
|
3 |
+
size 27086838
|
data/datasets/lilac/squad_v2/manifest.json
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"id": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"title": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"context": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"question": {
|
17 |
+
"dtype": "string"
|
18 |
+
},
|
19 |
+
"answers": {
|
20 |
+
"fields": {
|
21 |
+
"text": {
|
22 |
+
"repeated_field": {
|
23 |
+
"dtype": "string"
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"answer_start": {
|
27 |
+
"repeated_field": {
|
28 |
+
"dtype": "int32"
|
29 |
+
}
|
30 |
+
}
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"__hfsplit__": {
|
34 |
+
"dtype": "string"
|
35 |
+
},
|
36 |
+
"__rowid__": {
|
37 |
+
"dtype": "string"
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
}
|
data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06d32aa3096865a6236fd620a16499876c919b245e26fe9a2809b3c02eebc13d
|
3 |
+
size 4694280
|
data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"question"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b7e1d521750d16c37c70c378306ef22916e2d9715a565f1127d9e3626c966d4
|
3 |
+
size 5571030
|
data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"question"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a72f4f32331de183cfe67be224ba473ec83ba8f855dafab97371580684718e4f
|
3 |
+
size 4685523
|
data/datasets/lilac/squad_v2/question/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"question"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:36304efa2147c8737c1c4da192837aa855504ecfe9eb6f14d267c501bcaaa246
|
3 |
+
size 5104750
|
data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"question"
|
58 |
+
]
|
59 |
+
}
|