Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- .gitattributes +15 -0
- data/datasets/lilac/OpenOrca-100k/.DS_Store +0 -0
- data/datasets/lilac/OpenOrca-100k/config.yml +28 -0
- data/datasets/lilac/OpenOrca-100k/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/manifest.json +24 -0
- data/datasets/lilac/OpenOrca-100k/question/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/OpenOrca-100k/question/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/OpenOrca-100k/question/gte-small/signal_manifest.json +32 -0
- data/datasets/lilac/OpenOrca-100k/question/gte-small/spans.pkl +3 -0
- data/datasets/lilac/OpenOrca-100k/question/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/question/lang_detection/signal_manifest.json +28 -0
- data/datasets/lilac/OpenOrca-100k/question/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/question/near_dup/signal_manifest.json +33 -0
- data/datasets/lilac/OpenOrca-100k/question/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/question/pii/signal_manifest.json +42 -0
- data/datasets/lilac/OpenOrca-100k/question/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/question/text_statistics/signal_manifest.json +56 -0
- data/datasets/lilac/OpenOrca-100k/response/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/OpenOrca-100k/response/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/OpenOrca-100k/response/gte-small/signal_manifest.json +32 -0
- data/datasets/lilac/OpenOrca-100k/response/gte-small/spans.pkl +3 -0
- data/datasets/lilac/OpenOrca-100k/response/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/response/lang_detection/signal_manifest.json +28 -0
- data/datasets/lilac/OpenOrca-100k/response/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/response/near_dup/signal_manifest.json +33 -0
- data/datasets/lilac/OpenOrca-100k/response/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/response/pii/signal_manifest.json +42 -0
- data/datasets/lilac/OpenOrca-100k/response/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/OpenOrca-100k/response/text_statistics/signal_manifest.json +56 -0
.gitattributes
CHANGED
@@ -79,3 +79,18 @@ data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter
|
|
79 |
data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
80 |
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
81 |
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
80 |
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
81 |
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
82 |
+
data/datasets/lilac/OpenOrca-100k/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
83 |
+
data/datasets/lilac/OpenOrca-100k/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
84 |
+
data/datasets/lilac/OpenOrca-100k/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
85 |
+
data/datasets/lilac/OpenOrca-100k/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
86 |
+
data/datasets/lilac/OpenOrca-100k/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
87 |
+
data/datasets/lilac/OpenOrca-100k/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
88 |
+
data/datasets/lilac/OpenOrca-100k/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
89 |
+
data/datasets/lilac/OpenOrca-100k/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
90 |
+
data/datasets/lilac/OpenOrca-100k/response/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
91 |
+
data/datasets/lilac/OpenOrca-100k/response/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
92 |
+
data/datasets/lilac/OpenOrca-100k/response/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
93 |
+
data/datasets/lilac/OpenOrca-100k/response/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
94 |
+
data/datasets/lilac/OpenOrca-100k/response/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
95 |
+
data/datasets/lilac/OpenOrca-100k/response/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
96 |
+
data/datasets/lilac/OpenOrca-100k/response/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/OpenOrca-100k/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
data/datasets/lilac/OpenOrca-100k/config.yml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- {embedding: gte-small, path: response}
|
3 |
+
- {embedding: gte-small, path: question}
|
4 |
+
name: OpenOrca-100k
|
5 |
+
namespace: local
|
6 |
+
settings:
|
7 |
+
preferred_embedding: gte-small
|
8 |
+
ui:
|
9 |
+
media_paths: [question, response]
|
10 |
+
signals:
|
11 |
+
- path: question
|
12 |
+
signal: {signal_name: near_dup}
|
13 |
+
- path: question
|
14 |
+
signal: {signal_name: text_statistics}
|
15 |
+
- path: question
|
16 |
+
signal: {signal_name: pii}
|
17 |
+
- path: question
|
18 |
+
signal: {signal_name: lang_detection}
|
19 |
+
- path: response
|
20 |
+
signal: {signal_name: near_dup}
|
21 |
+
- path: response
|
22 |
+
signal: {signal_name: text_statistics}
|
23 |
+
- path: response
|
24 |
+
signal: {signal_name: pii}
|
25 |
+
- path: response
|
26 |
+
signal: {signal_name: lang_detection}
|
27 |
+
source: {dataset_name: Open-Orca/OpenOrca, sample_size: 100000, source_name: huggingface}
|
28 |
+
tags: [machine-learning]
|
data/datasets/lilac/OpenOrca-100k/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f95588367446af55ccc2f089092779670c57308ee1f72a849e41f22e126d5052
|
3 |
+
size 105147761
|
data/datasets/lilac/OpenOrca-100k/manifest.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"id": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"system_prompt": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"question": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"response": {
|
17 |
+
"dtype": "string"
|
18 |
+
},
|
19 |
+
"__hfsplit__": {
|
20 |
+
"dtype": "string"
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
data/datasets/lilac/OpenOrca-100k/question/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f154c2dc5a0d69538c39df10508fe05cc36fb5489b61c303c9869320ef04581
|
3 |
+
size 596704812
|
data/datasets/lilac/OpenOrca-100k/question/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c0bac1790aa5247eb288c2a828a92eb313090b36a015665f6aae42e5a4dcb18
|
3 |
+
size 9378299
|
data/datasets/lilac/OpenOrca-100k/question/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "question.gte-small",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"question": {
|
7 |
+
"fields": {
|
8 |
+
"gte-small": {
|
9 |
+
"repeated_field": {
|
10 |
+
"fields": {
|
11 |
+
"embedding": {
|
12 |
+
"dtype": "embedding"
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"dtype": "string_span"
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"signal_name": "gte-small"
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"signal": {
|
26 |
+
"signal_name": "gte-small"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"question"
|
30 |
+
],
|
31 |
+
"vector_store": "hnsw"
|
32 |
+
}
|
data/datasets/lilac/OpenOrca-100k/question/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef19c506d4af4eab34aec3b280663687002db0792108b84d313f8ab6f532aa6c
|
3 |
+
size 6922769
|
data/datasets/lilac/OpenOrca-100k/question/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1a09e7085a4019205c62c28e6dcb46254fea37243e8087346d9c7298e05f9e1
|
3 |
+
size 3327888
|
data/datasets/lilac/OpenOrca-100k/question/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "question.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"question": {
|
9 |
+
"fields": {
|
10 |
+
"lang_detection": {
|
11 |
+
"dtype": "string",
|
12 |
+
"signal": {
|
13 |
+
"split_by_paragraph": false,
|
14 |
+
"signal_name": "lang_detection"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
}
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"split_by_paragraph": false,
|
23 |
+
"signal_name": "lang_detection"
|
24 |
+
},
|
25 |
+
"enriched_path": [
|
26 |
+
"question"
|
27 |
+
]
|
28 |
+
}
|
data/datasets/lilac/OpenOrca-100k/question/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e1ff51f57fb136ab846d0c34a248aca4ef86d09fa0945737cd2c276d2f5dcb7d
|
3 |
+
size 3884385
|
data/datasets/lilac/OpenOrca-100k/question/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "question.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"question": {
|
9 |
+
"fields": {
|
10 |
+
"near_dup": {
|
11 |
+
"fields": {
|
12 |
+
"cluster_id": {
|
13 |
+
"dtype": "uint32",
|
14 |
+
"categorical": true
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"threshold": 0.85,
|
19 |
+
"signal_name": "near_dup"
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"signal": {
|
27 |
+
"threshold": 0.85,
|
28 |
+
"signal_name": "near_dup"
|
29 |
+
},
|
30 |
+
"enriched_path": [
|
31 |
+
"question"
|
32 |
+
]
|
33 |
+
}
|
data/datasets/lilac/OpenOrca-100k/question/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:acc4cd2ae7c51b4450d159c63fee3e9739b3c1d5a36cfbf3bf45fe29e2ac15b5
|
3 |
+
size 3317869
|
data/datasets/lilac/OpenOrca-100k/question/pii/signal_manifest.json
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "question.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"question": {
|
9 |
+
"fields": {
|
10 |
+
"pii": {
|
11 |
+
"fields": {
|
12 |
+
"emails": {
|
13 |
+
"repeated_field": {
|
14 |
+
"dtype": "string_span"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"ip_addresses": {
|
18 |
+
"repeated_field": {
|
19 |
+
"dtype": "string_span"
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"secrets": {
|
23 |
+
"repeated_field": {
|
24 |
+
"dtype": "string_span"
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "pii"
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"signal": {
|
37 |
+
"signal_name": "pii"
|
38 |
+
},
|
39 |
+
"enriched_path": [
|
40 |
+
"question"
|
41 |
+
]
|
42 |
+
}
|
data/datasets/lilac/OpenOrca-100k/question/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6703f93314760ee15d64532812a601c85d2f411254c1d809c6b3f558cc1c7c7
|
3 |
+
size 4321496
|
data/datasets/lilac/OpenOrca-100k/question/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "question.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"question": {
|
9 |
+
"fields": {
|
10 |
+
"text_statistics": {
|
11 |
+
"fields": {
|
12 |
+
"num_characters": {
|
13 |
+
"dtype": "int32"
|
14 |
+
},
|
15 |
+
"readability": {
|
16 |
+
"dtype": "float32"
|
17 |
+
},
|
18 |
+
"log(type_token_ratio)": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"frac_non_ascii": {
|
22 |
+
"dtype": "float32",
|
23 |
+
"bins": [
|
24 |
+
[
|
25 |
+
"Low",
|
26 |
+
null,
|
27 |
+
0.15
|
28 |
+
],
|
29 |
+
[
|
30 |
+
"Medium",
|
31 |
+
0.15,
|
32 |
+
0.3
|
33 |
+
],
|
34 |
+
[
|
35 |
+
"High",
|
36 |
+
0.3,
|
37 |
+
null
|
38 |
+
]
|
39 |
+
]
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "text_statistics"
|
44 |
+
}
|
45 |
+
}
|
46 |
+
}
|
47 |
+
}
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"signal": {
|
51 |
+
"signal_name": "text_statistics"
|
52 |
+
},
|
53 |
+
"enriched_path": [
|
54 |
+
"question"
|
55 |
+
]
|
56 |
+
}
|
data/datasets/lilac/OpenOrca-100k/response/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2374770842450f7d1712e2d56bc2e50bb1579af4cda061df2baf4631965dbcd
|
3 |
+
size 482647596
|
data/datasets/lilac/OpenOrca-100k/response/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:afd6636965df6ed8f6aadd52a9638edf201c36dd470b816e6488e5417dcfe3c4
|
3 |
+
size 8159214
|
data/datasets/lilac/OpenOrca-100k/response/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "response.gte-small",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"response": {
|
7 |
+
"fields": {
|
8 |
+
"gte-small": {
|
9 |
+
"repeated_field": {
|
10 |
+
"fields": {
|
11 |
+
"embedding": {
|
12 |
+
"dtype": "embedding"
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"dtype": "string_span"
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"signal_name": "gte-small"
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"signal": {
|
26 |
+
"signal_name": "gte-small"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"response"
|
30 |
+
],
|
31 |
+
"vector_store": "hnsw"
|
32 |
+
}
|
data/datasets/lilac/OpenOrca-100k/response/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf61917d291a1e3157ca017b4eacdf8983bf8094b3b22d710031381927f19b16
|
3 |
+
size 6373377
|
data/datasets/lilac/OpenOrca-100k/response/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62af2b56e9bf3cbcddbceab6f858fc35fef50953b73b06a7da3bc1d2e62d3a53
|
3 |
+
size 3339983
|
data/datasets/lilac/OpenOrca-100k/response/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "response.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"response": {
|
9 |
+
"fields": {
|
10 |
+
"lang_detection": {
|
11 |
+
"dtype": "string",
|
12 |
+
"signal": {
|
13 |
+
"split_by_paragraph": false,
|
14 |
+
"signal_name": "lang_detection"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
}
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"split_by_paragraph": false,
|
23 |
+
"signal_name": "lang_detection"
|
24 |
+
},
|
25 |
+
"enriched_path": [
|
26 |
+
"response"
|
27 |
+
]
|
28 |
+
}
|
data/datasets/lilac/OpenOrca-100k/response/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cc30679d1a2e6f2d3d45b2f145932daebf8a3f6ae4b73cfa9da3dbf5c495967d
|
3 |
+
size 3902985
|
data/datasets/lilac/OpenOrca-100k/response/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "response.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"response": {
|
9 |
+
"fields": {
|
10 |
+
"near_dup": {
|
11 |
+
"fields": {
|
12 |
+
"cluster_id": {
|
13 |
+
"dtype": "uint32",
|
14 |
+
"categorical": true
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"threshold": 0.85,
|
19 |
+
"signal_name": "near_dup"
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"signal": {
|
27 |
+
"threshold": 0.85,
|
28 |
+
"signal_name": "near_dup"
|
29 |
+
},
|
30 |
+
"enriched_path": [
|
31 |
+
"response"
|
32 |
+
]
|
33 |
+
}
|
data/datasets/lilac/OpenOrca-100k/response/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3b5830b33a8ddbe000b1b4403ef882731243075acc6416b5f673c90d4bf25ac
|
3 |
+
size 3313965
|
data/datasets/lilac/OpenOrca-100k/response/pii/signal_manifest.json
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "response.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"response": {
|
9 |
+
"fields": {
|
10 |
+
"pii": {
|
11 |
+
"fields": {
|
12 |
+
"emails": {
|
13 |
+
"repeated_field": {
|
14 |
+
"dtype": "string_span"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"ip_addresses": {
|
18 |
+
"repeated_field": {
|
19 |
+
"dtype": "string_span"
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"secrets": {
|
23 |
+
"repeated_field": {
|
24 |
+
"dtype": "string_span"
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "pii"
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"signal": {
|
37 |
+
"signal_name": "pii"
|
38 |
+
},
|
39 |
+
"enriched_path": [
|
40 |
+
"response"
|
41 |
+
]
|
42 |
+
}
|
data/datasets/lilac/OpenOrca-100k/response/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e5bae031d37b7df9a3df49a616d58a8f9962307750039c1736b8faa56d8501a
|
3 |
+
size 4281305
|
data/datasets/lilac/OpenOrca-100k/response/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "response.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"response": {
|
9 |
+
"fields": {
|
10 |
+
"text_statistics": {
|
11 |
+
"fields": {
|
12 |
+
"num_characters": {
|
13 |
+
"dtype": "int32"
|
14 |
+
},
|
15 |
+
"readability": {
|
16 |
+
"dtype": "float32"
|
17 |
+
},
|
18 |
+
"log(type_token_ratio)": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"frac_non_ascii": {
|
22 |
+
"dtype": "float32",
|
23 |
+
"bins": [
|
24 |
+
[
|
25 |
+
"Low",
|
26 |
+
null,
|
27 |
+
0.15
|
28 |
+
],
|
29 |
+
[
|
30 |
+
"Medium",
|
31 |
+
0.15,
|
32 |
+
0.3
|
33 |
+
],
|
34 |
+
[
|
35 |
+
"High",
|
36 |
+
0.3,
|
37 |
+
null
|
38 |
+
]
|
39 |
+
]
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "text_statistics"
|
44 |
+
}
|
45 |
+
}
|
46 |
+
}
|
47 |
+
}
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"signal": {
|
51 |
+
"signal_name": "text_statistics"
|
52 |
+
},
|
53 |
+
"enriched_path": [
|
54 |
+
"response"
|
55 |
+
]
|
56 |
+
}
|