Spaces:
Running
Running
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml +67 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json +87 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json +40 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json +36 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json +41 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json +50 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json +64 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json +36 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json +41 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json +50 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json +64 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json +36 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json +41 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json +50 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json +64 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json +32 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json +28 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json +33 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json +42 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json +56 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json +28 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json +33 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json +42 -0
.gitattributes
CHANGED
@@ -76,3 +76,6 @@ data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter=
|
|
76 |
data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
77 |
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
78 |
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
76 |
data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
77 |
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
78 |
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
79 |
+
data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
80 |
+
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
81 |
+
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- embedding: gte-small
|
3 |
+
path: [new-context, value, '*']
|
4 |
+
- {embedding: gte-small, path: original-context}
|
5 |
+
name: databricks-dolly-15k-curated-en
|
6 |
+
namespace: lilac
|
7 |
+
settings:
|
8 |
+
preferred_embedding: gte-small
|
9 |
+
ui:
|
10 |
+
media_paths:
|
11 |
+
- original-instruction
|
12 |
+
- original-context
|
13 |
+
- original-response
|
14 |
+
- [new-instruction, value, '*']
|
15 |
+
- [new-context, value, '*']
|
16 |
+
- [new-response, value, '*']
|
17 |
+
signals:
|
18 |
+
- path: original-instruction
|
19 |
+
signal: {signal_name: near_dup}
|
20 |
+
- path: original-instruction
|
21 |
+
signal: {signal_name: text_statistics}
|
22 |
+
- path: original-instruction
|
23 |
+
signal: {signal_name: pii}
|
24 |
+
- path: original-instruction
|
25 |
+
signal: {signal_name: lang_detection}
|
26 |
+
- path: original-context
|
27 |
+
signal: {signal_name: near_dup}
|
28 |
+
- path: original-context
|
29 |
+
signal: {signal_name: text_statistics}
|
30 |
+
- path: original-context
|
31 |
+
signal: {signal_name: lang_detection}
|
32 |
+
- path: original-context
|
33 |
+
signal: {signal_name: pii}
|
34 |
+
- path: original-response
|
35 |
+
signal: {signal_name: near_dup}
|
36 |
+
- path: original-response
|
37 |
+
signal: {signal_name: text_statistics}
|
38 |
+
- path: original-response
|
39 |
+
signal: {signal_name: pii}
|
40 |
+
- path: original-response
|
41 |
+
signal: {signal_name: lang_detection}
|
42 |
+
- path: [new-instruction, value, '*']
|
43 |
+
signal: {signal_name: near_dup}
|
44 |
+
- path: [new-instruction, value, '*']
|
45 |
+
signal: {signal_name: text_statistics}
|
46 |
+
- path: [new-instruction, value, '*']
|
47 |
+
signal: {signal_name: pii}
|
48 |
+
- path: [new-instruction, value, '*']
|
49 |
+
signal: {signal_name: lang_detection}
|
50 |
+
- path: [new-context, value, '*']
|
51 |
+
signal: {signal_name: near_dup}
|
52 |
+
- path: [new-context, value, '*']
|
53 |
+
signal: {signal_name: text_statistics}
|
54 |
+
- path: [new-context, value, '*']
|
55 |
+
signal: {signal_name: lang_detection}
|
56 |
+
- path: [new-context, value, '*']
|
57 |
+
signal: {signal_name: pii}
|
58 |
+
- path: [new-response, value, '*']
|
59 |
+
signal: {signal_name: near_dup}
|
60 |
+
- path: [new-response, value, '*']
|
61 |
+
signal: {signal_name: text_statistics}
|
62 |
+
- path: [new-response, value, '*']
|
63 |
+
signal: {signal_name: pii}
|
64 |
+
- path: [new-response, value, '*']
|
65 |
+
signal: {signal_name: lang_detection}
|
66 |
+
source: {dataset_name: argilla/databricks-dolly-15k-curated-en, source_name: huggingface}
|
67 |
+
tags: [machine-learning]
|
data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ad225b50d5880a097ea66eb4ca70fc529c0321cf8a5652bd8fbe7a638d016851
|
3 |
+
size 15882489
|
data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"id": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"category": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"original-instruction": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"original-context": {
|
17 |
+
"dtype": "string"
|
18 |
+
},
|
19 |
+
"original-response": {
|
20 |
+
"dtype": "string"
|
21 |
+
},
|
22 |
+
"new-instruction": {
|
23 |
+
"fields": {
|
24 |
+
"user_id": {
|
25 |
+
"repeated_field": {
|
26 |
+
"dtype": "string"
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"value": {
|
30 |
+
"repeated_field": {
|
31 |
+
"dtype": "string"
|
32 |
+
}
|
33 |
+
},
|
34 |
+
"status": {
|
35 |
+
"repeated_field": {
|
36 |
+
"dtype": "string"
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"new-context": {
|
42 |
+
"fields": {
|
43 |
+
"user_id": {
|
44 |
+
"repeated_field": {
|
45 |
+
"dtype": "string"
|
46 |
+
}
|
47 |
+
},
|
48 |
+
"value": {
|
49 |
+
"repeated_field": {
|
50 |
+
"dtype": "string"
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"status": {
|
54 |
+
"repeated_field": {
|
55 |
+
"dtype": "string"
|
56 |
+
}
|
57 |
+
}
|
58 |
+
}
|
59 |
+
},
|
60 |
+
"new-response": {
|
61 |
+
"fields": {
|
62 |
+
"user_id": {
|
63 |
+
"repeated_field": {
|
64 |
+
"dtype": "string"
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"value": {
|
68 |
+
"repeated_field": {
|
69 |
+
"dtype": "string"
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"status": {
|
73 |
+
"repeated_field": {
|
74 |
+
"dtype": "string"
|
75 |
+
}
|
76 |
+
}
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"external_id": {
|
80 |
+
"dtype": "string"
|
81 |
+
},
|
82 |
+
"__hfsplit__": {
|
83 |
+
"dtype": "string"
|
84 |
+
}
|
85 |
+
}
|
86 |
+
}
|
87 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c879460250e68b6195eed6b48afa2fa2a7b8127483a299818a13f82ed7fea8dc
|
3 |
+
size 32553584
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl
ADDED
Binary file (522 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "new-context.value.gte-small",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"new-context": {
|
7 |
+
"fields": {
|
8 |
+
"value": {
|
9 |
+
"repeated_field": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "gte-small"
|
33 |
+
},
|
34 |
+
"enriched_path": [
|
35 |
+
"new-context",
|
36 |
+
"value",
|
37 |
+
"*"
|
38 |
+
],
|
39 |
+
"vector_store": "hnsw"
|
40 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl
ADDED
Binary file (351 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (521 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-context.value.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-context": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
},
|
27 |
+
"signal": {
|
28 |
+
"split_by_paragraph": false,
|
29 |
+
"signal_name": "lang_detection"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"new-context",
|
33 |
+
"value",
|
34 |
+
"*"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (550 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-context.value.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-context": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"signal": {
|
33 |
+
"threshold": 0.85,
|
34 |
+
"signal_name": "near_dup"
|
35 |
+
},
|
36 |
+
"enriched_path": [
|
37 |
+
"new-context",
|
38 |
+
"value",
|
39 |
+
"*"
|
40 |
+
]
|
41 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet
ADDED
Binary file (519 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-context.value.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-context": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "pii"
|
44 |
+
},
|
45 |
+
"enriched_path": [
|
46 |
+
"new-context",
|
47 |
+
"value",
|
48 |
+
"*"
|
49 |
+
]
|
50 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (603 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-context.value.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-context": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
}
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"signal": {
|
57 |
+
"signal_name": "text_statistics"
|
58 |
+
},
|
59 |
+
"enriched_path": [
|
60 |
+
"new-context",
|
61 |
+
"value",
|
62 |
+
"*"
|
63 |
+
]
|
64 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (521 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-instruction.value.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
},
|
27 |
+
"signal": {
|
28 |
+
"split_by_paragraph": false,
|
29 |
+
"signal_name": "lang_detection"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"new-instruction",
|
33 |
+
"value",
|
34 |
+
"*"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (602 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-instruction.value.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"signal": {
|
33 |
+
"threshold": 0.85,
|
34 |
+
"signal_name": "near_dup"
|
35 |
+
},
|
36 |
+
"enriched_path": [
|
37 |
+
"new-instruction",
|
38 |
+
"value",
|
39 |
+
"*"
|
40 |
+
]
|
41 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet
ADDED
Binary file (519 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-instruction.value.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "pii"
|
44 |
+
},
|
45 |
+
"enriched_path": [
|
46 |
+
"new-instruction",
|
47 |
+
"value",
|
48 |
+
"*"
|
49 |
+
]
|
50 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (581 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-instruction.value.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
}
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"signal": {
|
57 |
+
"signal_name": "text_statistics"
|
58 |
+
},
|
59 |
+
"enriched_path": [
|
60 |
+
"new-instruction",
|
61 |
+
"value",
|
62 |
+
"*"
|
63 |
+
]
|
64 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (521 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-response.value.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-response": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
},
|
27 |
+
"signal": {
|
28 |
+
"split_by_paragraph": false,
|
29 |
+
"signal_name": "lang_detection"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"new-response",
|
33 |
+
"value",
|
34 |
+
"*"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (603 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-response.value.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-response": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"signal": {
|
33 |
+
"threshold": 0.85,
|
34 |
+
"signal_name": "near_dup"
|
35 |
+
},
|
36 |
+
"enriched_path": [
|
37 |
+
"new-response",
|
38 |
+
"value",
|
39 |
+
"*"
|
40 |
+
]
|
41 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet
ADDED
Binary file (520 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-response.value.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-response": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "pii"
|
44 |
+
},
|
45 |
+
"enriched_path": [
|
46 |
+
"new-response",
|
47 |
+
"value",
|
48 |
+
"*"
|
49 |
+
]
|
50 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (651 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "new-response.value.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"new-response": {
|
9 |
+
"fields": {
|
10 |
+
"value": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
}
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"signal": {
|
57 |
+
"signal_name": "text_statistics"
|
58 |
+
},
|
59 |
+
"enriched_path": [
|
60 |
+
"new-response",
|
61 |
+
"value",
|
62 |
+
"*"
|
63 |
+
]
|
64 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee5c4ca43663633f531a587438913cc15fecad5baed5fdce2a1c7bc97a6e9260
|
3 |
+
size 32775684
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl
ADDED
Binary file (488 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "original-context.gte-small",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"original-context": {
|
7 |
+
"fields": {
|
8 |
+
"gte-small": {
|
9 |
+
"repeated_field": {
|
10 |
+
"fields": {
|
11 |
+
"embedding": {
|
12 |
+
"dtype": "embedding"
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"dtype": "string_span"
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"signal_name": "gte-small"
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"signal": {
|
26 |
+
"signal_name": "gte-small"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"original-context"
|
30 |
+
],
|
31 |
+
"vector_store": "hnsw"
|
32 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl
ADDED
Binary file (347 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (521 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-context.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-context": {
|
9 |
+
"fields": {
|
10 |
+
"lang_detection": {
|
11 |
+
"dtype": "string",
|
12 |
+
"signal": {
|
13 |
+
"split_by_paragraph": false,
|
14 |
+
"signal_name": "lang_detection"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
}
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"split_by_paragraph": false,
|
23 |
+
"signal_name": "lang_detection"
|
24 |
+
},
|
25 |
+
"enriched_path": [
|
26 |
+
"original-context"
|
27 |
+
]
|
28 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (550 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-context.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-context": {
|
9 |
+
"fields": {
|
10 |
+
"near_dup": {
|
11 |
+
"fields": {
|
12 |
+
"cluster_id": {
|
13 |
+
"dtype": "uint32",
|
14 |
+
"categorical": true
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"threshold": 0.85,
|
19 |
+
"signal_name": "near_dup"
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"signal": {
|
27 |
+
"threshold": 0.85,
|
28 |
+
"signal_name": "near_dup"
|
29 |
+
},
|
30 |
+
"enriched_path": [
|
31 |
+
"original-context"
|
32 |
+
]
|
33 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet
ADDED
Binary file (519 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-context.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-context": {
|
9 |
+
"fields": {
|
10 |
+
"pii": {
|
11 |
+
"fields": {
|
12 |
+
"emails": {
|
13 |
+
"repeated_field": {
|
14 |
+
"dtype": "string_span"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"ip_addresses": {
|
18 |
+
"repeated_field": {
|
19 |
+
"dtype": "string_span"
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"secrets": {
|
23 |
+
"repeated_field": {
|
24 |
+
"dtype": "string_span"
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "pii"
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"signal": {
|
37 |
+
"signal_name": "pii"
|
38 |
+
},
|
39 |
+
"enriched_path": [
|
40 |
+
"original-context"
|
41 |
+
]
|
42 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (602 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-context.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-context": {
|
9 |
+
"fields": {
|
10 |
+
"text_statistics": {
|
11 |
+
"fields": {
|
12 |
+
"num_characters": {
|
13 |
+
"dtype": "int32"
|
14 |
+
},
|
15 |
+
"readability": {
|
16 |
+
"dtype": "float32"
|
17 |
+
},
|
18 |
+
"log(type_token_ratio)": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"frac_non_ascii": {
|
22 |
+
"dtype": "float32",
|
23 |
+
"bins": [
|
24 |
+
[
|
25 |
+
"Low",
|
26 |
+
null,
|
27 |
+
0.15
|
28 |
+
],
|
29 |
+
[
|
30 |
+
"Medium",
|
31 |
+
0.15,
|
32 |
+
0.3
|
33 |
+
],
|
34 |
+
[
|
35 |
+
"High",
|
36 |
+
0.3,
|
37 |
+
null
|
38 |
+
]
|
39 |
+
]
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "text_statistics"
|
44 |
+
}
|
45 |
+
}
|
46 |
+
}
|
47 |
+
}
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"signal": {
|
51 |
+
"signal_name": "text_statistics"
|
52 |
+
},
|
53 |
+
"enriched_path": [
|
54 |
+
"original-context"
|
55 |
+
]
|
56 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (521 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-instruction.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"lang_detection": {
|
11 |
+
"dtype": "string",
|
12 |
+
"signal": {
|
13 |
+
"split_by_paragraph": false,
|
14 |
+
"signal_name": "lang_detection"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
}
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"split_by_paragraph": false,
|
23 |
+
"signal_name": "lang_detection"
|
24 |
+
},
|
25 |
+
"enriched_path": [
|
26 |
+
"original-instruction"
|
27 |
+
]
|
28 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (602 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-instruction.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"near_dup": {
|
11 |
+
"fields": {
|
12 |
+
"cluster_id": {
|
13 |
+
"dtype": "uint32",
|
14 |
+
"categorical": true
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"threshold": 0.85,
|
19 |
+
"signal_name": "near_dup"
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"signal": {
|
27 |
+
"threshold": 0.85,
|
28 |
+
"signal_name": "near_dup"
|
29 |
+
},
|
30 |
+
"enriched_path": [
|
31 |
+
"original-instruction"
|
32 |
+
]
|
33 |
+
}
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet
ADDED
Binary file (519 kB). View file
|
|
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "original-instruction.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"original-instruction": {
|
9 |
+
"fields": {
|
10 |
+
"pii": {
|
11 |
+
"fields": {
|
12 |
+
"emails": {
|
13 |
+
"repeated_field": {
|
14 |
+
"dtype": "string_span"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"ip_addresses": {
|
18 |
+
"repeated_field": {
|
19 |
+
"dtype": "string_span"
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"secrets": {
|
23 |
+
"repeated_field": {
|
24 |
+
"dtype": "string_span"
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "pii"
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"signal": {
|
37 |
+
"signal_name": "pii"
|
38 |
+
},
|
39 |
+
"enriched_path": [
|
40 |
+
"original-instruction"
|
41 |
+
]
|
42 |
+
}
|