abubasith86 committed on
Commit
28572de
·
verified ·
1 Parent(s): 3c67173

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +181 -0
app.py CHANGED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import json
3
+ import pandas as pd
4
+ import os
5
+
6
# --- Page setup ---
st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("📚 JSONL Dataset Editor")

# Location of the JSONL working copy used to persist the session's records.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")

# --- Helper: ensure tmp dir exists ---
# exist_ok=True makes this safe to run on every script execution.
os.makedirs(TMP_DIR, exist_ok=True)
14
+
15
+
16
+ # --- Helper: get all unique fields from records ---
17
def get_all_fields(data):
    """Return the sorted union of keys found across all records.

    Args:
        data: Iterable of dict-like records; each record must provide
            ``keys()``.

    Returns:
        A sorted list containing every distinct key exactly once. The list
        is empty when ``data`` is empty.
    """
    all_keys = set()
    for record in data:
        all_keys.update(record.keys())
    return sorted(all_keys)
22
+
23
+
24
# --- Load session data from temp file if it exists ---
# Runs only when the session has no "data" key yet; later reruns keep the
# in-memory records.
if "data" not in st.session_state:
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f:
            # Skip blank lines: json.loads("") raises JSONDecodeError, which
            # would otherwise crash the app on a stray empty line.
            st.session_state.data = [json.loads(line) for line in f if line.strip()]
        st.session_state.all_fields = get_all_fields(st.session_state.data)
    else:
        st.session_state.data = []
        st.session_state.all_fields = []
33
+
34
# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])

if uploaded_file:
    # The uploader keeps returning the selected file on every script rerun.
    # Parse it only once per distinct upload; re-parsing on each rerun would
    # overwrite the session data and discard records added after the upload.
    # NOTE(review): file_id is presumably unique per upload in recent
    # Streamlit versions; name/size is the fallback when it is absent.
    upload_id = getattr(uploaded_file, "file_id", None) or (
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("loaded_upload") != upload_id:
        # "utf-8-sig" drops a leading BOM, which json.loads would reject.
        content = uploaded_file.getvalue().decode("utf-8-sig")
        # Split on "\n" only: str.splitlines() also breaks on characters such
        # as U+2028 that may legally appear unescaped inside JSON strings.
        # Blank lines are skipped instead of raising JSONDecodeError.
        st.session_state.data = [
            json.loads(line) for line in content.split("\n") if line.strip()
        ]
        st.session_state.all_fields = get_all_fields(st.session_state.data)
        st.session_state.loaded_upload = upload_id

        # Save to temp
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

        st.success(
            f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
        )
else:
    # Forget the previous upload so selecting the same file again reloads it.
    st.session_state.pop("loaded_upload", None)
50
+
51
# If there are still no records and no known fields, fall back to a default
# schema so the editor and the entry form have columns to show.
if not st.session_state.data and not st.session_state.all_fields:
    st.session_state.all_fields = ["context", "question", "answer"]
54
+
55
# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")

# Field names (compared case-insensitively) shown as wide text columns.
text_fields = {"context", "answer", "question"}

# Build the table with one column per known field; reindex adds columns for
# fields that no record contains yet.
df = pd.DataFrame(st.session_state.data)
df = df.reindex(columns=st.session_state.all_fields)

# Convert likely text fields to string to avoid StreamlitAPIException.
# Missing values are blanked first: astype(str) on its own would store them
# as the literal strings "nan" / "None".
for field in st.session_state.all_fields:
    if field.lower() in text_fields:
        df[field] = df[field].fillna("").astype(str)

# Configure only the long text fields. Other fields are left out on purpose:
# a None value in column_config hides that column in st.data_editor.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in st.session_state.all_fields
    if field.lower() in text_fields
}

# --- Use st.data_editor for editable table ---
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
)

# --- Save updated data ---
if edited_df is not None:
    # Blank out missing cells so every record carries every field.
    st.session_state.data = edited_df.fillna("").to_dict(orient="records")

    # Save to temp file
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in st.session_state.data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    st.toast("✅ Changes auto-saved!", icon="💾")
94
+
95
# --- Add New Entry ---
st.markdown("### ➕ Add New Entry")

# Show a form with one text area per current field.
with st.form("new_entry_form"):
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")

    submitted = st.form_submit_button("Add Entry")
    if submitted:
        st.session_state.data.append(new_record)

        # Save to temp
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

        st.success("✅ New entry added!")
        # Rerun so the editor table above is rebuilt with the new record.
        st.rerun()
115
+
116
# Option to add a new field
with st.expander("➕ Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        # Strip surrounding whitespace so a whitespace-only name is rejected
        # and " answer" is not registered as a field distinct from "answer".
        field_name = new_field.strip()
        if field_name and field_name not in st.session_state.all_fields:
            st.session_state.all_fields.append(field_name)
            st.success(f"✅ Field '{field_name}' added!")
            # Rerun so the table and the entry form pick up the new field.
            st.rerun()
124
+
125
+
126
# --- Export JSONL ---
st.markdown("### 📤 Export Dataset")

# Let user define a custom export path
# NOTE(review): the path is user-supplied and is written on the machine
# running the app as-is; restrict it to a known directory if the app is
# exposed to untrusted users.
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)

col1, col2 = st.columns(2)

# --- Export Button ---
with col1:
    if st.button("📁 Export JSONL"):
        # dirname is "" for a bare filename, and os.makedirs("") raises, so
        # create the directory only when the path actually names one.
        export_dir = os.path.dirname(export_path)
        if export_dir:
            os.makedirs(export_dir, exist_ok=True)

        # Serialize once; the same text is written to disk and offered for
        # download.
        exported_content = "".join(
            json.dumps(row, ensure_ascii=False) + "\n"
            for row in st.session_state.data
        )

        # Write to custom path
        with open(export_path, "w", encoding="utf-8") as f_out:
            f_out.write(exported_content)

        # Reset session and temp
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)
        st.session_state.clear()

        # Keep the export across the rerun. Anything rendered before
        # st.rerun() is discarded, so the messages and the download button
        # are drawn below from this saved state instead.
        st.session_state["last_export"] = {
            "path": export_path,
            "content": exported_content,
        }
        st.rerun()

    # Shown on every run after an export, until the session is cleared again.
    last_export = st.session_state.get("last_export")
    if last_export:
        st.success(f"✅ Dataset saved to {last_export['path']}")

        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=os.path.basename(last_export["path"]),
            mime="application/json",
        )

        st.success("🧹 Temporary session cleared. You're starting fresh!")
167
+
168
# --- Download Temp Only Button ---
with col2:
    if os.path.exists(TMP_FILE):
        # Offer the current working copy exactly as stored on disk.
        with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
            tmp_content = f_tmp.read()

        st.download_button(
            "⬇️ Download Temp File",
            tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )
    else:
        st.warning("⚠️ No temp file found to download.")