"""Streamlit app for building and editing a JSONL dataset.

The app keeps the working dataset in ``st.session_state.data`` (a list of
dicts) and mirrors it to ``TMP_FILE`` so a browser refresh does not lose work.
Records can be loaded from an uploaded ``.jsonl`` file, edited in a table,
extended with new entries/fields, and exported to a path on the machine that
runs the app.
"""

import json
import os

import pandas as pd
import streamlit as st

st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("📚 JSONL Dataset Editor")

TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")

# Fields offered when there is no data at all, and fields that are always
# edited as wide text columns (compared case-insensitively).
DEFAULT_FIELDS = ["context", "question", "answer"]
TEXT_FIELDS = {"context", "question", "answer"}

# --- Helper: ensure tmp dir exists ---
os.makedirs(TMP_DIR, exist_ok=True)


# --- Helper: get all unique fields from records ---
def get_all_fields(data):
    """Return the sorted union of keys used by the records in *data*."""
    all_keys = set()
    for record in data:
        all_keys.update(record.keys())
    return sorted(all_keys)


def _parse_jsonl(lines):
    """Parse an iterable of text lines into a list of JSON objects.

    Blank lines are skipped.

    Raises:
        ValueError: if a line is not valid JSON or is not a JSON object; the
            message carries the 1-based line number.
    """
    records = []
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as err:
            raise ValueError(f"line {line_no}: invalid JSON ({err.msg})") from err
        if not isinstance(record, dict):
            raise ValueError(
                f"line {line_no}: expected a JSON object, "
                f"got {type(record).__name__}"
            )
        records.append(record)
    return records


def _dump_jsonl(records):
    """Serialize *records* as JSONL text (one JSON document per line)."""
    return "".join(
        json.dumps(item, ensure_ascii=False) + "\n" for item in records
    )


def _write_jsonl(path, records):
    """Write *records* to *path* as UTF-8 JSONL, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(_dump_jsonl(records))


def _to_text(value):
    """Return *value* as a string, mapping missing values (None/NaN) to ''."""
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return ""
    return str(value)


def _flash(message):
    """Queue *message* to be shown as a toast at the start of the next run.

    Anything rendered right before ``st.rerun()`` is discarded, so messages
    that must survive a rerun are parked in session state instead.
    """
    pending = st.session_state.get("flash", [])
    pending.append(message)
    st.session_state["flash"] = pending


# --- Show messages queued by the previous run ---
if "flash" in st.session_state:
    for flash_message in st.session_state["flash"]:
        st.toast(flash_message)
    del st.session_state["flash"]

# --- Load session data from temp file if exists ---
if "data" not in st.session_state:
    st.session_state.data = []
    st.session_state.all_fields = []
    if os.path.exists(TMP_FILE):
        try:
            with open(TMP_FILE, "r", encoding="utf-8") as f:
                st.session_state.data = _parse_jsonl(f)
        except ValueError as err:
            # Also covers UnicodeDecodeError; start empty rather than crash.
            st.warning(f"Ignoring unreadable temp file {TMP_FILE}: {err}")
        st.session_state.all_fields = get_all_fields(st.session_state.data)

# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
if uploaded_file is None:
    # Forget the last upload so the same file can be uploaded again later.
    st.session_state.upload_token = None
else:
    # The uploader keeps returning the file on every rerun. Load it only once,
    # otherwise each rerun would overwrite entries/fields added afterwards.
    upload_token = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("upload_token") != upload_token:
        try:
            # "utf-8-sig" also accepts files that start with a BOM.
            content = uploaded_file.getvalue().decode("utf-8-sig")
            uploaded_records = _parse_jsonl(content.splitlines())
        except ValueError as err:
            st.error(f"Could not load {uploaded_file.name}: {err}")
        else:
            st.session_state.upload_token = upload_token
            st.session_state.data = uploaded_records
            st.session_state.all_fields = get_all_fields(uploaded_records)
            # Save to temp
            _write_jsonl(TMP_FILE, st.session_state.data)
            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )

# If still no data, use safe fallback fields
if not st.session_state.data and not st.session_state.all_fields:
    # Copy: the list stored in session state is appended to by "Add Field".
    st.session_state.all_fields = list(DEFAULT_FIELDS)

# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")

df = pd.DataFrame(st.session_state.data)
df = df.reindex(columns=st.session_state.all_fields)

# Text fields must be strings to avoid a StreamlitAPIException from the
# TextColumn config. Columns with no values at all (e.g. a freshly added
# field) are made text too so they can be typed into. Missing values become
# "" instead of the literal strings "nan"/"None".
for field in st.session_state.all_fields:
    if field.lower() in TEXT_FIELDS or df[field].isna().all():
        df[field] = df[field].map(_to_text).astype(str)

# Show long fields like "context", "answer" as wide text columns. Other fields
# are left out of the mapping on purpose: a None entry in column_config hides
# the column in st.data_editor.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in st.session_state.all_fields
    if field.lower() in TEXT_FIELDS
}

# --- Use st.data_editor for editable table ---
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
)

# --- Save updated data ---
if edited_df is not None:
    edited_records = edited_df.fillna("").to_dict(orient="records")
    edited_payload = _dump_jsonl(edited_records)
    # Compare serialized forms so the temp file is only rewritten (and the
    # toast only shown) when the content actually differs.
    has_changes = edited_payload != _dump_jsonl(st.session_state.data)
    st.session_state.data = edited_records
    if has_changes:
        # Save to temp file
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            f.write(edited_payload)
        st.toast("✅ Changes auto-saved!", icon="💾")

# --- Add New Entry ---
st.markdown("### ➕ Add New Entry")

# Show form with current fields
with st.form("new_entry_form"):
    new_record = {
        field: st.text_area(f"{field}", key=f"input_{field}")
        for field in st.session_state.all_fields
    }
    submitted = st.form_submit_button("Add Entry")

if submitted:
    st.session_state.data.append(new_record)
    # Save to temp
    _write_jsonl(TMP_FILE, st.session_state.data)
    _flash("✅ New entry added!")
    # Rerun so the table above is rebuilt with the new record.
    st.rerun()

# Option to add a new field
with st.expander("➕ Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name").strip()
    if st.button("Add Field"):
        if not new_field:
            st.warning("Enter a field name first.")
        elif new_field in st.session_state.all_fields:
            st.warning(f"Field '{new_field}' already exists.")
        else:
            st.session_state.all_fields.append(new_field)
            _flash(f"✅ Field '{new_field}' added!")
            st.rerun()

# --- Export JSONL ---
st.markdown("### 📤 Export Dataset")

# Let user define a custom export path
# NOTE(review): the path is used verbatim on the machine running the app.
# That is fine for a local tool; restrict it before exposing the app to
# untrusted users.
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)

col1, col2 = st.columns(2)

# --- Export Button ---
with col1:
    if st.button("📁 Export JSONL"):
        export_dir = os.path.dirname(export_path)
        export_payload = _dump_jsonl(st.session_state.data)
        try:
            # A bare filename has no directory part; os.makedirs("") raises.
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)
            # Write to custom path
            with open(export_path, "w", encoding="utf-8") as f_out:
                f_out.write(export_payload)
        except OSError as err:
            st.error(f"Could not export to {export_path}: {err}")
        else:
            # Reset session and temp
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            previous_upload = st.session_state.get("upload_token")
            st.session_state.clear()
            # Keep the upload marker so a file still sitting in the uploader
            # is not loaded again into the fresh session.
            st.session_state.upload_token = previous_upload
            # Keep the exported content so the download button below is still
            # there after the rerun.
            st.session_state.last_export = {
                "name": os.path.basename(export_path),
                "content": export_payload,
            }
            _flash(f"✅ Dataset saved to {export_path}")
            _flash("🧹 Temporary session cleared. You're starting fresh!")
            st.rerun()

    last_export = st.session_state.get("last_export")
    if last_export:
        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=last_export["name"],
            mime="application/json",
        )

# --- Download Temp Only Button ---
with col2:
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
            tmp_content = f_tmp.read()
        st.download_button(
            "⬇️ Download Temp File",
            tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )
    else:
        st.warning("⚠️ No temp file found to download.")