Spaces:
Sleeping
Sleeping
File size: 2,721 Bytes
ffa1dc2 59dda75 6103102 ffa1dc2 6103102 b2308c6 537fc72 f50a132 ffa1dc2 6103102 f50a132 6103102 f50a132 6103102 f50a132 6103102 ffa1dc2 f50a132 6103102 f50a132 6103102 f50a132 ffa1dc2 f50a132 6103102 f50a132 6103102 f50a132 6103102 f50a132 6103102 f50a132 6103102 f50a132 ffa1dc2 f50a132 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import gradio as gr
from datasets import load_dataset
# list_datasets queries the Hugging Face Hub for available datasets
from huggingface_hub import list_datasets
import pandas as pd
import io
def list_available_datasets(query: str):
    """Return up to 50 Hub dataset ids matching *query*.

    Args:
        query: Search text typed by the user. An empty or whitespace-only
            value disables filtering, so the first 50 datasets reported by
            the Hub are returned.

    Returns:
        A list of dataset id strings (at most 50).
    """
    search_term = (query or "").strip() or None
    # list_datasets yields DatasetInfo objects, not strings, so a plain
    # `query in ds` test cannot work. Let the Hub filter server-side and cap
    # the result there instead of enumerating every dataset first.
    hits = list_datasets(search=search_term, limit=50)
    return [info.id for info in hits]
def explore_dataset(dataset_name: str, split: str, num_examples: int):
    """Load one split of a dataset and summarise it for the UI.

    Args:
        dataset_name: Hub id of the dataset to load.
        split: Name of the split passed to ``load_dataset``.
        num_examples: Maximum number of leading rows to preview. Non-integer
            numeric values are truncated; negative values yield no rows.

    Returns:
        A ``(schema, examples, stats)`` tuple where ``schema`` maps each
        column name to the string form of its feature type, ``examples`` is
        a pandas DataFrame of the first rows, and ``stats`` holds the total
        sample count under ``"Anzahl Samples"`` plus the schema entries.

    Raises:
        gr.Error: If no dataset has been selected.
    """
    if not dataset_name:
        # Surface a readable message in the UI rather than an opaque
        # failure from load_dataset(None).
        raise gr.Error("Bitte zuerst ein Dataset auswählen.")

    ds = load_dataset(dataset_name, split=split)
    total = len(ds)

    # Schema: column name to feature type
    schema = {col: str(ds.features[col]) for col in ds.column_names}

    # range() needs an int; clamp so the preview never exceeds the split.
    preview_rows = max(0, min(total, int(num_examples)))
    examples = ds.select(range(preview_rows)).to_pandas()

    # Statistics: total samples followed by the column types
    stats = {"Anzahl Samples": total, **schema}
    return schema, examples, stats
def export_column(dataset_name: str, split: str, column: str):
    """Render a single dataset column as CSV text.

    Args:
        dataset_name: Hub id of the dataset to load.
        split: Name of the split passed to ``load_dataset``.
        column: Name of the column to export.

    Returns:
        A ``(status_message, csv_text)`` tuple. When *column* is not present
        in the split, the message reports that and ``csv_text`` is empty.
    """
    ds = load_dataset(dataset_name, split=split)
    if column not in ds.column_names:
        return "Spalte nicht gefunden.", ""

    # Indexing a dataset by column name returns the column's values, which
    # have no to_pandas() method; build the one-column frame explicitly.
    df = pd.DataFrame({column: list(ds[column])})
    # to_csv() without a target returns the CSV as a string.
    csv_text = df.to_csv(index=False)
    return f"CSV für Spalte '{column}' erzeugt.", csv_text
# UI layout and event wiring for the dataset explorer.
with gr.Blocks() as demo:
    gr.Markdown("## 📊 DataScout – Hugging Face Dataset Explorer")

    with gr.Row():
        query = gr.Textbox(label="Dataset suchen", placeholder="z.B. imdb")
        search_btn = gr.Button("🔍 Suchen")

    results = gr.Dropdown(label="Gefundene Datasets", choices=[], interactive=True)
    split = gr.Dropdown(
        label="Split wählen",
        choices=["train", "test", "validation"],
        value="train",
    )
    num_examples = gr.Slider(label="Anzahl Beispiele", minimum=1, maximum=20, value=5, step=1)
    explore_btn = gr.Button("👁️ Dataset erkunden")

    schema_out = gr.JSON(label="Schema")
    examples_out = gr.Dataframe(label="Beispiele")
    stats_out = gr.JSON(label="Statistiken")

    col_dropdown = gr.Dropdown(label="Spalte für CSV-Export", choices=[], interactive=True)
    export_btn = gr.Button("📥 CSV erzeugen")
    export_msg = gr.Textbox(label="Status")
    export_csv = gr.TextArea(label="CSV-Ausgabe", lines=10)

    def _update_search_results(text):
        """Fill the results dropdown with the datasets matching *text*."""
        # A bare list returned to a Dropdown is treated as its value; the
        # choices have to be replaced through an explicit update.
        return gr.update(choices=list_available_datasets(text), value=None)

    def _update_column_choices(name, split_name):
        """Fill the column dropdown with the selected dataset's columns."""
        if not name:
            return gr.update(choices=[], value=None)
        # Use the split chosen in the UI so the offered columns match what
        # the export will load later.
        columns = load_dataset(name, split=split_name).column_names
        # Reset the value: a column picked for a previous dataset may not
        # exist in the new one.
        return gr.update(choices=columns, value=None)

    # Events
    search_btn.click(fn=_update_search_results, inputs=query, outputs=results)
    explore_btn.click(
        fn=explore_dataset,
        inputs=[results, split, num_examples],
        outputs=[schema_out, examples_out, stats_out],
    )
    results.change(
        fn=_update_column_choices,
        inputs=[results, split],
        outputs=col_dropdown,
    )
    export_btn.click(
        fn=export_column,
        inputs=[results, split, col_dropdown],
        outputs=[export_msg, export_csv],
    )

if __name__ == "__main__":
    demo.launch()
|