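"""DataScout – a small Gradio app for exploring Hugging Face datasets:
search the Hub, inspect a split's schema, preview example rows, show basic
statistics, and export a single column as CSV.

Requires: gradio, datasets, huggingface_hub, pandas.
"""
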
import io

import gradio as gr
import pandas as pd
from datasets import load_dataset
from huggingface_hub import list_datasets  # used to search the Hub for matching datasets

def list_available_datasets(query: str):
    # Search the Hub directly instead of listing every dataset and filtering locally;
    # list_datasets yields DatasetInfo objects, so take their repo ids.
    matches = [ds.id for ds in list_datasets(search=query, limit=50)]
    # gr.update refreshes the dropdown's choices rather than just setting its value.
    return gr.update(choices=matches)

def explore_dataset(dataset_name: str, split: str, num_examples: int):
    ds = load_dataset(dataset_name, split=split)
    # Schema: column name -> feature type
    schema = {col: str(ds.features[col]) for col in ds.column_names}
    # First few examples as a DataFrame (slider values may arrive as floats)
    examples = ds.select(range(min(len(ds), int(num_examples)))).to_pandas()
    # Statistics: total number of samples plus the column types
    stats = {"Number of samples": len(ds)}
    stats.update({col: str(ds.features[col]) for col in ds.column_names})
    return schema, examples, stats

def export_column(dataset_name: str, split: str, column: str):
    ds = load_dataset(dataset_name, split=split)
    if column not in ds.column_names:
        return "Column not found.", ""
    # A dataset column is a plain Python list, so wrap it in a DataFrame for CSV export.
    df = pd.DataFrame({column: ds[column]})
    buffer = io.StringIO()
    df.to_csv(buffer, index=False)
    csv_text = buffer.getvalue()
    return f"CSV generated for column '{column}'.", csv_text

with gr.Blocks() as demo:
    gr.Markdown("## 📊 DataScout – Hugging Face Dataset Explorer")
    with gr.Row():
        query = gr.Textbox(label="Search for a dataset", placeholder="e.g. imdb")
        search_btn = gr.Button("🔍 Search")
    results = gr.Dropdown(label="Datasets found", choices=[], interactive=True)

    split = gr.Dropdown(label="Select split", choices=["train", "test", "validation"], value="train")
    num_examples = gr.Slider(label="Number of examples", minimum=1, maximum=20, value=5, step=1)
    explore_btn = gr.Button("👁️ Explore dataset")

    schema_out = gr.JSON(label="Schema")
    examples_out = gr.Dataframe(label="Examples")
    stats_out = gr.JSON(label="Statistics")

    col_dropdown = gr.Dropdown(label="Column for CSV export", choices=[], interactive=True)
    export_btn = gr.Button("📥 Generate CSV")
    export_msg = gr.Textbox(label="Status")
    export_csv = gr.TextArea(label="CSV output", lines=10)

    # Events
    search_btn.click(fn=list_available_datasets, inputs=query, outputs=results)
    explore_btn.click(fn=explore_dataset, inputs=[results, split, num_examples], outputs=[schema_out, examples_out, stats_out])
    # Refresh the column dropdown for the currently selected dataset and split.
    results.change(fn=lambda name, split: gr.update(choices=load_dataset(name, split=split).column_names if name else []),
                   inputs=[results, split], outputs=col_dropdown)
    export_btn.click(fn=export_column, inputs=[results, split, col_dropdown], outputs=[export_msg, export_csv])

if __name__ == "__main__":
    demo.launch()