Commit: e7a189a
Author: Nattapong Tapachoom
Parent(s): d53141f

Add data quality management features and update requirements
Files changed:
- app.py +69 -5
- data_quality.py +323 -0
- requirements.txt +5 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ import time
 import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
+from data_quality import DataQualityManager, export_to_huggingface_format
 
 # Predefined task templates with Thai language support
 TASK_TEMPLATES = {
@@ -443,8 +444,8 @@ def generate_dataset_multi_model(selected_models: List[str], task_type: str, cus
 
 def create_interface():
     with gr.Blocks(title="🇹🇭 Thai Dataset Generator with Hugging Face", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🤗
-        gr.Markdown("
+        gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง")
+        gr.Markdown("สร้างชุดข้อมูลภาษาไทยคุณภาพสูง สะอาด และเป็นสากลด้วยโมเดลหลายตัว")
 
         with gr.Row():
             with gr.Column():
@@ -551,24 +552,75 @@ def create_interface():
                     label="ความหลากหลาย (Top-p)"
                 )
 
+                # Data Quality Settings
+                gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล")
+
+                enable_cleaning = gr.Checkbox(
+                    label="เปิดใช้การทำความสะอาดข้อมูล",
+                    value=True
+                )
+
+                remove_duplicates = gr.Checkbox(
+                    label="ลบข้อมูลซ้ำซ้อน",
+                    value=True
+                )
+
+                min_quality_score = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.5,
+                    step=0.1,
+                    label="คะแนนคุณภาพขั้นต่ำ (0-1)"
+                )
+
+                # Export Settings
+                gr.Markdown("### 📦 การส่งออกข้อมูล")
+
+                create_splits = gr.Checkbox(
+                    label="แบ่งข้อมูล Train/Validation/Test",
+                    value=True
+                )
+
+                export_format = gr.Radio(
+                    choices=[
+                        ("📊 CSV + JSON (พื้นฐาน)", "standard"),
+                        ("🤗 Hugging Face Dataset (มาตรฐานสากล)", "huggingface"),
+                        ("📋 JSONL (สำหรับ Fine-tuning)", "jsonl")
+                    ],
+                    value="huggingface",
+                    label="รูปแบบการส่งออก"
+                )
+
                 generate_btn = gr.Button("🚀 สร้างชุดข้อมูลแบบทีมเวิร์ก", variant="primary", size="lg")
 
             with gr.Column():
                 with gr.Tabs():
                     with gr.TabItem("📊 ตัวอย่างข้อมูล"):
                         dataset_preview = gr.Dataframe(
-                            headers=["id", "
+                            headers=["id", "task_type", "input", "output", "quality_score"],
                             interactive=False
                         )
 
+                    with gr.TabItem("📈 รายงานคุณภาพ"):
+                        quality_report = gr.JSON(
+                            label="รายงานคุณภาพข้อมูล",
+                            visible=True
+                        )
+
+                        quality_summary = gr.Markdown(
+                            value="สร้างข้อมูลเสร็จแล้วจึงจะแสดงรายงานคุณภาพ"
+                        )
+
                     with gr.TabItem("💾 ดาวน์โหลด"):
-                        gr.Markdown("### 💾
+                        gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลคุณภาพสูง")
 
                         download_info = gr.Markdown("สร้างข้อมูลเสร็จแล้วจึงจะสามารถดาวน์โหลดได้")
 
                         with gr.Row():
                             csv_btn = gr.Button("📄 ดาวน์โหลด CSV", variant="secondary")
                             json_btn = gr.Button("📋 ดาวน์โหลด JSON", variant="secondary")
+                            hf_btn = gr.Button("🤗 ดาวน์โหลด HF Dataset", variant="secondary")
+                            card_btn = gr.Button("📖 ดาวน์โหลด Dataset Card", variant="secondary")
 
                         csv_download = gr.File(
                             label="ไฟล์ CSV",
@@ -579,6 +631,16 @@ def create_interface():
                             label="ไฟล์ JSON",
                             visible=False
                         )
+
+                        dataset_card_download = gr.File(
+                            label="Dataset Card (README.md)",
+                            visible=False
+                        )
+
+                        hf_dataset_download = gr.File(
+                            label="Hugging Face Dataset",
+                            visible=False
+                        )
 
                     with gr.TabItem("📖 คู่มือการใช้งาน"):
                         gr.Markdown("""
@@ -617,6 +679,8 @@ def create_interface():
         csv_data_state = gr.State()
         json_data_state = gr.State()
         file_data_state = gr.State([])
+        dataset_card_state = gr.State()
+        quality_report_state = gr.State()
 
         def update_model_info(model_key):
             if model_key in THAI_MODELS:
@@ -726,7 +790,7 @@ def create_interface():
             outputs=[json_download]
         )
 
-
+    return demo
 
 demo = create_interface()
 demo.launch()
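Note that the visible hunks add the quality/export controls, the quality-report tab, the extra download buttons, and the new gr.State holders, but the click-handler wiring that consumes them falls outside the shown context. A minimal sketch of how that wiring could look, assuming the existing generate_dataset_multi_model handler is extended to accept the new settings; component names not shown in this diff (model_selector, task_dropdown, custom_prompt) are placeholders:

```python
# Illustrative wiring only; not part of this commit's visible hunks.
generate_btn.click(
    fn=generate_dataset_multi_model,
    inputs=[
        model_selector, task_dropdown, custom_prompt,           # assumed existing inputs
        enable_cleaning, remove_duplicates, min_quality_score,  # new quality settings
        create_splits, export_format,                           # new export settings
    ],
    outputs=[
        dataset_preview, quality_report, quality_summary, download_info,
        csv_data_state, json_data_state, file_data_state,
        dataset_card_state, quality_report_state,               # new state holders
    ],
)
```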
data_quality.py
ADDED
@@ -0,0 +1,323 @@
+import pandas as pd
+import json
+import re
+import hashlib
+from typing import List, Dict, Tuple
+from collections import Counter
+import unicodedata
+from datetime import datetime
+
+class DataQualityManager:
+    """Data Quality Management and Standardization"""
+
+    def __init__(self):
+        self.quality_report = {}
+        self.cleaned_data = []
+
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize Thai text"""
+        if not text or not isinstance(text, str):
+            return ""
+
+        # Remove HTML tags
+        text = re.sub(r'<[^>]+>', '', text)
+
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Normalize Thai characters
+        text = unicodedata.normalize('NFC', text)
+
+        # Clean Thai specific issues
+        text = re.sub(r'ๆ+', 'ๆ', text)  # Multiple repetition marks
+        text = re.sub(r'[฿๏๎๚๛]', '', text)  # Remove special Thai symbols
+
+        # Remove URLs
+        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
+
+        # Remove email addresses
+        text = re.sub(r'\S+@\S+', '', text)
+
+        return text.strip()
+
+    def detect_duplicates(self, data: List[Dict]) -> Tuple[List[int], Dict]:
+        """Detect duplicate records"""
+        seen_hashes = {}
+        duplicates = []
+
+        for i, record in enumerate(data):
+            # Create hash from input content
+            content = str(record.get('prompt', '')) + str(record.get('input', ''))
+            content_hash = hashlib.md5(content.encode()).hexdigest()
+
+            if content_hash in seen_hashes:
+                duplicates.append(i)
+            else:
+                seen_hashes[content_hash] = i
+
+        return duplicates, {"total_duplicates": len(duplicates), "unique_records": len(seen_hashes)}
+
+    def validate_completeness(self, data: List[Dict]) -> Dict:
+        """Check data completeness"""
+        required_fields = ['id', 'prompt', 'generated_text']
+        incomplete_records = []
+
+        for i, record in enumerate(data):
+            missing_fields = [field for field in required_fields if not record.get(field)]
+            if missing_fields:
+                incomplete_records.append({
+                    'record_id': i,
+                    'missing_fields': missing_fields
+                })
+
+        return {
+            "incomplete_records": len(incomplete_records),
+            "details": incomplete_records[:10]  # Show first 10
+        }
+
+    def analyze_quality_metrics(self, data: List[Dict]) -> Dict:
+        """Analyze various quality metrics"""
+        if not data:
+            return {}
+
+        # Text length statistics
+        prompt_lengths = [len(str(record.get('prompt', ''))) for record in data]
+        output_lengths = [len(str(record.get('generated_text', ''))) for record in data]
+
+        # Language detection (simplified for Thai)
+        thai_pattern = re.compile(r'[ก-๏]')
+        thai_records = sum(1 for record in data if thai_pattern.search(str(record.get('generated_text', ''))))
+
+        # Model distribution
+        model_usage = Counter([record.get('model_used', 'unknown') for record in data])
+
+        return {
+            "total_records": len(data),
+            "avg_prompt_length": sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0,
+            "avg_output_length": sum(output_lengths) / len(output_lengths) if output_lengths else 0,
+            "thai_content_ratio": thai_records / len(data) if data else 0,
+            "model_distribution": dict(model_usage),
+            "length_stats": {
+                "min_prompt": min(prompt_lengths) if prompt_lengths else 0,
+                "max_prompt": max(prompt_lengths) if prompt_lengths else 0,
+                "min_output": min(output_lengths) if output_lengths else 0,
+                "max_output": max(output_lengths) if output_lengths else 0
+            }
+        }
+
+    def standardize_format(self, data: List[Dict], task_type: str) -> Tuple[List[Dict], Dict]:
+        """Standardize dataset format according to international standards"""
+        standardized_data = []
+
+        for i, record in enumerate(data):
+            # Create standardized record
+            std_record = {
+                "id": f"{task_type}_{i+1:06d}",
+                "task_type": task_type,
+                "input": self.clean_text(str(record.get('prompt', ''))),
+                "output": self.clean_text(str(record.get('generated_text', ''))),
+                "metadata": {
+                    "model_used": record.get('model_used', 'unknown'),
+                    "generation_time": record.get('generation_time'),
+                    "language": "th",
+                    "domain": self._detect_domain(record),
+                    "quality_score": self._calculate_quality_score(record)
+                }
+            }
+
+            # Add original data if available
+            if record.get('original_data'):
+                std_record["metadata"]["source_data"] = record['original_data']
+
+            standardized_data.append(std_record)
+
+        # Create dataset metadata
+        dataset_metadata = {
+            "dataset_name": f"thai_{task_type}_dataset",
+            "created_at": datetime.now().isoformat(),
+            "version": "1.0.0",
+            "language": "th",
+            "task_type": task_type,
+            "total_samples": len(standardized_data),
+            "license": "CC-BY-4.0",
+            "description": f"High-quality Thai {task_type} dataset generated using multiple language models"
+        }
+
+        return standardized_data, dataset_metadata
+
+    def _detect_domain(self, record: Dict) -> str:
+        """Detect domain/topic of the record"""
+        text = str(record.get('prompt', '')) + str(record.get('generated_text', ''))
+        text_lower = text.lower()
+
+        # Simple domain detection
+        if any(word in text_lower for word in ['สุขภาพ', 'โรค', 'ยา', 'แพทย์']):
+            return "health"
+        elif any(word in text_lower for word in ['การศึกษา', 'โรงเรียน', 'นักเรียน']):
+            return "education"
+        elif any(word in text_lower for word in ['เทคโนโลยี', 'คอมพิวเตอร์', 'โปรแกรม']):
+            return "technology"
+        elif any(word in text_lower for word in ['การเงิน', 'ธนาคาร', 'เงิน']):
+            return "finance"
+        else:
+            return "general"
+
+    def _calculate_quality_score(self, record: Dict) -> float:
+        """Calculate quality score for a record (0-1)"""
+        score = 1.0
+
+        prompt = str(record.get('prompt', ''))
+        output = str(record.get('generated_text', ''))
+
+        # Penalize very short outputs
+        if len(output) < 10:
+            score -= 0.3
+
+        # Penalize repetitive content
+        if len(set(output.split())) / len(output.split()) < 0.7 if output.split() else True:
+            score -= 0.2
+
+        # Penalize incomplete responses
+        if output.endswith('...') or len(output) < len(prompt) * 0.5:
+            score -= 0.2
+
+        # Bonus for Thai content
+        thai_pattern = re.compile(r'[ก-๏]')
+        if thai_pattern.search(output):
+            score += 0.1
+
+        return max(0.0, min(1.0, score))
+
+    def create_data_splits(self, data: List[Dict], train_ratio: float = 0.8,
+                           val_ratio: float = 0.1, test_ratio: float = 0.1) -> Dict:
+        """Create train/validation/test splits"""
+        import random
+
+        # Shuffle data
+        shuffled_data = data.copy()
+        random.shuffle(shuffled_data)
+
+        total = len(shuffled_data)
+        train_end = int(total * train_ratio)
+        val_end = train_end + int(total * val_ratio)
+
+        return {
+            "train": shuffled_data[:train_end],
+            "validation": shuffled_data[train_end:val_end],
+            "test": shuffled_data[val_end:]
+        }
+
+    def generate_dataset_card(self, metadata: Dict, quality_metrics: Dict) -> str:
+        """Generate dataset card (README) in markdown format"""
+
+        card_template = f"""# Thai {metadata['task_type'].title()} Dataset
+
+## Dataset Description
+
+This is a high-quality Thai {metadata['task_type']} dataset created using multiple state-of-the-art language models.
+
+## Dataset Information
+
+- **Language**: Thai (th)
+- **Task Type**: {metadata['task_type']}
+- **Total Samples**: {metadata['total_samples']:,}
+- **Created**: {metadata['created_at']}
+- **Version**: {metadata['version']}
+- **License**: {metadata['license']}
+
+## Quality Metrics
+
+- **Average Prompt Length**: {quality_metrics.get('avg_prompt_length', 0):.1f} characters
+- **Average Output Length**: {quality_metrics.get('avg_output_length', 0):.1f} characters
+- **Thai Content Ratio**: {quality_metrics.get('thai_content_ratio', 0):.2%}
+
+## Model Distribution
+
+{self._format_model_distribution(quality_metrics.get('model_distribution', {}))}
+
+## Data Fields
+
+- `id`: Unique identifier for each sample
+- `task_type`: Type of NLP task
+- `input`: Input prompt or question
+- `output`: Generated response or answer
+- `metadata`: Additional information including model used, quality score, etc.
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("path/to/dataset")
+```
+
+## License
+
+This dataset is released under {metadata['license']} license.
+
+## Citation
+
+If you use this dataset in your research, please cite:
+
+```bibtex
+@dataset{{thai_{metadata['task_type']}_dataset,
+    title={{Thai {metadata['task_type'].title()} Dataset}},
+    author={{Thai Dataset Generator}},
+    year={{{datetime.now().year}}},
+    version={{{metadata['version']}}},
+    url={{https://github.com/your-repo/thai-dataset}}
+}}
+```
+"""
+        return card_template
+
+    def _format_model_distribution(self, model_dist: Dict) -> str:
+        """Format model distribution for markdown"""
+        if not model_dist:
+            return "No model distribution data available."
+
+        lines = []
+        for model, count in model_dist.items():
+            lines.append(f"- **{model}**: {count:,} samples")
+
+        return "\n".join(lines)
+
+def export_to_huggingface_format(data_splits: Dict, metadata: Dict, output_dir: str):
+    """Export dataset in Hugging Face compatible format"""
+    import os
+    import json
+
+    # Create output directory
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Save data splits
+    for split_name, split_data in data_splits.items():
+        with open(os.path.join(output_dir, f"{split_name}.jsonl"), 'w', encoding='utf-8') as f:
+            for record in split_data:
+                f.write(json.dumps(record, ensure_ascii=False) + '\n')
+
+    # Save dataset info
+    dataset_info = {
+        "dataset_name": metadata["dataset_name"],
+        "config_name": "default",
+        "version": {"version_str": metadata["version"]},
+        "description": metadata["description"],
+        "homepage": "",
+        "license": metadata["license"],
+        "features": {
+            "id": {"dtype": "string"},
+            "task_type": {"dtype": "string"},
+            "input": {"dtype": "string"},
+            "output": {"dtype": "string"},
+            "metadata": {"dtype": "string"}
+        },
+        "splits": {
+            split_name: {"name": split_name, "num_examples": len(split_data)}
+            for split_name, split_data in data_splits.items()
+        }
+    }
+
+    with open(os.path.join(output_dir, "dataset_info.json"), 'w', encoding='utf-8') as f:
+        json.dump(dataset_info, f, ensure_ascii=False, indent=2)
+
+    print(f"Dataset exported to {output_dir}")
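Taken together, data_quality.py supports a dedupe, standardize/clean, filter, report, split, and export pipeline. A minimal usage sketch, assuming the raw records carry the prompt / generated_text / model_used keys the class reads; the task_type string and output directory name are illustrative:

```python
from data_quality import DataQualityManager, export_to_huggingface_format

# Raw records as produced by the generator (keys assumed from the class above).
raw_records = [
    {"prompt": "สรุปข้อความต่อไปนี้", "generated_text": "ตัวอย่างคำตอบภาษาไทย", "model_used": "model-a"},
]

dqm = DataQualityManager()

# 1) Drop exact duplicates (MD5 over prompt + input).
dup_idx, dup_stats = dqm.detect_duplicates(raw_records)
records = [r for i, r in enumerate(raw_records) if i not in set(dup_idx)]

# 2) Clean and standardize; this also attaches a per-record quality_score.
standardized, metadata = dqm.standardize_format(records, task_type="text_generation")

# 3) Apply the minimum quality score chosen in the UI.
standardized = [r for r in standardized if r["metadata"]["quality_score"] >= 0.5]

# 4) Quality report and dataset card (README.md content).
report = dqm.analyze_quality_metrics(records)
card = dqm.generate_dataset_card(metadata, report)

# 5) Split and export to a Hugging Face-style directory of JSONL files.
splits = dqm.create_data_splits(standardized)
export_to_huggingface_format(splits, metadata, "thai_dataset_export")
```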
requirements.txt
CHANGED
@@ -6,3 +6,8 @@ datasets>=2.0.0
 accelerate>=0.20.0
 numpy>=1.21.0
 requests>=2.28.0
+pyarrow>=10.0.0
+fasttext>=0.9.2
+langdetect>=1.0.9
+clean-text>=0.6.0
+jsonlines>=3.1.0