Nattapong Tapachoom committed
Commit e7a189a · 1 Parent(s): d53141f

Add data quality management features and update requirements

Files changed (3):
  1. app.py +69 -5
  2. data_quality.py +323 -0
  3. requirements.txt +5 -0
app.py CHANGED
@@ -11,6 +11,7 @@ import time
 import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
+from data_quality import DataQualityManager, export_to_huggingface_format

 # Predefined task templates with Thai language support
 TASK_TEMPLATES = {
@@ -443,8 +444,8 @@ def generate_dataset_multi_model(selected_models: List[str], task_type: str, cus

 def create_interface():
     with gr.Blocks(title="🇹🇭 Thai Dataset Generator with Hugging Face", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยด้วย Hugging Face")
-        gr.Markdown("สร้างชุดข้อมูลภาษาไทยคุณภาพสูงโดยใช้โมเดลหลายตัวทำงานร่วมกัน")
+        gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง")
+        gr.Markdown("สร้างชุดข้อมูลภาษาไทยคุณภาพสูง สะอาด และเป็นสากลด้วยโมเดลหลายตัว")

         with gr.Row():
             with gr.Column():
@@ -551,24 +552,75 @@ def create_interface():
                         label="ความหลากหลาย (Top-p)"
                     )

+                    # Data Quality Settings
+                    gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล")
+
+                    enable_cleaning = gr.Checkbox(
+                        label="เปิดใช้การทำความสะอาดข้อมูล",
+                        value=True
+                    )
+
+                    remove_duplicates = gr.Checkbox(
+                        label="ลบข้อมูลซ้ำซ้อน",
+                        value=True
+                    )
+
+                    min_quality_score = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.5,
+                        step=0.1,
+                        label="คะแนนคุณภาพขั้นต่ำ (0-1)"
+                    )
+
+                    # Export Settings
+                    gr.Markdown("### 📦 การส่งออกข้อมูล")
+
+                    create_splits = gr.Checkbox(
+                        label="แบ่งข้อมูล Train/Validation/Test",
+                        value=True
+                    )
+
+                    export_format = gr.Radio(
+                        choices=[
+                            ("📊 CSV + JSON (พื้นฐาน)", "standard"),
+                            ("🤗 Hugging Face Dataset (มาตรฐานสากล)", "huggingface"),
+                            ("📋 JSONL (สำหรับ Fine-tuning)", "jsonl")
+                        ],
+                        value="huggingface",
+                        label="รูปแบบการส่งออก"
+                    )
+
                     generate_btn = gr.Button("🚀 สร้างชุดข้อมูลแบบทีมเวิร์ก", variant="primary", size="lg")

                 with gr.Column():
                     with gr.Tabs():
                         with gr.TabItem("📊 ตัวอย่างข้อมูล"):
                             dataset_preview = gr.Dataframe(
-                                headers=["id", "ประเภทงาน", "คำสั่ง", "ผลลัพธ์"],
+                                headers=["id", "task_type", "input", "output", "quality_score"],
                                 interactive=False
                             )

+                        with gr.TabItem("📈 รายงานคุณภาพ"):
+                            quality_report = gr.JSON(
+                                label="รายงานคุณภาพข้อมูล",
+                                visible=True
+                            )
+
+                            quality_summary = gr.Markdown(
+                                value="สร้างข้อมูลเสร็จแล้วจึงจะแสดงรายงานคุณภาพ"
+                            )
+
                         with gr.TabItem("💾 ดาวน์โหลด"):
-                            gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลที่สร้างแล้ว")
+                            gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลคุณภาพสูง")

                             download_info = gr.Markdown("สร้างข้อมูลเสร็จแล้วจึงจะสามารถดาวน์โหลดได้")

                             with gr.Row():
                                 csv_btn = gr.Button("📄 ดาวน์โหลด CSV", variant="secondary")
                                 json_btn = gr.Button("📋 ดาวน์โหลด JSON", variant="secondary")
+                                hf_btn = gr.Button("🤗 ดาวน์โหลด HF Dataset", variant="secondary")
+                                card_btn = gr.Button("📖 ดาวน์โหลด Dataset Card", variant="secondary")

                             csv_download = gr.File(
                                 label="ไฟล์ CSV",
@@ -579,6 +631,16 @@ def create_interface():
                                 label="ไฟล์ JSON",
                                 visible=False
                             )
+
+                            dataset_card_download = gr.File(
+                                label="Dataset Card (README.md)",
+                                visible=False
+                            )
+
+                            hf_dataset_download = gr.File(
+                                label="Hugging Face Dataset",
+                                visible=False
+                            )

                         with gr.TabItem("📖 คู่มือการใช้งาน"):
                             gr.Markdown("""
@@ -617,6 +679,8 @@ def create_interface():
         csv_data_state = gr.State()
         json_data_state = gr.State()
         file_data_state = gr.State([])
+        dataset_card_state = gr.State()
+        quality_report_state = gr.State()

         def update_model_info(model_key):
             if model_key in THAI_MODELS:
@@ -726,7 +790,7 @@ def create_interface():
             outputs=[json_download]
         )

-        return demo
+    return demo

 demo = create_interface()
 demo.launch()
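Note that the hunks above add the new quality/export controls, the extra download buttons, and the `dataset_card_state` / `quality_report_state` holders, but the visible changes do not connect the new buttons to handlers. A minimal, hypothetical wiring sketch (the handler name `save_dataset_card` and the output path are illustrative, not from this commit; it would have to live inside `create_interface()` before `return demo`):

```python
# Hypothetical wiring for the new "Dataset Card" button; not part of this commit.
def save_dataset_card(card_text):
    # dataset_card_state is assumed to hold the markdown produced by
    # DataQualityManager.generate_dataset_card()
    path = "README.md"
    with open(path, "w", encoding="utf-8") as f:
        f.write(card_text or "")
    # Returning a filepath updates the gr.File value; flipping visible=False
    # to True would additionally require a component update in the handler.
    return path

card_btn.click(
    fn=save_dataset_card,
    inputs=[dataset_card_state],
    outputs=[dataset_card_download],
)
```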
data_quality.py ADDED
@@ -0,0 +1,323 @@
+import pandas as pd
+import json
+import re
+import hashlib
+from typing import List, Dict, Tuple
+from collections import Counter
+import unicodedata
+from datetime import datetime
+
+class DataQualityManager:
+    """Data Quality Management and Standardization"""
+
+    def __init__(self):
+        self.quality_report = {}
+        self.cleaned_data = []
+
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize Thai text"""
+        if not text or not isinstance(text, str):
+            return ""
+
+        # Remove HTML tags
+        text = re.sub(r'<[^>]+>', '', text)
+
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Normalize Thai characters
+        text = unicodedata.normalize('NFC', text)
+
+        # Clean Thai specific issues
+        text = re.sub(r'ๆ+', 'ๆ', text)  # Multiple repetition marks
+        text = re.sub(r'[฿๏๎๚๛]', '', text)  # Remove special Thai symbols
+
+        # Remove URLs
+        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
+
+        # Remove email addresses
+        text = re.sub(r'\S+@\S+', '', text)
+
+        return text.strip()
+
+    def detect_duplicates(self, data: List[Dict]) -> Tuple[List[int], Dict]:
+        """Detect duplicate records"""
+        seen_hashes = {}
+        duplicates = []
+
+        for i, record in enumerate(data):
+            # Create hash from input content
+            content = str(record.get('prompt', '')) + str(record.get('input', ''))
+            content_hash = hashlib.md5(content.encode()).hexdigest()
+
+            if content_hash in seen_hashes:
+                duplicates.append(i)
+            else:
+                seen_hashes[content_hash] = i
+
+        return duplicates, {"total_duplicates": len(duplicates), "unique_records": len(seen_hashes)}
+
+    def validate_completeness(self, data: List[Dict]) -> Dict:
+        """Check data completeness"""
+        required_fields = ['id', 'prompt', 'generated_text']
+        incomplete_records = []
+
+        for i, record in enumerate(data):
+            missing_fields = [field for field in required_fields if not record.get(field)]
+            if missing_fields:
+                incomplete_records.append({
+                    'record_id': i,
+                    'missing_fields': missing_fields
+                })
+
+        return {
+            "incomplete_records": len(incomplete_records),
+            "details": incomplete_records[:10]  # Show first 10
+        }
+
+    def analyze_quality_metrics(self, data: List[Dict]) -> Dict:
+        """Analyze various quality metrics"""
+        if not data:
+            return {}
+
+        # Text length statistics
+        prompt_lengths = [len(str(record.get('prompt', ''))) for record in data]
+        output_lengths = [len(str(record.get('generated_text', ''))) for record in data]
+
+        # Language detection (simplified for Thai)
+        thai_pattern = re.compile(r'[ก-๏]')
+        thai_records = sum(1 for record in data if thai_pattern.search(str(record.get('generated_text', ''))))
+
+        # Model distribution
+        model_usage = Counter([record.get('model_used', 'unknown') for record in data])
+
+        return {
+            "total_records": len(data),
+            "avg_prompt_length": sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0,
+            "avg_output_length": sum(output_lengths) / len(output_lengths) if output_lengths else 0,
+            "thai_content_ratio": thai_records / len(data) if data else 0,
+            "model_distribution": dict(model_usage),
+            "length_stats": {
+                "min_prompt": min(prompt_lengths) if prompt_lengths else 0,
+                "max_prompt": max(prompt_lengths) if prompt_lengths else 0,
+                "min_output": min(output_lengths) if output_lengths else 0,
+                "max_output": max(output_lengths) if output_lengths else 0
+            }
+        }
+
+    def standardize_format(self, data: List[Dict], task_type: str) -> Tuple[List[Dict], Dict]:
+        """Standardize dataset format according to international standards"""
+        standardized_data = []
+
+        for i, record in enumerate(data):
+            # Create standardized record
+            std_record = {
+                "id": f"{task_type}_{i+1:06d}",
+                "task_type": task_type,
+                "input": self.clean_text(str(record.get('prompt', ''))),
+                "output": self.clean_text(str(record.get('generated_text', ''))),
+                "metadata": {
+                    "model_used": record.get('model_used', 'unknown'),
+                    "generation_time": record.get('generation_time'),
+                    "language": "th",
+                    "domain": self._detect_domain(record),
+                    "quality_score": self._calculate_quality_score(record)
+                }
+            }
+
+            # Add original data if available
+            if record.get('original_data'):
+                std_record["metadata"]["source_data"] = record['original_data']
+
+            standardized_data.append(std_record)
+
+        # Create dataset metadata
+        dataset_metadata = {
+            "dataset_name": f"thai_{task_type}_dataset",
+            "created_at": datetime.now().isoformat(),
+            "version": "1.0.0",
+            "language": "th",
+            "task_type": task_type,
+            "total_samples": len(standardized_data),
+            "license": "CC-BY-4.0",
+            "description": f"High-quality Thai {task_type} dataset generated using multiple language models"
+        }
+
+        return standardized_data, dataset_metadata
+
+    def _detect_domain(self, record: Dict) -> str:
+        """Detect domain/topic of the record"""
+        text = str(record.get('prompt', '')) + str(record.get('generated_text', ''))
+        text_lower = text.lower()
+
+        # Simple domain detection
+        if any(word in text_lower for word in ['สุขภาพ', 'โรค', 'ยา', 'แพทย์']):
+            return "health"
+        elif any(word in text_lower for word in ['การศึกษา', 'โรงเรียน', 'นักเรียน']):
+            return "education"
+        elif any(word in text_lower for word in ['เทคโนโลยี', 'คอมพิวเตอร์', 'โปรแกรม']):
+            return "technology"
+        elif any(word in text_lower for word in ['การเงิน', 'ธนาคาร', 'เงิน']):
+            return "finance"
+        else:
+            return "general"
+
+    def _calculate_quality_score(self, record: Dict) -> float:
+        """Calculate quality score for a record (0-1)"""
+        score = 1.0
+
+        prompt = str(record.get('prompt', ''))
+        output = str(record.get('generated_text', ''))
+
+        # Penalize very short outputs
+        if len(output) < 10:
+            score -= 0.3
+
+        # Penalize repetitive content
+        if len(set(output.split())) / len(output.split()) < 0.7 if output.split() else True:
+            score -= 0.2
+
+        # Penalize incomplete responses
+        if output.endswith('...') or len(output) < len(prompt) * 0.5:
+            score -= 0.2
+
+        # Bonus for Thai content
+        thai_pattern = re.compile(r'[ก-๏]')
+        if thai_pattern.search(output):
+            score += 0.1
+
+        return max(0.0, min(1.0, score))
+
+    def create_data_splits(self, data: List[Dict], train_ratio: float = 0.8,
+                           val_ratio: float = 0.1, test_ratio: float = 0.1) -> Dict:
+        """Create train/validation/test splits"""
+        import random
+
+        # Shuffle data
+        shuffled_data = data.copy()
+        random.shuffle(shuffled_data)
+
+        total = len(shuffled_data)
+        train_end = int(total * train_ratio)
+        val_end = train_end + int(total * val_ratio)
+
+        return {
+            "train": shuffled_data[:train_end],
+            "validation": shuffled_data[train_end:val_end],
+            "test": shuffled_data[val_end:]
+        }
+
+    def generate_dataset_card(self, metadata: Dict, quality_metrics: Dict) -> str:
+        """Generate dataset card (README) in markdown format"""
+
+        card_template = f"""# Thai {metadata['task_type'].title()} Dataset
+
+## Dataset Description
+
+This is a high-quality Thai {metadata['task_type']} dataset created using multiple state-of-the-art language models.
+
+## Dataset Information
+
+- **Language**: Thai (th)
+- **Task Type**: {metadata['task_type']}
+- **Total Samples**: {metadata['total_samples']:,}
+- **Created**: {metadata['created_at']}
+- **Version**: {metadata['version']}
+- **License**: {metadata['license']}
+
+## Quality Metrics
+
+- **Average Prompt Length**: {quality_metrics.get('avg_prompt_length', 0):.1f} characters
+- **Average Output Length**: {quality_metrics.get('avg_output_length', 0):.1f} characters
+- **Thai Content Ratio**: {quality_metrics.get('thai_content_ratio', 0):.2%}
+
+## Model Distribution
+
+{self._format_model_distribution(quality_metrics.get('model_distribution', {}))}
+
+## Data Fields
+
+- `id`: Unique identifier for each sample
+- `task_type`: Type of NLP task
+- `input`: Input prompt or question
+- `output`: Generated response or answer
+- `metadata`: Additional information including model used, quality score, etc.
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("path/to/dataset")
+```
+
+## License
+
+This dataset is released under {metadata['license']} license.
+
+## Citation
+
+If you use this dataset in your research, please cite:
+
+```bibtex
+@dataset{{thai_{metadata['task_type']}_dataset,
+  title={{Thai {metadata['task_type'].title()} Dataset}},
+  author={{Thai Dataset Generator}},
+  year={{{datetime.now().year}}},
+  version={{{metadata['version']}}},
+  url={{https://github.com/your-repo/thai-dataset}}
+}}
+```
+"""
+        return card_template
+
+    def _format_model_distribution(self, model_dist: Dict) -> str:
+        """Format model distribution for markdown"""
+        if not model_dist:
+            return "No model distribution data available."
+
+        lines = []
+        for model, count in model_dist.items():
+            lines.append(f"- **{model}**: {count:,} samples")
+
+        return "\n".join(lines)
+
+def export_to_huggingface_format(data_splits: Dict, metadata: Dict, output_dir: str):
+    """Export dataset in Hugging Face compatible format"""
+    import os
+    import json
+
+    # Create output directory
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Save data splits
+    for split_name, split_data in data_splits.items():
+        with open(os.path.join(output_dir, f"{split_name}.jsonl"), 'w', encoding='utf-8') as f:
+            for record in split_data:
+                f.write(json.dumps(record, ensure_ascii=False) + '\n')
+
+    # Save dataset info
+    dataset_info = {
+        "dataset_name": metadata["dataset_name"],
+        "config_name": "default",
+        "version": {"version_str": metadata["version"]},
+        "description": metadata["description"],
+        "homepage": "",
+        "license": metadata["license"],
+        "features": {
+            "id": {"dtype": "string"},
+            "task_type": {"dtype": "string"},
+            "input": {"dtype": "string"},
+            "output": {"dtype": "string"},
+            "metadata": {"dtype": "string"}
+        },
+        "splits": {
+            split_name: {"name": split_name, "num_examples": len(split_data)}
+            for split_name, split_data in data_splits.items()
+        }
+    }

+    with open(os.path.join(output_dir, "dataset_info.json"), 'w', encoding='utf-8') as f:
+        json.dump(dataset_info, f, ensure_ascii=False, indent=2)
+
+    print(f"Dataset exported to {output_dir}")
requirements.txt CHANGED
@@ -6,3 +6,8 @@ datasets>=2.0.0
 accelerate>=0.20.0
 numpy>=1.21.0
 requests>=2.28.0
+pyarrow>=10.0.0
+fasttext>=0.9.2
+langdetect>=1.0.9
+clean-text>=0.6.0
+jsonlines>=3.1.0