Nattapong Tapachoom committed
Commit e7a189a · 1 Parent(s): d53141f

Add data quality management features and update requirements

Files changed (3):
  1. app.py +69 -5
  2. data_quality.py +323 -0
  3. requirements.txt +5 -0
app.py CHANGED
@@ -11,6 +11,7 @@ import time
 import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
+from data_quality import DataQualityManager, export_to_huggingface_format

 # Predefined task templates with Thai language support
 TASK_TEMPLATES = {
@@ -443,8 +444,8 @@ def generate_dataset_multi_model(selected_models: List[str], task_type: str, cus

 def create_interface():
     with gr.Blocks(title="🇹🇭 Thai Dataset Generator with Hugging Face", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยด้วย Hugging Face")
-        gr.Markdown("สร้างชุดข้อมูลภาษาไทยคุณภาพสูงโดยใช้โมเดลหลายตัวทำงานร่วมกัน")
+        gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง")
+        gr.Markdown("สร้างชุดข้อมูลภาษาไทยคุณภาพสูง สะอาด และเป็นสากลด้วยโมเดลหลายตัว")

         with gr.Row():
             with gr.Column():
@@ -551,24 +552,75 @@ def create_interface():
                         label="ความหลากหลาย (Top-p)"
                     )

+                    # Data Quality Settings
+                    gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล")
+
+                    enable_cleaning = gr.Checkbox(
+                        label="เปิดใช้การทำความสะอาดข้อมูล",
+                        value=True
+                    )
+
+                    remove_duplicates = gr.Checkbox(
+                        label="ลบข้อมูลซ้ำซ้อน",
+                        value=True
+                    )
+
+                    min_quality_score = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.5,
+                        step=0.1,
+                        label="คะแนนคุณภาพขั้นต่ำ (0-1)"
+                    )
+
+                    # Export Settings
+                    gr.Markdown("### 📦 การส่งออกข้อมูล")
+
+                    create_splits = gr.Checkbox(
+                        label="แบ่งข้อมูล Train/Validation/Test",
+                        value=True
+                    )
+
+                    export_format = gr.Radio(
+                        choices=[
+                            ("📊 CSV + JSON (พื้นฐาน)", "standard"),
+                            ("🤗 Hugging Face Dataset (มาตรฐานสากล)", "huggingface"),
+                            ("📋 JSONL (สำหรับ Fine-tuning)", "jsonl")
+                        ],
+                        value="huggingface",
+                        label="รูปแบบการส่งออก"
+                    )
+
                     generate_btn = gr.Button("🚀 สร้างชุดข้อมูลแบบทีมเวิร์ก", variant="primary", size="lg")

                 with gr.Column():
                     with gr.Tabs():
                         with gr.TabItem("📊 ตัวอย่างข้อมูล"):
                             dataset_preview = gr.Dataframe(
-                                headers=["id", "ประเภทงาน", "คำสั่ง", "ผลลัพธ์"],
+                                headers=["id", "task_type", "input", "output", "quality_score"],
                                 interactive=False
                             )

+                        with gr.TabItem("📈 รายงานคุณภาพ"):
+                            quality_report = gr.JSON(
+                                label="รายงานคุณภาพข้อมูล",
+                                visible=True
+                            )
+
+                            quality_summary = gr.Markdown(
+                                value="สร้างข้อมูลเสร็จแล้วจึงจะแสดงรายงานคุณภาพ"
+                            )
+
                         with gr.TabItem("💾 ดาวน์โหลด"):
-                            gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลที่สร้างแล้ว")
+                            gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลคุณภาพสูง")

                             download_info = gr.Markdown("สร้างข้อมูลเสร็จแล้วจึงจะสามารถดาวน์โหลดได้")

                             with gr.Row():
                                 csv_btn = gr.Button("📄 ดาวน์โหลด CSV", variant="secondary")
                                 json_btn = gr.Button("📋 ดาวน์โหลด JSON", variant="secondary")
+                                hf_btn = gr.Button("🤗 ดาวน์โหลด HF Dataset", variant="secondary")
+                                card_btn = gr.Button("📖 ดาวน์โหลด Dataset Card", variant="secondary")

                             csv_download = gr.File(
                                 label="ไฟล์ CSV",
@@ -579,6 +631,16 @@ def create_interface():
                                 label="ไฟล์ JSON",
                                 visible=False
                             )
+
+                            dataset_card_download = gr.File(
+                                label="Dataset Card (README.md)",
+                                visible=False
+                            )
+
+                            hf_dataset_download = gr.File(
+                                label="Hugging Face Dataset",
+                                visible=False
+                            )

                         with gr.TabItem("📖 คู่มือการใช้งาน"):
                             gr.Markdown("""
@@ -617,6 +679,8 @@ def create_interface():
         csv_data_state = gr.State()
         json_data_state = gr.State()
         file_data_state = gr.State([])
+        dataset_card_state = gr.State()
+        quality_report_state = gr.State()

         def update_model_info(model_key):
             if model_key in THAI_MODELS:
@@ -726,7 +790,7 @@ def create_interface():
             outputs=[json_download]
         )

-        return demo
+    return demo

 demo = create_interface()
 demo.launch()
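Note that the hunks above add the new quality/export controls, the extra download buttons, and the `dataset_card_state` / `quality_report_state` holders, but the visible changes do not connect the new buttons to handlers. A minimal, hypothetical wiring sketch (the handler name `save_dataset_card` and the output path are illustrative, not from this commit; it would have to live inside `create_interface()` before `return demo`):

```python
# Hypothetical wiring for the new "Dataset Card" button; not part of this commit.
def save_dataset_card(card_text):
    # dataset_card_state is assumed to hold the markdown produced by
    # DataQualityManager.generate_dataset_card()
    path = "README.md"
    with open(path, "w", encoding="utf-8") as f:
        f.write(card_text or "")
    # Returning a filepath updates the gr.File value; flipping visible=False
    # to True would additionally require a component update in the handler.
    return path

card_btn.click(
    fn=save_dataset_card,
    inputs=[dataset_card_state],
    outputs=[dataset_card_download],
)
```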
data_quality.py ADDED
@@ -0,0 +1,323 @@
+import pandas as pd
+import json
+import re
+import hashlib
+from typing import List, Dict, Tuple
+from collections import Counter
+import unicodedata
+from datetime import datetime
+
+class DataQualityManager:
+    """Data Quality Management and Standardization"""
+
+    def __init__(self):
+        self.quality_report = {}
+        self.cleaned_data = []
+
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize Thai text"""
+        if not text or not isinstance(text, str):
+            return ""
+
+        # Remove HTML tags
+        text = re.sub(r'<[^>]+>', '', text)
+
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Normalize Thai characters
+        text = unicodedata.normalize('NFC', text)
+
+        # Clean Thai specific issues
+        text = re.sub(r'ๆ+', 'ๆ', text)  # Multiple repetition marks
+        text = re.sub(r'[฿๏๎๚๛]', '', text)  # Remove special Thai symbols
+
+        # Remove URLs
+        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
+
+        # Remove email addresses
+        text = re.sub(r'\S+@\S+', '', text)
+
+        return text.strip()
+
+    def detect_duplicates(self, data: List[Dict]) -> Tuple[List[int], Dict]:
+        """Detect duplicate records"""
+        seen_hashes = {}
+        duplicates = []
+
+        for i, record in enumerate(data):
+            # Create hash from input content
+            content = str(record.get('prompt', '')) + str(record.get('input', ''))
+            content_hash = hashlib.md5(content.encode()).hexdigest()
+
+            if content_hash in seen_hashes:
+                duplicates.append(i)
+            else:
+                seen_hashes[content_hash] = i
+
+        return duplicates, {"total_duplicates": len(duplicates), "unique_records": len(seen_hashes)}
+
+    def validate_completeness(self, data: List[Dict]) -> Dict:
+        """Check data completeness"""
+        required_fields = ['id', 'prompt', 'generated_text']
+        incomplete_records = []
+
+        for i, record in enumerate(data):
+            missing_fields = [field for field in required_fields if not record.get(field)]
+            if missing_fields:
+                incomplete_records.append({
+                    'record_id': i,
+                    'missing_fields': missing_fields
+                })
+
+        return {
+            "incomplete_records": len(incomplete_records),
+            "details": incomplete_records[:10]  # Show first 10
+        }
+
+    def analyze_quality_metrics(self, data: List[Dict]) -> Dict:
+        """Analyze various quality metrics"""
+        if not data:
+            return {}
+
+        # Text length statistics
+        prompt_lengths = [len(str(record.get('prompt', ''))) for record in data]
+        output_lengths = [len(str(record.get('generated_text', ''))) for record in data]
+
+        # Language detection (simplified for Thai)
+        thai_pattern = re.compile(r'[ก-๏]')
+        thai_records = sum(1 for record in data if thai_pattern.search(str(record.get('generated_text', ''))))
+
+        # Model distribution
+        model_usage = Counter([record.get('model_used', 'unknown') for record in data])
+
+        return {
+            "total_records": len(data),
+            "avg_prompt_length": sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0,
+            "avg_output_length": sum(output_lengths) / len(output_lengths) if output_lengths else 0,
+            "thai_content_ratio": thai_records / len(data) if data else 0,
+            "model_distribution": dict(model_usage),
+            "length_stats": {
+                "min_prompt": min(prompt_lengths) if prompt_lengths else 0,
+                "max_prompt": max(prompt_lengths) if prompt_lengths else 0,
+                "min_output": min(output_lengths) if output_lengths else 0,
+                "max_output": max(output_lengths) if output_lengths else 0
+            }
+        }
+
+    def standardize_format(self, data: List[Dict], task_type: str) -> Tuple[List[Dict], Dict]:
+        """Standardize dataset format according to international standards"""
+        standardized_data = []
+
+        for i, record in enumerate(data):
+            # Create standardized record
+            std_record = {
+                "id": f"{task_type}_{i+1:06d}",
+                "task_type": task_type,
+                "input": self.clean_text(str(record.get('prompt', ''))),
+                "output": self.clean_text(str(record.get('generated_text', ''))),
+                "metadata": {
+                    "model_used": record.get('model_used', 'unknown'),
+                    "generation_time": record.get('generation_time'),
+                    "language": "th",
+                    "domain": self._detect_domain(record),
+                    "quality_score": self._calculate_quality_score(record)
+                }
+            }
+
+            # Add original data if available
+            if record.get('original_data'):
+                std_record["metadata"]["source_data"] = record['original_data']
+
+            standardized_data.append(std_record)
+
+        # Create dataset metadata
+        dataset_metadata = {
+            "dataset_name": f"thai_{task_type}_dataset",
+            "created_at": datetime.now().isoformat(),
+            "version": "1.0.0",
+            "language": "th",
+            "task_type": task_type,
+            "total_samples": len(standardized_data),
+            "license": "CC-BY-4.0",
+            "description": f"High-quality Thai {task_type} dataset generated using multiple language models"
+        }
+
+        return standardized_data, dataset_metadata
+
+    def _detect_domain(self, record: Dict) -> str:
+        """Detect domain/topic of the record"""
+        text = str(record.get('prompt', '')) + str(record.get('generated_text', ''))
+        text_lower = text.lower()
+
+        # Simple domain detection
+        if any(word in text_lower for word in ['สุขภาพ', 'โรค', 'ยา', 'แพทย์']):
+            return "health"
+        elif any(word in text_lower for word in ['การศึกษา', 'โรงเรียน', 'นักเรียน']):
+            return "education"
+        elif any(word in text_lower for word in ['เทคโนโลยี', 'คอมพิวเตอร์', 'โปรแกรม']):
+            return "technology"
+        elif any(word in text_lower for word in ['การเงิน', 'ธนาคาร', 'เงิน']):
+            return "finance"
+        else:
+            return "general"
+
+    def _calculate_quality_score(self, record: Dict) -> float:
+        """Calculate quality score for a record (0-1)"""
+        score = 1.0
+
+        prompt = str(record.get('prompt', ''))
+        output = str(record.get('generated_text', ''))
+
+        # Penalize very short outputs
+        if len(output) < 10:
+            score -= 0.3
+
+        # Penalize repetitive content
+        if len(set(output.split())) / len(output.split()) < 0.7 if output.split() else True:
+            score -= 0.2
+
+        # Penalize incomplete responses
+        if output.endswith('...') or len(output) < len(prompt) * 0.5:
+            score -= 0.2
+
+        # Bonus for Thai content
+        thai_pattern = re.compile(r'[ก-๏]')
+        if thai_pattern.search(output):
+            score += 0.1
+
+        return max(0.0, min(1.0, score))
+
+    def create_data_splits(self, data: List[Dict], train_ratio: float = 0.8,
+                           val_ratio: float = 0.1, test_ratio: float = 0.1) -> Dict:
+        """Create train/validation/test splits"""
+        import random
+
+        # Shuffle data
+        shuffled_data = data.copy()
+        random.shuffle(shuffled_data)
+
+        total = len(shuffled_data)
+        train_end = int(total * train_ratio)
+        val_end = train_end + int(total * val_ratio)
+
+        return {
+            "train": shuffled_data[:train_end],
+            "validation": shuffled_data[train_end:val_end],
+            "test": shuffled_data[val_end:]
+        }
+
+    def generate_dataset_card(self, metadata: Dict, quality_metrics: Dict) -> str:
+        """Generate dataset card (README) in markdown format"""
+
+        card_template = f"""# Thai {metadata['task_type'].title()} Dataset
+
+## Dataset Description
+
+This is a high-quality Thai {metadata['task_type']} dataset created using multiple state-of-the-art language models.
+
+## Dataset Information
+
+- **Language**: Thai (th)
+- **Task Type**: {metadata['task_type']}
+- **Total Samples**: {metadata['total_samples']:,}
+- **Created**: {metadata['created_at']}
+- **Version**: {metadata['version']}
+- **License**: {metadata['license']}
+
+## Quality Metrics
+
+- **Average Prompt Length**: {quality_metrics.get('avg_prompt_length', 0):.1f} characters
+- **Average Output Length**: {quality_metrics.get('avg_output_length', 0):.1f} characters
+- **Thai Content Ratio**: {quality_metrics.get('thai_content_ratio', 0):.2%}
+
+## Model Distribution
+
+{self._format_model_distribution(quality_metrics.get('model_distribution', {}))}
+
+## Data Fields
+
+- `id`: Unique identifier for each sample
+- `task_type`: Type of NLP task
+- `input`: Input prompt or question
+- `output`: Generated response or answer
+- `metadata`: Additional information including model used, quality score, etc.
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("path/to/dataset")
+```
+
+## License
+
+This dataset is released under {metadata['license']} license.
+
+## Citation
+
+If you use this dataset in your research, please cite:
+
+```bibtex
+@dataset{{thai_{metadata['task_type']}_dataset,
+  title={{Thai {metadata['task_type'].title()} Dataset}},
+  author={{Thai Dataset Generator}},
+  year={{{datetime.now().year}}},
+  version={{{metadata['version']}}},
+  url={{https://github.com/your-repo/thai-dataset}}
+}}
+```
+"""
+        return card_template
+
+    def _format_model_distribution(self, model_dist: Dict) -> str:
+        """Format model distribution for markdown"""
+        if not model_dist:
+            return "No model distribution data available."
+
+        lines = []
+        for model, count in model_dist.items():
+            lines.append(f"- **{model}**: {count:,} samples")
+
+        return "\n".join(lines)
+
+def export_to_huggingface_format(data_splits: Dict, metadata: Dict, output_dir: str):
+    """Export dataset in Hugging Face compatible format"""
+    import os
+    import json
+
+    # Create output directory
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Save data splits
+    for split_name, split_data in data_splits.items():
+        with open(os.path.join(output_dir, f"{split_name}.jsonl"), 'w', encoding='utf-8') as f:
+            for record in split_data:
+                f.write(json.dumps(record, ensure_ascii=False) + '\n')
+
+    # Save dataset info
+    dataset_info = {
+        "dataset_name": metadata["dataset_name"],
+        "config_name": "default",
+        "version": {"version_str": metadata["version"]},
+        "description": metadata["description"],
+        "homepage": "",
+        "license": metadata["license"],
+        "features": {
+            "id": {"dtype": "string"},
+            "task_type": {"dtype": "string"},
+            "input": {"dtype": "string"},
+            "output": {"dtype": "string"},
+            "metadata": {"dtype": "string"}
+        },
+        "splits": {
+            split_name: {"name": split_name, "num_examples": len(split_data)}
+            for split_name, split_data in data_splits.items()
+        }
+    }

+    with open(os.path.join(output_dir, "dataset_info.json"), 'w', encoding='utf-8') as f:
+        json.dump(dataset_info, f, ensure_ascii=False, indent=2)
+
+    print(f"Dataset exported to {output_dir}")
requirements.txt CHANGED
@@ -6,3 +6,8 @@ datasets>=2.0.0
 accelerate>=0.20.0
 numpy>=1.21.0
 requests>=2.28.0
+pyarrow>=10.0.0
+fasttext>=0.9.2
+langdetect>=1.0.9
+clean-text>=0.6.0
+jsonlines>=3.1.0