Spaces:

Mohzen321
/

AboutKeywords

Sleeping

App Files Files Community

Mohzen321 committed on Mar 1

Commit

b08284c

verified ·

1 Parent(s): 68c151b

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -253

app.py CHANGED Viewed

@@ -1,280 +1,82 @@
 import streamlit as st
 from transformers import pipeline
 import re
-import time
 # تحميل النموذج
 classifier = pipeline("zero-shot-classification", model="cross-encoder/nli-distilroberta-base")
 # عنوان التطبيق
-st.title("Text Classification App")
-# اختيار العملية
-operation = st.radio("Choose an operation:", ["Filter Keywords", "Extra & Filter Param (URLs)"])
 # إدخال الملف النصي
-uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
 if uploaded_file is not None:
     # قراءة الملف النصي
     content = uploaded_file.read().decode("utf-8")
-    items = [line.strip() for line in content.splitlines() if line.strip()]
-    # تحديد الفئات
-    categories = ["shop", "game", "stream"]
     # قوائم لتخزين النتائج
-    shopping_items = []
-    gaming_items = []
-    streaming_items = []
-    unknown_items = []
-    # قوائم خاصة بالباراميترات
-    param_categories = {
-        "shop_params": [],
-        "game_params": [],
-        "stream_params": [],
-        "unknown_params": []
-    }
-    # قائمة لتحليل الصيغ (Extensions)
-    extensions = {}
-    # قائمة لتحليل أنماط الصفحات الكاملة (Full PageType)
     full_page_types = []
-    # متغيرات للتحكم في العملية
-    progress_bar = st.progress(0)
-    pause_button = st.button("Pause")
-    stop_button = st.button("Stop")
-    continue_button = st.button("Continue")
-    paused = False
-    stopped = False
-    current_index = 0  # مؤشر للكلمة الحالية
-    batch_size = 10  # عدد العناصر التي يتم معالجتها في الدفعة الواحدة
-    # دالة تصنيف الكلمات باستخدام الدفعات
-    def classify_keywords_batch(items, categories, start_index=0):
-        global paused, stopped, current_index
-        total_items = len(items)
-        for i in range(start_index, total_items, batch_size):
-            if stopped:
-                break
-            if paused:
-                time.sleep(0.5)
-                continue
-            # معالجة دفعة من العناصر
-            batch = items[i:i + batch_size]
-            results = classifier(batch, categories)
-            for j, result in enumerate(results):
-                best_category = result['labels'][0]
-                score = result['scores'][0]
-                if best_category == "shop" and score > 0.5:
-                    shopping_items.append(batch[j])
-                elif best_category == "game" and score > 0.5:
-                    gaming_items.append(batch[j])
-                elif best_category == "stream" and score > 0.5:
-                    streaming_items.append(batch[j])
-                else:
-                    unknown_items.append(batch[j])
-            # تحديث المؤشر الحالي
-            current_index = min(i + batch_size, total_items)  # تأكد من عدم تجاوز المؤشر للعدد الإجمالي
-            # تحديث شريط التقدم
-            progress = min((current_index) / total_items, 1.0)  # تأكد من أن قيمة التقدم لا تتجاوز 1.0
-            progress_bar.progress(progress)
-            # تحديث النتائج في الوقت الحقيقي
-            update_results()
-            # إبطاء العملية قليلاً للسماح بتحديث الواجهة
-            time.sleep(0.1)
-    # دالة تصنيف الباراميترات
-    def classify_parameters(items, categories, start_index=0):
-        global paused, stopped, current_index
-        total_items = len(items)
-        for i in range(start_index, total_items, batch_size):
-            if stopped:
-                break
-            if paused:
-                time.sleep(0.5)
-                continue
-            # معالجة دفعة من الروابط
-            batch = items[i:i + batch_size]
-            for url in batch:
-                # استخراج الباراميترات من الرابط باستخدام RegEx
-                params = re.findall(r'(\w+)=\w+', url)
-                for param in params:
-                    # تصنيف الباراميتر باستخدام zero-shot-classification
-                    result = classifier(param, categories)
-                    best_category = result['labels'][0]
-                    score = result['scores'][0]
-                    if best_category == "shop" and score > 0.5:
-                        param_categories["shop_params"].append(param)
-                    elif best_category == "game" and score > 0.5:
-                        param_categories["game_params"].append(param)
-                    elif best_category == "stream" and score > 0.5:
-                        param_categories["stream_params"].append(param)
-                    else:
-                        param_categories["unknown_params"].append(param)
-                # استخراج الصيغ (Extensions) من الروابط
-                match = re.search(r'\.([a-zA-Z0-9]+)$', url)
-                if match:
-                    ext = match.group(1)
-                    if ext not in extensions:
-                        extensions[ext] = 0
-                    extensions[ext] += 1
-                # استخراج أنماط الصفحات الكاملة (Full PageType)
-                page_type_match = re.search(r'(\w+\.php|\w+\.html)\?', url)
-                if page_type_match:
-                    page_type = page_type_match.group(1)
-                    if page_type not in full_page_types:
-                        full_page_types.append(page_type)
-            # تحديث المؤشر الحالي
-            current_index = min(i + batch_size, total_items)  # تأكد من عدم تجاوز المؤشر للعدد الإجمالي
-            # تحديث شريط التقدم
-            progress = min((current_index) / total_items, 1.0)  # تأكد من أن قيمة التقدم لا تتجاوز 1.0
-            progress_bar.progress(progress)
-            # تحديث النتائج في الوقت الحقيقي
-            update_results()
-            # إبطاء العملية قليلاً للسماح بتحديث الواجهة
-            time.sleep(0.1)
-    # دالة تحديث النتائج
-    def update_results():
-        # تحديث محتوى المربعات النصية
-        st.session_state.shopping_text = "\n".join(shopping_items)
-        st.session_state.gaming_text = "\n".join(gaming_items)
-        st.session_state.streaming_text = "\n".join(streaming_items)
-        st.session_state.unknown_text = "\n".join(unknown_items)
-        # تحديث محتوى المربعات الخاصة بالباراميترات
-        st.session_state.shop_params = "\n".join(set(param_categories["shop_params"]))
-        st.session_state.game_params = "\n".join(set(param_categories["game_params"]))
-        st.session_state.stream_params = "\n".join(set(param_categories["stream_params"]))
-        st.session_state.unknown_params = "\n".join(set(param_categories["unknown_params"]))
-        # تحديث محتوى المربع الخاص بالصيغ
-        st.session_state.extensions_text = "\n".join(extensions.keys())
-        # تحديث محتوى المربع الخاص بأنماط الصفحات الكاملة
-        st.session_state.full_page_types = "\n".join(full_page_types)
-    # دالة تصدير النتائج
-    def export_results(key, filename):
-        with open(filename, "w") as f:
-            f.write(st.session_state[key])
-        st.success(f"Results exported to {filename}")
     # زر البدء
     if st.button("Start"):
-        stopped = False
-        paused = False
-        current_index = 0
-        if operation == "Filter Keywords":
-            classify_keywords_batch(items, categories, start_index=current_index)
-        elif operation == "Extra & Filter Param (URLs)":
-            classify_parameters(items, categories, start_index=current_index)
-    # زر الإيقاف المؤقت
-    if pause_button:
-        paused = True
-        st.write("Classification paused.")
-    # زر الاستمرار
-    if continue_button and paused:
-        paused = False
-        st.write("Classification resumed.")
-        if operation == "Filter Keywords":
-            classify_keywords_batch(items, categories, start_index=current_index)
-        elif operation == "Extra & Filter Param (URLs)":
-            classify_parameters(items, categories, start_index=current_index)
-    # زر التوقف الكامل
-    if stop_button:
-        stopped = True
-        st.write("Classification stopped.")
-    # عرض النتائج بناءً على الخيار المختار
-    if operation == "Filter Keywords":
-        # عرض النتائج للكلمات المفتاحية
-        st.header("Shopping Keywords")
-        if 'shopping_text' not in st.session_state:
-            st.session_state.shopping_text = ""
-        st.text_area("Copy the shopping keywords here:", value=st.session_state.shopping_text, height=200, key="shopping")
-        st.button("Export Shopping Keywords", on_click=export_results, args=("shopping_text", "shopping_keywords.txt"))
-        st.header("Gaming Keywords")
-        if 'gaming_text' not in st.session_state:
-            st.session_state.gaming_text = ""
-        st.text_area("Copy the gaming keywords here:", value=st.session_state.gaming_text, height=200, key="gaming")
-        st.button("Export Gaming Keywords", on_click=export_results, args=("gaming_text", "gaming_keywords.txt"))
-        st.header("Streaming Keywords")
-        if 'streaming_text' not in st.session_state:
-            st.session_state.streaming_text = ""
-        st.text_area("Copy the streaming keywords here:", value=st.session_state.streaming_text, height=200, key="streaming")
-        st.button("Export Streaming Keywords", on_click=export_results, args=("streaming_text", "streaming_keywords.txt"))
-        st.header("Unknown Keywords")
-        if 'unknown_text' not in st.session_state:
-            st.session_state.unknown_text = ""
-        st.text_area("Copy the unknown keywords here:", value=st.session_state.unknown_text, height=200, key="unknown")
-        st.button("Export Unknown Keywords", on_click=export_results, args=("unknown_text", "unknown_keywords.txt"))
-    elif operation == "Extra & Filter Param (URLs)":
-        # عرض النتائج للباراميترات
-        st.header("Shop Parameters")
-        if 'shop_params' not in st.session_state:
-            st.session_state.shop_params = ""
-        st.text_area("Copy the shop parameters here:", value=st.session_state.shop_params, height=200, key="shop_params")
-        st.button("Export Shop Parameters", on_click=export_results, args=("shop_params", "shop_params.txt"))
-        st.header("Game Parameters")
-        if 'game_params' not in st.session_state:
-            st.session_state.game_params = ""
-        st.text_area("Copy the game parameters here:", value=st.session_state.game_params, height=200, key="game_params")
-        st.button("Export Game Parameters", on_click=export_results, args=("game_params", "game_params.txt"))
-        st.header("Stream Parameters")
-        if 'stream_params' not in st.session_state:
-            st.session_state.stream_params = ""
-        st.text_area("Copy the stream parameters here:", value=st.session_state.stream_params, height=200, key="stream_params")
-        st.button("Export Stream Parameters", on_click=export_results, args=("stream_params", "stream_params.txt"))
-        st.header("Unknown Parameters")
-        if 'unknown_params' not in st.session_state:
-            st.session_state.unknown_params = ""
-        st.text_area("Copy the unknown parameters here:", value=st.session_state.unknown_params, height=200, key="unknown_params")
-        st.button("Export Unknown Parameters", on_click=export_results, args=("unknown_params", "unknown_params.txt"))
-        # عرض الصيغ (Extensions)
-        st.header("File Extensions")
-        if 'extensions_text' not in st.session_state:
-            st.session_state.extensions_text = ""
-        st.text_area("Copy the file extensions here:", value=st.session_state.extensions_text, height=200, key="extensions")
-        st.button("Export File Extensions", on_click=export_results, args=("extensions_text", "file_extensions.txt"))
-        # عرض أنماط الصفحات الكاملة (Full PageType)
-        st.header("Full PageType")
-        if 'full_page_types' not in st.session_state:
-            st.session_state.full_page_types = ""
-        st.text_area("Copy the full page types here:", value=st.session_state.full_page_types, height=200, key="full_page_types")
-        st.button("Export Full PageTypes", on_click=export_results, args=("full_page_types", "full_page_types.txt"))
 else:
-    st.warning("Please upload a text file to start classification.")

 import streamlit as st
 from transformers import pipeline
 import re
 # تحميل النموذج
 classifier = pipeline("zero-shot-classification", model="cross-encoder/nli-distilroberta-base")
 # عنوان التطبيق
+st.title("URL Analysis App")
 # إدخال الملف النصي
+uploaded_file = st.file_uploader("Upload a text file containing URLs", type=["txt"])
 if uploaded_file is not None:
     # قراءة الملف النصي
     content = uploaded_file.read().decode("utf-8")
+    urls = [line.strip() for line in content.splitlines() if line.strip()]
     # قوائم لتخزين النتائج
+    parameters = []
+    domains = []
     full_page_types = []
+    file_extensions = []
+    # دالة تحليل الروابط
+    def analyze_urls(urls):
+        for url in urls:
+            # استخراج الباراميترات باستخدام RegEx
+            params = re.findall(r'(\w+)=', url)
+            parameters.extend(params)
+            # استخراج النطاقات (.com, .uk, .au)
+            domain_match = re.search(r'\.([a-zA-Z]+)', url)
+            if domain_match:
+                domain = domain_match.group(1)
+                if domain not in domains:
+                    domains.append(domain)
+            # استخراج أنماط الصفحات الكاملة (product_detail.php?, viewtopic.php?)
+            page_type_match = re.search(r'(\w+\.[a-z]+)\?', url)
+            if page_type_match:
+                page_type = page_type_match.group(1)
+                if page_type not in full_page_types:
+                    full_page_types.append(page_type)
+            # استخراج الصيغ (php, phtml, asp) بدون علامات الاستفهام
+            extension_match = re.search(r'(\w+\.[a-z]+)(\?|$)', url)
+            if extension_match:
+                extension = extension_match.group(1).split('?')[0]
+                if extension not in file_extensions:
+                    file_extensions.append(extension)
     # زر البدء
     if st.button("Start"):
+        # تحليل الروابط
+        analyze_urls(urls)
+    # إزالة التكرارات من القوائم
+    parameters = list(set(parameters))
+    domains = list(set(domains))
+    full_page_types = list(set(full_page_types))
+    file_extensions = list(set(file_extensions))
+    # عرض النتائج
+    st.header("Parameters")
+    st.text_area("Copy the parameters here:", value="\n".join(parameters), height=200, key="parameters")
+    st.button("Copy Parameters", on_click=lambda: st.clipboard.copy("\n".join(parameters)))
+    st.header("Domains")
+    st.text_area("Copy the domains here:", value="\n".join(domains), height=200, key="domains")
+    st.button("Copy Domains", on_click=lambda: st.clipboard.copy("\n".join(domains)))
+    st.header("Full PageType")
+    st.text_area("Copy the full page types here:", value="\n".join(full_page_types), height=200, key="full_page_types")
+    st.button("Copy Full PageTypes", on_click=lambda: st.clipboard.copy("\n".join(full_page_types)))
+    st.header("File Extensions")
+    st.text_area("Copy the file extensions here:", value="\n".join(file_extensions), height=200, key="file_extensions")
+    st.button("Copy File Extensions", on_click=lambda: st.clipboard.copy("\n".join(file_extensions)))
 else:
+    st.warning("Please upload a text file containing URLs to start analysis.")