File size: 4,463 Bytes
d090cc4
6370c8d
d090cc4
 
 
3e11c25
470923a
3e11c25
470923a
 
3e11c25
d090cc4
470923a
3e11c25
470923a
 
3e11c25
 
470923a
3e11c25
d090cc4
126e9d0
d090cc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126e9d0
3e11c25
 
470923a
3e11c25
 
470923a
3e11c25
d090cc4
126e9d0
3e11c25
470923a
 
 
 
d090cc4
 
063c063
470923a
d090cc4
3e11c25
470923a
 
916ed06
6370c8d
916ed06
 
3e11c25
126e9d0
470923a
3e11c25
 
 
 
 
 
 
 
470923a
3e11c25
d090cc4
 
 
3e11c25
d090cc4
 
3e11c25
d090cc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e11c25
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gradio as gr
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import pipeline
import logging
import sys

# Send log records to stdout (the same stream the print() diagnostics use);
# only records at ERROR level or above are emitted.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
logger = logging.getLogger(__name__)

# Load a lighter summarization model at start-up. Any failure is reported on
# the console and leaves `summarizer` as None so the app can still start.
try:
    summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-6-6")
    print("Summarizer loaded successfully")
except Exception as load_error:
    logger.error(f"Summarizer loading failed: {load_error}")
    print(f"Summarizer error: {load_error}")
    summarizer = None

# Sample dataset: each record carries a title, a one-line abstract and the
# content text that is embedded for search and passed to the summarizer.
papers = [
    dict(
        title="Machine Learning in Healthcare",
        abstract="Explores the use of ML to improve diagnostic accuracy.",
        content="This paper discusses how machine learning models can enhance healthcare diagnostics by analyzing patient data...",
    ),
    dict(
        title="Natural Language Processing Advances",
        abstract="Reviews recent NLP techniques and their applications.",
        content="Recent advances in NLP have enabled better text understanding, with models like BERT leading the way...",
    ),
    dict(
        title="Climate Change Mitigation",
        abstract="Analyzes strategies for reducing carbon emissions.",
        content="This study proposes new strategies for mitigating climate change through technology and policy...",
    ),
]

# Load the sentence-embedding model. A failure is reported on the console and
# leaves `model` as None so the rest of the module can still be imported.
try:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("SentenceTransformer loaded successfully")
except Exception as e:
    logger.error(f"SentenceTransformer loading failed: {e}")
    print(f"SentenceTransformer error: {e}")
    model = None

# Pre-compute one embedding per paper from its "content" text.
# The encode call is guarded like the model loads above: if it raises,
# `paper_embeddings` stays None and the failure message below is printed
# instead of the whole app crashing at start-up.
paper_embeddings = None
# Explicit None test: do not depend on the truthiness of the model object.
if model is not None:
    try:
        paper_embeddings = model.encode(
            [paper["content"] for paper in papers],
            convert_to_tensor=True,
        )
    except Exception as e:
        logger.error(f"Embeddings computation failed: {e}")
        print(f"Embeddings error: {e}")

if paper_embeddings is not None:
    print("Embeddings computed successfully")
else:
    print("Embeddings computation failed")

def semantic_search(query):
    """Find the sample paper that scores highest against ``query``.

    The query is encoded with the module-level ``model``, multiplied against
    the pre-computed ``paper_embeddings``, and the paper at the arg-max index
    is summarized with the module-level ``summarizer``.

    Args:
        query: Text to search for.

    Returns:
        A dict with the keys ``"title"``, ``"abstract"``, ``"summary"`` and
        ``"link"``. On failure, title and abstract are both ``"Error"``, the
        summary holds the error message and the link is empty. Exceptions
        raised during the search are caught and reported this way.
    """

    def _failure(message):
        # Shared shape of the error payload returned to the caller.
        return {"title": "Error", "abstract": "Error", "summary": message, "link": ""}

    # All three start-up artefacts must be present before searching.
    if any(dep is None for dep in (model, summarizer, paper_embeddings)):
        return _failure("Model or embeddings failed to load. Check console.")

    try:
        print(f"Processing query: {query}")
        query_vec = model.encode([query], convert_to_tensor=True)
        # Product of the paper embeddings with the transposed query embedding;
        # the result stays a tensor so torch.argmax can pick the best row.
        scores = torch.matmul(paper_embeddings, query_vec.T)
        best_paper = papers[torch.argmax(scores).item()]
        generated = summarizer(
            best_paper["content"],
            max_length=30,
            min_length=10,
            do_sample=False,
        )
        digest = generated[0]["summary_text"]
        print(f"Found paper: {best_paper['title']}")
        return {
            "title": best_paper["title"],
            "abstract": best_paper["abstract"],
            "summary": digest,
            "link": "https://example.com/paper",
        }
    except Exception as e:
        logger.error(f"Search failed: {e}")
        print(f"Search error: {e}")
        return _failure(str(e))

def search_interface(query):
    """Adapt ``semantic_search`` to the four output boxes of the UI.

    Args:
        query: Text entered by the user; may be ``None`` or empty.

    Returns:
        A 4-tuple ``(status, title, abstract, summary)``. For a missing,
        empty or whitespace-only query the status asks for input and the
        other three fields are ``"Error"``. Otherwise the fields come from
        ``semantic_search`` and the status reports success or the error text.
    """
    # Whitespace-only input carries no query either, so reject it up front
    # instead of running the models on it.
    if not query or not query.strip():
        return "Please enter a search query.", "Error", "Error", "Error"

    result = semantic_search(query)

    # semantic_search marks a failure with the exact title "Error". Compare
    # for equality: a substring test would flag a genuine paper whose title
    # merely contains the word "Error" as a failed search.
    failed = result["title"] == "Error"
    status = f"❌ Error: {result['summary']}" if failed else "✅ Search Complete!"
    return status, result["title"], result["abstract"], result["summary"]

# Gradio UI: a query box with a search button, and four read-only boxes that
# receive the (status, title, abstract, summary) tuple from search_interface.
# NOTE(review): the emoji in these labels were restored from mis-encoded text;
# the glyph on the query and abstract labels (📝) is the presumed original —
# confirm.
with gr.Blocks(title="Semantic Search Engine for Academic Papers") as demo:
    gr.Markdown(
        "# 🔍 Semantic Search Engine for Academic Papers\n"
        "Search for academic papers by entering a research query."
    )
    with gr.Row():
        query_input = gr.Textbox(
            label="📝 Enter Research Query",
            placeholder="e.g., 'machine learning in healthcare'",
        )
        search_btn = gr.Button("🔎 Search")
    with gr.Row():
        status = gr.Textbox(label="✅ Status", interactive=False)
        title_output = gr.Textbox(label="📑 Paper Title", interactive=False)
    with gr.Row():
        abstract_output = gr.Textbox(label="📝 Abstract", interactive=False, lines=3)
        summary_output = gr.Textbox(label="📋 Summary", interactive=False, lines=3)
    # The order of `outputs` must match the tuple returned by search_interface.
    search_btn.click(
        fn=search_interface,
        inputs=query_input,
        outputs=[status, title_output, abstract_output, summary_output],
    )

# Start the web server only when this file is run as a script, so importing
# the module (e.g. from a test or another tool) does not launch it.
if __name__ == "__main__":
    demo.launch()