Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -59,7 +59,8 @@ async def index(files: List[UploadFile] = File(...)):
|
|
59 |
content = await file.read()
|
60 |
pdf_image_list = convert_from_bytes(content)
|
61 |
images.extend(pdf_image_list)
|
62 |
-
|
|
|
63 |
dataloader = DataLoader(
|
64 |
images,
|
65 |
batch_size=4,
|
@@ -130,11 +131,9 @@ async def search(query: str, k: int = 1):
|
|
130 |
|
131 |
@app.post("/search_by_cv")
|
132 |
async def search_by_cv(file: UploadFile = File(...), k: int = 10):
|
133 |
-
# Lire le fichier PDF uploadé
|
134 |
content = await file.read()
|
135 |
pdf_image_list = convert_from_bytes(content)
|
136 |
|
137 |
-
# Générer les embeddings pour les pages du PDF uploadé
|
138 |
qs = []
|
139 |
dataloader = DataLoader(
|
140 |
pdf_image_list,
|
@@ -148,14 +147,11 @@ async def search_by_cv(file: UploadFile = File(...), k: int = 10):
|
|
148 |
embeddings_query = model(**batch_query)
|
149 |
qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
|
150 |
|
151 |
-
# Comparer les embeddings du CV uploadé avec ceux déjà indexés
|
152 |
retriever_evaluator = CustomEvaluator(is_multi_vector=True)
|
153 |
scores = retriever_evaluator.evaluate(qs, ds)
|
154 |
|
155 |
-
# Trouver les indices des résultats les plus pertinents
|
156 |
top_k_indices = scores.argsort(axis=1)[0][-k-1:-1][::-1]
|
157 |
|
158 |
-
# Préparer les résultats sous forme d'images
|
159 |
results = []
|
160 |
for idx in top_k_indices:
|
161 |
img_byte_arr = BytesIO()
|
@@ -163,10 +159,8 @@ async def search_by_cv(file: UploadFile = File(...), k: int = 10):
|
|
163 |
img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
|
164 |
results.append({"image": img_base64, "page": f"Page {idx}"})
|
165 |
|
166 |
-
# Générer le PDF des résultats
|
167 |
pdf_buffer = generate_pdf(results)
|
168 |
|
169 |
-
# Utiliser StreamingResponse pour renvoyer le fichier PDF généré
|
170 |
response = StreamingResponse(pdf_buffer, media_type='application/pdf')
|
171 |
response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'
|
172 |
|
|
|
59 |
content = await file.read()
|
60 |
pdf_image_list = convert_from_bytes(content)
|
61 |
images.extend(pdf_image_list)
|
62 |
+
|
63 |
+
# Create embeddings for each file and load in memory storage
|
64 |
dataloader = DataLoader(
|
65 |
images,
|
66 |
batch_size=4,
|
|
|
131 |
|
132 |
@app.post("/search_by_cv")
|
133 |
async def search_by_cv(file: UploadFile = File(...), k: int = 10):
|
|
|
134 |
content = await file.read()
|
135 |
pdf_image_list = convert_from_bytes(content)
|
136 |
|
|
|
137 |
qs = []
|
138 |
dataloader = DataLoader(
|
139 |
pdf_image_list,
|
|
|
147 |
embeddings_query = model(**batch_query)
|
148 |
qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
|
149 |
|
|
|
150 |
retriever_evaluator = CustomEvaluator(is_multi_vector=True)
|
151 |
scores = retriever_evaluator.evaluate(qs, ds)
|
152 |
|
|
|
153 |
top_k_indices = scores.argsort(axis=1)[0][-k-1:-1][::-1]
|
154 |
|
|
|
155 |
results = []
|
156 |
for idx in top_k_indices:
|
157 |
img_byte_arr = BytesIO()
|
|
|
159 |
img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
|
160 |
results.append({"image": img_base64, "page": f"Page {idx}"})
|
161 |
|
|
|
162 |
pdf_buffer = generate_pdf(results)
|
163 |
|
|
|
164 |
response = StreamingResponse(pdf_buffer, media_type='application/pdf')
|
165 |
response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'
|
166 |
|