Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,8 +11,6 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
|
|
11 |
from olmocr.prompts.anchor import get_anchor_text
|
12 |
|
13 |
import re
|
14 |
-
import markdown2
|
15 |
-
from latex2mathml.converter import convert as latex_to_mathml
|
16 |
import html
|
17 |
import json
|
18 |
|
@@ -23,16 +21,6 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
|
23 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
24 |
model.to(device)
|
25 |
|
26 |
-
def convert_latex_to_mathml(text):
|
27 |
-
def replacer(match):
|
28 |
-
try:
|
29 |
-
return f"<math>{latex_to_mathml(match.group(1))}</math>"
|
30 |
-
except Exception:
|
31 |
-
return html.escape(match.group(0))
|
32 |
-
text = re.sub(r'\\\((.*?)\\\)', replacer, text)
|
33 |
-
text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
|
34 |
-
return text
|
35 |
-
|
36 |
def clean_page_headers(text):
|
37 |
lines = text.split("\n")
|
38 |
cleaned = []
|
@@ -44,17 +32,17 @@ def clean_page_headers(text):
|
|
44 |
def replace_headers_in_text(text, page_headers):
|
45 |
lines = text.split("\n")
|
46 |
for level, header in page_headers:
|
47 |
-
|
48 |
pattern = re.compile(re.escape(header.strip()), re.IGNORECASE)
|
49 |
for idx, line in enumerate(lines):
|
50 |
if pattern.fullmatch(line.strip()):
|
51 |
-
lines[idx] = f"
|
52 |
-
break
|
53 |
else:
|
54 |
-
lines.insert(0, f"
|
55 |
return "\n".join(lines)
|
56 |
|
57 |
-
def process_pdf_to_html(pdf_file, title, author):
|
58 |
pdf_path = pdf_file.name
|
59 |
doc = fitz.open(pdf_path)
|
60 |
num_pages = len(doc)
|
@@ -64,8 +52,7 @@ def process_pdf_to_html(pdf_file, title, author):
|
|
64 |
for level, header, page in toc_entries:
|
65 |
toc_by_page.setdefault(page, []).append((level, header))
|
66 |
|
67 |
-
all_text = ""
|
68 |
-
cover_img_html = ""
|
69 |
|
70 |
for i in range(num_pages):
|
71 |
page_num = i + 1
|
@@ -134,51 +121,22 @@ def process_pdf_to_html(pdf_file, title, author):
|
|
134 |
if page_num in toc_by_page:
|
135 |
cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
all_text += f"<div>{html_page}</div>\n"
|
142 |
-
|
143 |
-
if page_num == 1:
|
144 |
-
cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
|
145 |
-
|
146 |
-
mathjax_script = """
|
147 |
-
<script type="text/javascript" id="MathJax-script" async
|
148 |
-
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|
149 |
-
</script>
|
150 |
-
"""
|
151 |
-
|
152 |
-
full_html = f"""<!DOCTYPE html>
|
153 |
-
<html>
|
154 |
-
<head>
|
155 |
-
<meta charset="utf-8">
|
156 |
-
<title>{html.escape(title)}</title>
|
157 |
-
{mathjax_script}
|
158 |
-
</head>
|
159 |
-
<body>
|
160 |
-
<h1>{html.escape(title)}</h1>
|
161 |
-
<h3>{html.escape(author)}</h3>
|
162 |
-
{cover_img_html}
|
163 |
-
{all_text}
|
164 |
-
</body>
|
165 |
-
</html>
|
166 |
-
"""
|
167 |
-
|
168 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
|
169 |
-
tmp.write(full_html)
|
170 |
return tmp.name
|
171 |
|
172 |
iface = gr.Interface(
|
173 |
-
fn=process_pdf_to_html,
|
174 |
inputs=[
|
175 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
176 |
-
gr.Textbox(label="
|
177 |
gr.Textbox(label="Author(s)")
|
178 |
],
|
179 |
-
outputs=gr.File(label="Download
|
180 |
-
title="PDF to
|
181 |
-
description="Extracts text with structure
|
182 |
allow_flagging="never"
|
183 |
)
|
184 |
|
|
|
11 |
from olmocr.prompts.anchor import get_anchor_text
|
12 |
|
13 |
import re
|
|
|
|
|
14 |
import html
|
15 |
import json
|
16 |
|
|
|
21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
22 |
model.to(device)
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def clean_page_headers(text):
|
25 |
lines = text.split("\n")
|
26 |
cleaned = []
|
|
|
32 |
def replace_headers_in_text(text, page_headers):
|
33 |
lines = text.split("\n")
|
34 |
for level, header in page_headers:
|
35 |
+
prefix = "#" * min(level, 6)
|
36 |
pattern = re.compile(re.escape(header.strip()), re.IGNORECASE)
|
37 |
for idx, line in enumerate(lines):
|
38 |
if pattern.fullmatch(line.strip()):
|
39 |
+
lines[idx] = f"{prefix} {header.strip()}"
|
40 |
+
break
|
41 |
else:
|
42 |
+
lines.insert(0, f"{prefix} {header.strip()}")
|
43 |
return "\n".join(lines)
|
44 |
|
45 |
+
def process_pdf_to_markdown(pdf_file, title, author):
|
46 |
pdf_path = pdf_file.name
|
47 |
doc = fitz.open(pdf_path)
|
48 |
num_pages = len(doc)
|
|
|
52 |
for level, header, page in toc_entries:
|
53 |
toc_by_page.setdefault(page, []).append((level, header))
|
54 |
|
55 |
+
all_text = f"# {title}\n\n**Author(s):** {author}\n\n"
|
|
|
56 |
|
57 |
for i in range(num_pages):
|
58 |
page_num = i + 1
|
|
|
121 |
if page_num in toc_by_page:
|
122 |
cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
|
123 |
|
124 |
+
all_text += cleaned_text + "\n\n"
|
125 |
+
|
126 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", dir="/tmp", mode="w", encoding="utf-8") as tmp:
|
127 |
+
tmp.write(all_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
return tmp.name
|
129 |
|
130 |
iface = gr.Interface(
|
131 |
+
fn=process_pdf_to_markdown,
|
132 |
inputs=[
|
133 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
134 |
+
gr.Textbox(label="Markdown Title"),
|
135 |
gr.Textbox(label="Author(s)")
|
136 |
],
|
137 |
+
outputs=gr.File(label="Download Markdown .txt"),
|
138 |
+
title="PDF to Markdown Converter (for Calibre)",
|
139 |
+
description="Extracts text with structure and outputs it as Markdown in a .txt file compatible with Calibre.",
|
140 |
allow_flagging="never"
|
141 |
)
|
142 |
|