leonarb committed on
Commit
6065374
·
verified ·
1 Parent(s): 338d810

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -57
app.py CHANGED
@@ -11,8 +11,6 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
  import re
14
- import markdown2
15
- from latex2mathml.converter import convert as latex_to_mathml
16
  import html
17
  import json
18
 
@@ -23,16 +21,6 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
  model.to(device)
25
 
26
- def convert_latex_to_mathml(text):
27
- def replacer(match):
28
- try:
29
- return f"<math>{latex_to_mathml(match.group(1))}</math>"
30
- except Exception:
31
- return html.escape(match.group(0))
32
- text = re.sub(r'\\\((.*?)\\\)', replacer, text)
33
- text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
34
- return text
35
-
36
  def clean_page_headers(text):
37
  lines = text.split("\n")
38
  cleaned = []
@@ -44,17 +32,17 @@ def clean_page_headers(text):
44
  def replace_headers_in_text(text, page_headers):
45
  lines = text.split("\n")
46
  for level, header in page_headers:
47
- tag = f"h{min(level, 6)}"
48
  pattern = re.compile(re.escape(header.strip()), re.IGNORECASE)
49
  for idx, line in enumerate(lines):
50
  if pattern.fullmatch(line.strip()):
51
- lines[idx] = f"<{tag}>{html.escape(header.strip())}</{tag}>"
52
- break # only replace first match
53
  else:
54
- lines.insert(0, f"<{tag}>{html.escape(header.strip())}</{tag}>") # fallback insert
55
  return "\n".join(lines)
56
 
57
- def process_pdf_to_html(pdf_file, title, author):
58
  pdf_path = pdf_file.name
59
  doc = fitz.open(pdf_path)
60
  num_pages = len(doc)
@@ -64,8 +52,7 @@ def process_pdf_to_html(pdf_file, title, author):
64
  for level, header, page in toc_entries:
65
  toc_by_page.setdefault(page, []).append((level, header))
66
 
67
- all_text = ""
68
- cover_img_html = ""
69
 
70
  for i in range(num_pages):
71
  page_num = i + 1
@@ -134,51 +121,22 @@ def process_pdf_to_html(pdf_file, title, author):
134
  if page_num in toc_by_page:
135
  cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
136
 
137
- mathml_converted = convert_latex_to_mathml(cleaned_text)
138
- markdown_converted = markdown2.markdown(mathml_converted)
139
- html_page = markdown_converted.replace("\n", "<br>")
140
-
141
- all_text += f"<div>{html_page}</div>\n"
142
-
143
- if page_num == 1:
144
- cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
145
-
146
- mathjax_script = """
147
- <script type="text/javascript" id="MathJax-script" async
148
- src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
149
- </script>
150
- """
151
-
152
- full_html = f"""<!DOCTYPE html>
153
- <html>
154
- <head>
155
- <meta charset="utf-8">
156
- <title>{html.escape(title)}</title>
157
- {mathjax_script}
158
- </head>
159
- <body>
160
- <h1>{html.escape(title)}</h1>
161
- <h3>{html.escape(author)}</h3>
162
- {cover_img_html}
163
- {all_text}
164
- </body>
165
- </html>
166
- """
167
-
168
- with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
169
- tmp.write(full_html)
170
  return tmp.name
171
 
172
  iface = gr.Interface(
173
- fn=process_pdf_to_html,
174
  inputs=[
175
  gr.File(label="Upload PDF", file_types=[".pdf"]),
176
- gr.Textbox(label="HTML Title"),
177
  gr.Textbox(label="Author(s)")
178
  ],
179
- outputs=gr.File(label="Download HTML"),
180
- title="PDF to HTML Converter with Structure (olmOCR)",
181
- description="Extracts text with structure, math, and footnotes using olmOCR and renders to styled HTML.",
182
  allow_flagging="never"
183
  )
184
 
 
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
  import re
 
 
14
  import html
15
  import json
16
 
 
21
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
  model.to(device)
23
 
 
 
 
 
 
 
 
 
 
 
24
  def clean_page_headers(text):
25
  lines = text.split("\n")
26
  cleaned = []
 
32
  def replace_headers_in_text(text, page_headers):
33
  lines = text.split("\n")
34
  for level, header in page_headers:
35
+ prefix = "#" * min(level, 6)
36
  pattern = re.compile(re.escape(header.strip()), re.IGNORECASE)
37
  for idx, line in enumerate(lines):
38
  if pattern.fullmatch(line.strip()):
39
+ lines[idx] = f"{prefix} {header.strip()}"
40
+ break
41
  else:
42
+ lines.insert(0, f"{prefix} {header.strip()}")
43
  return "\n".join(lines)
44
 
45
+ def process_pdf_to_markdown(pdf_file, title, author):
46
  pdf_path = pdf_file.name
47
  doc = fitz.open(pdf_path)
48
  num_pages = len(doc)
 
52
  for level, header, page in toc_entries:
53
  toc_by_page.setdefault(page, []).append((level, header))
54
 
55
+ all_text = f"# {title}\n\n**Author(s):** {author}\n\n"
 
56
 
57
  for i in range(num_pages):
58
  page_num = i + 1
 
121
  if page_num in toc_by_page:
122
  cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
123
 
124
+ all_text += cleaned_text + "\n\n"
125
+
126
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", dir="/tmp", mode="w", encoding="utf-8") as tmp:
127
+ tmp.write(all_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  return tmp.name
129
 
130
  iface = gr.Interface(
131
+ fn=process_pdf_to_markdown,
132
  inputs=[
133
  gr.File(label="Upload PDF", file_types=[".pdf"]),
134
+ gr.Textbox(label="Markdown Title"),
135
  gr.Textbox(label="Author(s)")
136
  ],
137
+ outputs=gr.File(label="Download Markdown .txt"),
138
+ title="PDF to Markdown Converter (for Calibre)",
139
+ description="Extracts text with structure and outputs it as Markdown in a .txt file compatible with Calibre.",
140
  allow_flagging="never"
141
  )
142