Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed about 1 month ago

Commit

6065374

verified ·

1 Parent(s): 338d810

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -57

app.py CHANGED Viewed

@@ -11,8 +11,6 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 import re
-import markdown2
-from latex2mathml.converter import convert as latex_to_mathml
 import html
 import json
@@ -23,16 +21,6 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-def convert_latex_to_mathml(text):
-    def replacer(match):
-        try:
-            return f"<math>{latex_to_mathml(match.group(1))}</math>"
-        except Exception:
-            return html.escape(match.group(0))
-    text = re.sub(r'\\\((.*?)\\\)', replacer, text)
-    text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
-    return text
 def clean_page_headers(text):
     lines = text.split("\n")
     cleaned = []
@@ -44,17 +32,17 @@ def clean_page_headers(text):
 def replace_headers_in_text(text, page_headers):
     lines = text.split("\n")
     for level, header in page_headers:
-        tag = f"h{min(level, 6)}"
         pattern = re.compile(re.escape(header.strip()), re.IGNORECASE)
         for idx, line in enumerate(lines):
             if pattern.fullmatch(line.strip()):
-                lines[idx] = f"<{tag}>{html.escape(header.strip())}</{tag}>"
-                break  # only replace first match
         else:
-            lines.insert(0, f"<{tag}>{html.escape(header.strip())}</{tag}>")  # fallback insert
     return "\n".join(lines)
-def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
@@ -64,8 +52,7 @@ def process_pdf_to_html(pdf_file, title, author):
     for level, header, page in toc_entries:
         toc_by_page.setdefault(page, []).append((level, header))
-    all_text = ""
-    cover_img_html = ""
     for i in range(num_pages):
         page_num = i + 1
@@ -134,51 +121,22 @@ def process_pdf_to_html(pdf_file, title, author):
         if page_num in toc_by_page:
             cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
-        mathml_converted = convert_latex_to_mathml(cleaned_text)
-        markdown_converted = markdown2.markdown(mathml_converted)
-        html_page = markdown_converted.replace("\n", "<br>")
-        all_text += f"<div>{html_page}</div>\n"
-        if page_num == 1:
-            cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
-    mathjax_script = """
-    <script type="text/javascript" id="MathJax-script" async
-      src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
-    </script>
-    """
-    full_html = f"""<!DOCTYPE html>
-    <html>
-    <head>
-        <meta charset="utf-8">
-        <title>{html.escape(title)}</title>
-        {mathjax_script}
-    </head>
-    <body>
-        <h1>{html.escape(title)}</h1>
-        <h3>{html.escape(author)}</h3>
-        {cover_img_html}
-        {all_text}
-    </body>
-    </html>
-    """
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
-        tmp.write(full_html)
         return tmp.name
 iface = gr.Interface(
-    fn=process_pdf_to_html,
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Textbox(label="HTML Title"),
         gr.Textbox(label="Author(s)")
     ],
-    outputs=gr.File(label="Download HTML"),
-    title="PDF to HTML Converter with Structure (olmOCR)",
-    description="Extracts text with structure, math, and footnotes using olmOCR and renders to styled HTML.",
     allow_flagging="never"
 )

 from olmocr.prompts.anchor import get_anchor_text
 import re
 import html
 import json
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 def clean_page_headers(text):
     lines = text.split("\n")
     cleaned = []
 def replace_headers_in_text(text, page_headers):
     lines = text.split("\n")
     for level, header in page_headers:
+        prefix = "#" * min(level, 6)
         pattern = re.compile(re.escape(header.strip()), re.IGNORECASE)
         for idx, line in enumerate(lines):
             if pattern.fullmatch(line.strip()):
+                lines[idx] = f"{prefix} {header.strip()}"
+                break
         else:
+            lines.insert(0, f"{prefix} {header.strip()}")
     return "\n".join(lines)
+def process_pdf_to_markdown(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
     for level, header, page in toc_entries:
         toc_by_page.setdefault(page, []).append((level, header))
+    all_text = f"# {title}\n\n**Author(s):** {author}\n\n"
     for i in range(num_pages):
         page_num = i + 1
         if page_num in toc_by_page:
             cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
+        all_text += cleaned_text + "\n\n"
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", dir="/tmp", mode="w", encoding="utf-8") as tmp:
+        tmp.write(all_text)
         return tmp.name
 iface = gr.Interface(
+    fn=process_pdf_to_markdown,
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
+        gr.Textbox(label="Markdown Title"),
         gr.Textbox(label="Author(s)")
     ],
+    outputs=gr.File(label="Download Markdown .txt"),
+    title="PDF to Markdown Converter (for Calibre)",
+    description="Extracts text with structure and outputs it as Markdown in a .txt file compatible with Calibre.",
     allow_flagging="never"
 )