Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,22 @@ from PIL import Image
|
|
8 |
import fitz # PyMuPDF
|
9 |
import numpy as np
|
10 |
from transformers import NougatProcessor, VisionEncoderDecoderModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Set environment variables
|
13 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
@@ -90,6 +106,20 @@ def extract_text_from_pdf(pdf_bytes):
|
|
90 |
return default_paper_content
|
91 |
|
92 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
# Load Nougat model
|
94 |
processor, model = load_nougat_model()
|
95 |
|
|
|
8 |
import fitz # PyMuPDF
|
9 |
import numpy as np
|
10 |
from transformers import NougatProcessor, VisionEncoderDecoderModel
|
11 |
+
import nltk
|
12 |
+
import ssl
|
13 |
+
|
14 |
+
# Initialize NLTK.
# Point the ssl module's default HTTPS context factory at the unverified
# factory, when this Python build provides one.
# NOTE(review): this turns off certificate verification for every user of
# ssl._create_default_https_context in the process, not just NLTK --
# confirm this is intended.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK data this app needs: fetch the punkt tokenizer only
# when the lookup reports that it is not already available.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
|
27 |
|
28 |
# Set environment variables
|
29 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
106 |
return default_paper_content
|
107 |
|
108 |
try:
|
109 |
+
# Ensure NLTK and its punkt tokenizer data are available
|
110 |
+
try:
|
111 |
+
import nltk
|
112 |
+
try:
|
113 |
+
nltk.data.find('tokenizers/punkt')
|
114 |
+
except LookupError:
|
115 |
+
nltk.download('punkt')
|
116 |
+
except ImportError:
|
117 |
+
print("Installing NLTK...")
|
118 |
+
import subprocess
|
119 |
+
subprocess.check_call(["pip", "install", "nltk", "python-Levenshtein"])
|
120 |
+
import nltk
|
121 |
+
nltk.download('punkt')
|
122 |
+
|
123 |
# Load Nougat model
|
124 |
processor, model = load_nougat_model()
|
125 |
|