Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -154,11 +154,22 @@ def extract_text_from_pdf(pdf_bytes):
 
             # Print progress
             print(f"Processed page {page_num+1}/{len(doc)}")
+
+            # Check whether the 15000 token limit has already been reached
+            if len(full_text.split()) > 15000:
+                print("Reached 15000 token limit, stopping extraction")
+                break
 
             # Clear GPU memory
             del pixel_values, outputs
             torch.cuda.empty_cache()
 
+        # Make sure the final text does not exceed 15000 tokens
+        words = full_text.split()
+        if len(words) > 15000:
+            full_text = " ".join(words[:15000])
+            print(f"Truncated paper content to 15000 tokens")
+
         return full_text
     except Exception as e:
         import traceback
@@ -233,12 +244,12 @@ Focus on clear, concise, and evidence-based improvements that align with the ove
         add_generation_prompt=True
     )
 
-    # Check input length and truncate to
+    # Check input length and truncate to 16384 tokens before encoding
     input_tokens = tokenizer.encode(text)
-    if len(input_tokens) >
-        input_tokens = input_tokens[:
+    if len(input_tokens) > 16384:  # the model's maximum context length
+        input_tokens = input_tokens[:16384]
         text = tokenizer.decode(input_tokens)
-        print(f"Input truncated to
+        print(f"Input truncated to 16384 tokens")
 
     progress(0.5, desc="Generating improved text...")
     # Generate non-streaming
@@ -250,7 +261,7 @@ Focus on clear, concise, and evidence-based improvements that align with the ove
     with torch.no_grad():
        output_ids = model.generate(
            input_ids,
-           attention_mask=attention_mask,
+           attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=(temperature > 0),
            temperature=temperature if temperature > 0 else 1.0,
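
Taken together, the commit adds two guards: the PDF extraction loop stops once the accumulated text passes 15000 whitespace-separated words (a rough stand-in for tokens), and the assembled prompt is re-encoded and cut down to the model's 16384-token context window before generation. Below is a minimal standalone sketch of both guards, assuming a Hugging Face tokenizer; the gpt2 tokenizer and the helper names are placeholders for illustration, not the Space's actual code.

from transformers import AutoTokenizer

WORD_LIMIT = 15000     # cap on the raw extracted text, counted in whitespace-separated words
CONTEXT_LIMIT = 16384  # model's maximum context length, per the diff


def cap_extracted_text(full_text: str, limit: int = WORD_LIMIT) -> str:
    """Truncate extracted paper text to at most `limit` whitespace-separated words."""
    words = full_text.split()
    if len(words) > limit:
        full_text = " ".join(words[:limit])
        print(f"Truncated paper content to {limit} words")
    return full_text


def truncate_to_context(text: str, tokenizer, limit: int = CONTEXT_LIMIT) -> str:
    """Encode the prompt and drop everything past the model's context window."""
    input_tokens = tokenizer.encode(text)
    if len(input_tokens) > limit:
        input_tokens = input_tokens[:limit]
        text = tokenizer.decode(input_tokens)
        print(f"Input truncated to {limit} tokens")
    return text


if __name__ == "__main__":
    # Illustrative usage only; the Space loads its own model and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    sample = "word " * 20000
    prompt = truncate_to_context(cap_extracted_text(sample), tokenizer)
    print(len(prompt.split()), len(tokenizer.encode(prompt)))

Since whitespace splitting only approximates the tokenizer's segmentation, the 15000-word cap is a coarse early exit; the later token-level truncation is what actually enforces the 16384-token context limit.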