Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Gradio web interface for invoice processing system. | |
This UI allows users to upload invoice files (PDF, Excel, Word, or plain text) and download the extracted line items as CSV.
""" | |
import os | |
import sys | |
import csv | |
import tempfile | |
import logging | |
import pandas as pd | |
from pathlib import Path | |
from typing import Dict, List, Optional, Tuple, Union | |
import gradio as gr | |
from gradio_pdf import PDF # Import the enhanced PDF component | |
from dotenv import load_dotenv | |
# Import the invoice processing functionality | |
from process_invoice import process_file, setup_google_client | |
# Read variables from a .env file into the process environment. This runs
# before setup_google_client() is called below.
load_dotenv()

# Root logging configuration: INFO and above, with timestamp, logger name
# and level in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# True when setup_google_client() returned something other than None.
GOOGLE_API_AVAILABLE = setup_google_client() is not None
def convert_to_csv(invoice_data: Dict) -> str:
    """
    Write the line items of an invoice to a temporary CSV file.

    The header row is the sorted union of the keys found across all items,
    so an item that lacks a key simply gets an empty cell in that column.
    When the invoice has no items, the file is left empty (no header row)
    and a warning is logged.

    Args:
        invoice_data: Dictionary containing invoice data; the rows are read
            from its 'items' entry.

    Returns:
        Path to the generated CSV file. The file is not removed by this
        function.
    """
    # mkstemp returns an open descriptor; close it immediately and reopen
    # the file by path when there is something to write.
    handle, csv_path = tempfile.mkstemp(suffix='.csv')
    os.close(handle)

    rows = invoice_data.get('items', [])
    if rows:
        # Collect every key used by any row so no column is dropped.
        columns = set().union(*(row.keys() for row in rows))
        with open(csv_path, 'w', newline='', encoding='utf-8') as out:
            table = csv.DictWriter(out, fieldnames=sorted(columns))
            table.writeheader()
            table.writerows(rows)
        logger.info(f"CSV file created at {csv_path}")
    else:
        logger.warning("No items found in invoice data")

    return csv_path
def process_invoice_file( | |
file_obj: tempfile._TemporaryFileWrapper, | |
use_llm: bool = True | |
) -> Tuple[Dict, str, str, Optional[str], Optional[str]]: | |
""" | |
Process an uploaded invoice file and return the results. | |
Args: | |
file_obj: The uploaded file object | |
use_llm: Whether to use LLM for processing | |
Returns: | |
Tuple containing: | |
- Dictionary of extracted data | |
- HTML table for display | |
- Status message | |
- Path to CSV file (or None if processing failed) | |
- Path to PDF file for display (or None if not a PDF) | |
""" | |
if not file_obj: | |
return {}, "", "No file uploaded", None, None | |
# Get the file extension | |
file_path = file_obj.name | |
file_ext = os.path.splitext(file_path)[1].lower() | |
# Check if file format is supported | |
supported_formats = ['.pdf', '.xlsx', '.xls', '.doc', '.docx', '.txt'] | |
if file_ext not in supported_formats: | |
return {}, "", f"Unsupported file format: {file_ext}. Supported formats: {', '.join(supported_formats)}", None, None | |
# Process the file | |
logger.info(f"Processing file: {file_path}") | |
# Create a temporary directory for JSON output | |
result_dir = Path("result") | |
result_dir.mkdir(exist_ok=True) | |
# For PDF display | |
pdf_path = file_path | |
# If the file is not a PDF, convert it to PDF for display | |
if file_ext != '.pdf': | |
temp_pdf = None | |
try: | |
if file_ext in ['.xlsx', '.xls']: | |
from src.excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx | |
if file_ext == '.xls': | |
xlsx_path = convert_xls_to_xlsx(file_path, tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx').name) | |
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name | |
pdf_path = excel_to_pdf(xlsx_path, pdf_path=temp_pdf) | |
else: | |
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name | |
pdf_path = excel_to_pdf(file_path, pdf_path=temp_pdf) | |
elif file_ext in ['.doc', '.docx']: | |
from src.docx_to_pdf import docx_to_pdf | |
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name | |
pdf_path = docx_to_pdf(file_path, temp_pdf) | |
elif file_ext == '.txt': | |
from src.txt_to_pdf import txt_to_pdf | |
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name | |
pdf_path = txt_to_pdf(file_path, temp_pdf) | |
logger.info(f"Converted {file_ext} file to PDF: {pdf_path}") | |
except Exception as e: | |
logger.error(f"Error converting file to PDF: {str(e)}") | |
pdf_path = None | |
json_path = process_file(file_path) | |
# Try to read the JSON file that was created | |
if os.path.exists(json_path): | |
import json | |
with open(json_path, 'r', encoding='utf-8') as f: | |
invoice_data = json.load(f) | |
else: | |
return {}, "", "Failed to process file. No output data found.", None, pdf_path | |
# Create a DataFrame for display | |
items = invoice_data.get('items', []) | |
if 'error' in invoice_data and invoice_data['error']: | |
html_table = f"<p class='error' style='color: red; font-weight: bold;'>{invoice_data['error']}</p>" | |
status = f"Error: {invoice_data['error']}" | |
# Still create CSV with any available items | |
csv_path = convert_to_csv(invoice_data) | |
return invoice_data, html_table, status, csv_path, pdf_path | |
elif items: | |
df = pd.DataFrame(items) | |
html_table = df.to_html(classes='table table-striped') | |
status = f"Successfully processed {len(items)} items from {os.path.basename(file_path)}" | |
# Convert to CSV | |
csv_path = convert_to_csv(invoice_data) | |
else: | |
html_table = "<p>No items found in the invoice</p>" | |
status = "No items extracted from the file" | |
# Create empty CSV | |
csv_path = convert_to_csv({"items": []}) | |
return invoice_data, html_table, status, csv_path, pdf_path | |
def create_ui() -> gr.Blocks:
    """
    Build the Gradio interface for the invoice processor.

    Layout: a narrow left column with the upload widget, the process
    button, a status box and the CSV download; a wider right column with
    two tabs, one for the extracted data as HTML and one for the PDF view.
    If an "examples" directory exists and holds PDF or XLSX files, they
    are offered as clickable examples for the upload widget.

    Returns:
        The assembled gr.Blocks application (not yet launched).
    """
    with gr.Blocks(title="Invoice Processing System") as demo:
        gr.Markdown("# Invoice Processing System")
        gr.Markdown("Upload an invoice file (PDF, Excel, Word, or Text) to extract and download the data as CSV.")

        with gr.Row():
            # Left: input and controls.
            with gr.Column(scale=1):
                upload = gr.File(label="Upload Invoice File")
                run_button = gr.Button("Process Invoice", variant="primary")
                status_box = gr.Textbox(label="Status", interactive=False)
                csv_download = gr.File(label="Download CSV", interactive=False)

            # Right: results, split over two tabs.
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("Extracted Data"):
                        table_view = gr.HTML(label="Extracted Data")
                    with gr.TabItem("PDF View"):
                        # PDF is the viewer component from gradio_pdf.
                        pdf_view = PDF(label="Invoice PDF", interactive=False)

        # process_invoice_file returns five values; the first (the raw
        # data dictionary) goes into a State that has no visible widget.
        invoice_state = gr.State()
        run_button.click(
            fn=process_invoice_file,
            inputs=[upload],
            outputs=[invoice_state, table_view, status_box, csv_download, pdf_view],
        )

        # Offer bundled sample files, PDFs first, then workbooks.
        samples_dir = Path("examples")
        if samples_dir.exists():
            sample_files = [*samples_dir.glob("*.pdf"), *samples_dir.glob("*.xlsx")]
            if sample_files:
                gr.Examples(
                    examples=[[str(sample)] for sample in sample_files],
                    inputs=[upload],
                )

    return demo
def main():
    """Build the UI and start the Gradio server."""
    launch_options = {
        "server_name": "0.0.0.0",  # Make accessible from other computers
        "share": True,  # Create a public link
        "inbrowser": True,  # Open in browser
    }
    create_ui().launch(**launch_options)


if __name__ == "__main__":
    main()