Spaces:

kevansoon
/

tts-endpoint

No application file

App Files Files Community

KevanSoon committed 4 days ago

Commit

ed6290e

1 Parent(s): 688bfaa

added java dockerfile

Browse files

Files changed (4) hide show

Dockerfile +23 -0
__pycache__/app.cpython-310.pyc +0 -0
app.py +0 -995
requirements.txt +0 -100

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+# Stage 1: Build Spring Boot app with Maven
+FROM maven:3.9.6-eclipse-temurin-17 AS build
+WORKDIR /app
+# Copy project files
+COPY pom.xml .
+COPY src ./src
+# Build the Spring Boot JAR (skip tests to save time).
+# -B (batch mode) disables interactive prompts and progress noise in CI/container logs.
+RUN mvn -B clean package -DskipTests
+# Stage 2: Run the app
+# The official `openjdk` image is deprecated; use the Eclipse Temurin JRE instead.
+# A JRE is enough to run the packaged JAR, and it matches the JDK vendor and
+# version used in the build stage.
+FROM eclipse-temurin:17-jre-jammy
+WORKDIR /app
+# Copy the JAR from the build stage
+# (assumes the build leaves exactly one *.jar in target/ — verify against pom.xml)
+COPY --from=build /app/target/*.jar app.jar
+# Hugging Face Spaces requires exposing port 7860
+EXPOSE 7860
+# Run Spring Boot on port 7860
+ENTRYPOINT ["java","-jar","app.jar","--server.port=7860"]

__pycache__/app.cpython-310.pyc DELETED Viewed

Binary file (1.07 kB)

app.py DELETED Viewed

@@ -1,995 +0,0 @@
-# backend.py
-import base64
-import json
-import asyncio
-import re
-import os
-import html
-import requests
-import httpx
-import uuid
-import tempfile
-import io
-import traceback
-import atexit
-import functools
-from queue import Queue
-from threading import Event, Thread
-# beautifulsoup
-from bs4 import BeautifulSoup
-# fastapi
-from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, HTMLResponse
-from fastapi import Depends
-from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
-# pydantic
-from pydantic import BaseModel
-# requests
-from requests.exceptions import RequestException
-# dotenv
-from dotenv import load_dotenv
-# google
-import google.generativeai as genai
-from google.api_core import exceptions as google_exceptions
-# gradio
-from gradio_client import Client, handle_file
-# pillow
-from PIL import Image
-# pytesseract
-import pytesseract
-# from auth.clerk import verify_clerk_jwt
-# --- MODIFIED: Replaced old tool imports with the new one ---
-# from tools.tools import analyze_contract
-#numpy and paddleocr
-import numpy as np
-from paddleocr import PaddleOCR
-# FastAPI application object; routes are registered on it via decorators.
-# NOTE(review): the description string names a "Nemo -> Sea-Lion -> Gemini"
-# pipeline, but the code visible in this file OCRs with Tesseract/PaddleOCR and
-# translates with Gemini (the Sea-Lion helper is commented out) — confirm and
-# update the metadata if it is stale.
-app = FastAPI(
-    title="Document Translator (Final Architecture)",
-    description="Pipeline: Nemo (JSON) -> Sea-Lion (Translate JSON) -> Gemini (HTML)",
-    version="10.0.1",  # Final Architecture, patched
-)
-# Allow cross-origin requests only from the deployed frontend origin.
-# NOTE(review): credentials are enabled together with wildcard methods/headers;
-# check the FastAPI/Starlette CORS documentation for how "*" interacts with
-# allow_credentials=True before widening allow_origins.
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["https://fair-work-contract.vercel.app"],  # single trusted origin
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Bearer-token security scheme; within the code visible here it is referenced
-# only by commented-out endpoints.
-security = HTTPBearer()
-# Load environment variables from a .env file
-load_dotenv()
-# Both values are None when the corresponding variable is not set.
-SUPABASE_URL = os.getenv("SUPABASE_URL")
-SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
-# --- START: NEW ENDPOINT FOR THE REFACTORED TOOL ---
-# @app.post("/api/analyze_contract")
-# async def analyze_contract_endpoint(file: UploadFile = File(...)):
-#     """
-#     Receives an uploaded HTML contract, analyzes it to extract key clauses
-#     and language, and returns a structured JSON response containing a
-#     user-friendly HTML summary sheet.
-#     """
-#     # 1. Validate file type
-#     if file.content_type != "text/html":
-#         raise HTTPException(
-#             status_code=400, detail="Unsupported file type. Please upload a .html file."
-#         )
-#     try:
-#         # 2. Read HTML content from the uploaded file
-#         html_content_bytes = await file.read()
-#         html_content = html_content_bytes.decode("utf-8")
-#         # 3. Call the new, powerful analysis tool
-#         analysis_results = await analyze_contract(html_content)
-#         # 4. Handle potential errors returned from the tool
-#         if "error" in analysis_results:
-#             # Use a 500 server error for tool-side failures
-#             raise HTTPException(status_code=500, detail=analysis_results["error"])
-#         # 5. Return the successful analysis results
-#         # FastAPI will automatically convert the dictionary to a JSON response
-#         return analysis_results
-#     except Exception as e:
-#         # Catch any other unexpected errors during file processing or the API call
-#         raise HTTPException(
-#             status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
-#         )
-# @app.post("/upload")
-# async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
-#     if not authorization.startswith("Bearer "):
-#         raise HTTPException(status_code=401, detail="Missing Bearer token")
-#     token = authorization.split(" ")[1]
-#     claims = await verify_clerk_jwt(token)
-#     user_id = claims.get("sub")  # Clerk user ID
-#     filename = f"{user_id}/{uuid.uuid4()}.png"
-#     # Upload to Supabase Storage
-#     async with httpx.AsyncClient() as client:
-#         upload_resp = await client.post(
-#             f"{SUPABASE_URL}/storage/v1/object/user-documents/{filename}",
-#             headers={
-#                 "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
-#                 "Content-Type": file.content_type,
-#             },
-#             content=await file.read(),
-#         )
-#     if upload_resp.status_code != 200:
-#         raise HTTPException(
-#             status_code=500, detail="Failed to upload to Supabase Storage"
-#         )
-#     file_url = f"user-documents/{filename}"
-#     # Insert metadata to `documents` table
-#     async with httpx.AsyncClient() as client:
-#         insert_resp = await client.post(
-#             f"{SUPABASE_URL}/rest/v1/documents",
-#             headers={
-#                 "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
-#                 "apikey": SUPABASE_SERVICE_ROLE_KEY,
-#                 "Content-Type": "application/json",
-#                 "Prefer": "return=representation",
-#             },
-#             json={
-#                 "user_id": user_id,
-#                 "filename": filename.split("/")[-1],
-#                 "file_url": file_url,
-#             },
-#         )
-#     if insert_resp.status_code >= 300:
-#         raise HTTPException(
-#             status_code=500, detail="Failed to insert document metadata"
-#         )
-#     return {"message": f"File uploaded as {filename}"}
-# @app.get("/api/documents")
-# async def get_user_documents(
-#     credentials: HTTPAuthorizationCredentials = Depends(security),
-# ):
-#     token = credentials.credentials
-#     claims = await verify_clerk_jwt(token)
-#     user_id = claims.get("sub")
-#     if not user_id:
-#         raise HTTPException(status_code=401, detail="Invalid user")
-#     # Step 1: Get documents from Supabase
-#     async with httpx.AsyncClient() as client:
-#         resp = await client.get(
-#             f"{SUPABASE_URL}/rest/v1/documents?user_id=eq.{user_id}",
-#             headers={
-#                 "apikey": SUPABASE_SERVICE_ROLE_KEY,
-#                 "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
-#                 "Accept": "application/json",
-#             },
-#         )
-#     if resp.status_code != 200:
-#         raise HTTPException(status_code=500, detail="Failed to fetch documents")
-#     documents = resp.json()
-#     # Step 2: Get signed URLs for each file
-#     async with httpx.AsyncClient() as client:
-#         for doc in documents:
-#             file_path = doc["file_url"].split("user-documents/", 1)[-1]
-#             if not file_path:
-#                 doc["signed_url"] = None
-#                 continue
-#             signed_url_resp = await client.post(
-#                 f"{SUPABASE_URL}/storage/v1/object/sign/user-documents/{file_path}",
-#                 headers={
-#                     "apikey": SUPABASE_SERVICE_ROLE_KEY,
-#                     "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
-#                     # "Content-Type": "application/json"
-#                 },
-#                 json={"expiresIn": 3600},  # 1 hour
-#             )
-#             if signed_url_resp.status_code == 200:
-#                 print(
-#                     f"{SUPABASE_URL}/storage/v1{signed_url_resp.json().get('signedURL')}"
-#                 )
-#                 doc["signed_url"] = (
-#                     f"{SUPABASE_URL}/storage/v1{signed_url_resp.json().get('signedURL')}"
-#                 )
-#             else:
-#                 doc["signed_url"] = None
-#     print(documents)
-#     return documents
-# --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
-# testing clerk backend authentication
-# @app.post("/upload")
-# async def upload_file(
-#     authorization: str = Header(...),
-#     file: UploadFile = File(...)
-# ):
-#     if not authorization.startswith("Bearer "):
-#         raise HTTPException(status_code=401, detail="Missing Bearer token")
-#     token = authorization.split(" ")[1]
-#     claims = await verify_clerk_jwt(token)
-#     user_id = claims.get("sub")  # Clerk user ID
-#     # ✅ Now the Clerk user is verified
-#     # You can securely store this file, e.g., to Supabase or local
-#     return {"message": f"File uploaded by Clerk user {user_id}"}
-#------------------------ start of gemini workflow ---------------------------------
-# This helper function for calling the Sea-Lion API is now UNUSED in the pipeline,
-# but is kept here as requested.
-# async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
-#     """Helper function to call the translation API for a single piece of text."""
-#     if not text_to_translate.strip():
-#         return ""  # Don't send empty strings for translation
-#     url = "https://api.sea-lion.ai/v1/chat/completions"
-#     api_key = os.getenv("SEALION_API_KEY")
-#     if not api_key:
-#         print("Warning: SEALION_API_KEY not set. Skipping translation.")
-#         return f"{text_to_translate} (Translation Skipped)"
-#     headers = {
-#         "Authorization": f"Bearer {api_key}",
-#         "Content-Type": "application/json",
-#     }
-#     # Precise prompt for clean output
-#     prompt = f'Translate the following text to {lang}. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text_to_translate}"'
-#     payload = {
-#         "max_completion_tokens": 2048,
-#         "messages": [{"role": "user", "content": prompt}],
-#         "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
-#     }
-#     async with httpx.AsyncClient() as client:
-#         try:
-#             response = await client.post(
-#                 url, headers=headers, json=payload, timeout=45.0
-#             )
-#             response.raise_for_status()
-#             response_json = response.json()
-#             translated_text = response_json["choices"][0]["message"]["content"].strip()
-#             # Clean up potential extra quotes that the model might add
-#             return re.sub(r'^"|"$', "", translated_text)
-#         except httpx.RequestError as e:
-#             print(f"Translation request failed: {e}")
-#             return f"Translation Error: {text_to_translate}"
-#         except (KeyError, IndexError) as e:
-#             print(f"Could not parse translation response: {e}")
-#             return f"Translation Parsing Error: {text_to_translate}"
-# # --- NEW GEMINI TRANSLATION FUNCTION ---
-# async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
-#     """
-#     Translates a list of texts using Gemini in a single batch API call.
-#     """
-#     if not texts:
-#         return []
-#     try:
-#         api_key = os.getenv("GEMINI_API_KEY")
-#         if not api_key:
-#             raise ValueError("GEMINI_API_KEY not found in environment variables.")
-#         genai.configure(api_key=api_key)
-#         model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
-#         # Create a single prompt asking for a JSON array response
-#         prompt = f"""
-#         Translate each string in the following JSON array of strings to {target_language}.
-#         Return a single JSON array where each element is the translated string corresponding
-#         to the original at the same index. Your output MUST be only the JSON array and nothing else.
-#         Example Input:
-#         ["Hello world", "How are you?"]
-#         Example Output for target language 'Spanish':
-#         ["Hola mundo", "¿Cómo estás?"]
-#         Input for this task:
-#         {json.dumps(texts)}
-#         """
-#         def do_request():
-#             """Synchronous function to be run in a separate thread."""
-#             response = model.generate_content(prompt)
-#             return response.text.strip()
-#         # Run the synchronous SDK call in a thread to avoid blocking asyncio
-#         response_text = await asyncio.to_thread(do_request)
-#         # Clean the response to ensure it's valid JSON
-#         json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
-#         if not json_response_match:
-#             print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
-#             # Fallback: return original texts if parsing fails
-#             return texts
-#         cleaned_json = json_response_match.group(0)
-#         translated_texts = json.loads(cleaned_json)
-#         if len(translated_texts) != len(texts):
-#             print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
-#             # Fallback in case of length mismatch
-#             return texts
-#         return translated_texts
-#     except Exception as e:
-#         print(f"An error occurred during Gemini translation: {e}")
-#         # Return original texts as a fallback
-#         return texts
-# # --- OCR EXTRACTION FUNCTIONS ---
-# async def get_hocr_from_image(image_bytes: bytes) -> str:
-#     """
-#     Performs OCR using Tesseract to get raw hOCR HTML output.
-#     This function accepts image bytes.
-#     """
-#     if not image_bytes:
-#         raise ValueError("Image bytes cannot be empty.")
-#     try:
-#         image = Image.open(io.BytesIO(image_bytes))
-#     except Exception as e:
-#         raise HTTPException(
-#             status_code=400,
-#             detail=f"Cannot open image for Tesseract. It may be corrupted or unsupported. Error: {e}",
-#         )
-#     # Run Tesseract OCR in a thread to avoid blocking the asyncio event loop
-#     loop = asyncio.get_running_loop()
-#     hocr_bytes = await loop.run_in_executor(
-#         None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
-#     )
-#     return hocr_bytes.decode("utf-8")
-# async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
-#     """
-#     Extracts text and their bounding boxes from an image using PaddleOCR.
-#     Returns the full list of dictionary objects from the OCR tool.
-#     """
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-#         temp_file.write(image_bytes)
-#         temp_filepath = temp_file.name
-#     try:
-#         def do_ocr() -> list[dict]:
-#             """Synchronous function to be run in a separate thread."""
-#             client = Client("kevansoon/PaddleOCR")
-#             # Returns a list of dictionaries, e.g., [{'text': '...', 'box': [...]}]
-#             result = client.predict(
-#                 img=handle_file(temp_filepath),
-#                 lang="en",
-#                 api_name="/predict",
-#             )
-#             return result
-#         loop = asyncio.get_running_loop()
-#         extracted_data = await loop.run_in_executor(None, do_ocr)
-#         if not extracted_data:
-#             print("Warning: PaddleOCR returned no data.")
-#             return []
-#         return extracted_data
-#     finally:
-#         os.unlink(temp_filepath)
-# # --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
-# async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
-#     """
-#     Parses hOCR, translates all text in a single batch call to Gemini,
-#     and injects translations back into the HTML.
-#     """
-#     soup = BeautifulSoup(hocr_html, "html.parser")
-#     elements_to_translate = soup.find_all(class_="ocrx_word")
-#     if not elements_to_translate:
-#         elements_to_translate = soup.find_all(class_="ocr_line")
-#     original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-#     # Translate all texts in one go
-#     translated_texts = await translate_texts_with_gemini(original_texts, target_language)
-#     # Inject translations back
-#     for i, element in enumerate(elements_to_translate):
-#         if element.string:
-#             # Ensure we don't go out of bounds if translation failed
-#             if i < len(translated_texts):
-#                 element.string.replace_with(translated_texts[i])
-#     return str(soup)
-# async def translate_paddle_data_with_gemini(
-#     paddle_data: list[dict], target_language: str
-# ) -> list[dict]:
-#     """
-#     Translates the 'text' field of each item in the paddle_data list
-#     using a single batch call to Gemini.
-#     """
-#     original_texts = [item.get("text", "") for item in paddle_data]
-#     # Translate all texts in one go
-#     translated_texts = await translate_texts_with_gemini(original_texts, target_language)
-#     translated_data = []
-#     for i, item in enumerate(paddle_data):
-#          # Ensure we don't go out of bounds if translation failed
-#         translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
-#         translated_data.append({"text": translated_text, "box": item.get("box")})
-#     return translated_data
-# # --- FINAL HTML GENERATION ---
-# async def generate_html_from_dual_ocr(
-#     translated_hocr_html: str, translated_paddle_data: list[dict]
-# ) -> str:
-#     """
-#     Receives translated hOCR and PaddleOCR data and uses Gemini to generate
-#     a final, layout-aware HTML document.
-#     """
-#     try:
-#         api_key = os.getenv("GEMINI_API_KEY")
-#         if not api_key:
-#             raise ValueError("GEMINI_API_KEY not found in environment variables.")
-#         genai.configure(api_key=api_key)
-#         model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
-#         prompt = f"""
-#                 You are provided with two different translated OCR outputs for the same document.
-#                 Your task is to MERGE them into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.
-#                 Input 1: Translated hOCR HTML
-#                 --- HOCR START ---
-#                 {translated_hocr_html}
-#                 --- HOCR END ---
-#                 Input 2: Translated PaddleOCR data (Python list of dicts with 'text' and 'box'):
-#                 --- PADDLEOCR START ---
-#                 {str(translated_paddle_data)}
-#                 --- PADDLEOCR END ---
-#                 STRICT RULES:
-#                 1. You MUST output ONLY the FINAL RAW HTML code.
-#                 - No ```html, no triple quotes, no markdown, no explanations.
-#                 - Output must begin with <!DOCTYPE html> and end with </html>.
-#                 2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
-#                 - Every PaddleOCR text must appear exactly once in the correct order and location.
-#                 3. The HTML must be fully self-contained:
-#                 - Include <html>, <head>, <style>, and <body>.
-#                 - Include CSS in a <style> block so it renders exactly in an iframe.
-#                 4. Table structure requirement:
-#                 - Use <table>, <tbody>, <tr>, and <td> to organize words into rows and columns.
-#                 - Each PaddleOCR word must be placed in a separate <td> within the correct row based on vertical alignment.
-#                 - Apply CSS for borders, padding, and cell alignment to ensure readability.
-#                 - Use colspan/rowspan where necessary to match the original layout.
-#                 5. Positioning:
-#                 - Use bounding box data to size and place each cell proportionally.
-#                 - Avoid text overlap — if bounding boxes would overlap, adjust table cell spans or widths.
-#                 6. Before outputting:
-#                 - Validate internally that the HTML is valid.
-#                 - Confirm every PaddleOCR text appears in the table.
-#                 - Confirm the table renders correctly in an iframe.
-#                 FINAL OUTPUT REQUIREMENT:
-#                 - Output ONLY the complete, valid HTML — no commentary, no extra text.
-#                 """
-#         def do_request():
-#             """Synchronous function to be run in a separate thread."""
-#             response = model.generate_content(prompt)
-#             return response.text.strip()
-#         return await asyncio.to_thread(do_request)
-#     except Exception as e:
-#         error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
-#         traceback.print_exc()
-#         return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-# @app.post("/api/translate_file_gemini", response_class=HTMLResponse)
-# async def translate_document_dual_ocr(
-#     target_language: str = Form(...), file: UploadFile = File(...)
-# ):
-#     """
-#     Processes a document using a dual OCR pipeline:
-#     1. Tesseract and PaddleOCR extract text and coordinates concurrently.
-#     2. Gemini translates the text from both outputs concurrently using a batch method.
-#     3. Gemini uses both translated outputs to generate the final layout-aware HTML.
-#     """
-#     content_type = file.content_type
-#     if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
-#         raise HTTPException(
-#             status_code=400,
-#             detail="Unsupported file type. Please use PNG, JPG, BMP or TIFF.",
-#         )
-#     try:
-#         await file.seek(0)
-#         image_bytes = await file.read()
-#         if not image_bytes:
-#             raise HTTPException(status_code=400, detail="Uploaded file is empty.")
-#         # === STEP 1: Run both OCR extractions concurrently ===
-#         print(
-#             "***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******"
-#         )
-#         hocr_task = get_hocr_from_image(image_bytes)
-#         paddle_task = extract_text_and_boxes_with_paddle(image_bytes)
-#         hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)
-#         if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
-#             raise HTTPException(
-#                 status_code=400,
-#                 detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
-#             )
-#         print("***** Step 1 Done: Finished OCR extraction ******")
-#         # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
-#         print("***** Step 2: Starting concurrent translation with Gemini ******")
-#         translated_hocr_task = translate_hocr_html_with_gemini(
-#             hocr_html, target_language
-#         )
-#         translated_paddle_task = translate_paddle_data_with_gemini(
-#             paddle_data, target_language
-#         )
-#         translated_hocr, translated_paddle = await asyncio.gather(
-#             translated_hocr_task, translated_paddle_task
-#         )
-#         print("***** Step 2 Done: Finished translation ******")
-#         # === STEP 3: Generate final HTML from both translated outputs ===
-#         print(
-#             "***** Step 3: Generating final HTML from dual OCR data via Gemini ******"
-#         )
-#         final_html = await generate_html_from_dual_ocr(
-#             translated_hocr, translated_paddle
-#         )
-#         print("***** Step 3 Done: Generated final HTML ******")
-#         return HTMLResponse(content=final_html)
-#     except HTTPException:
-#         raise
-#     except Exception as e:
-#         traceback.print_exc()
-#         raise HTTPException(
-#             status_code=500,
-#             detail=f"An unexpected error occurred during processing: {str(e)}",
-#         )
-#-------------------------- end of gemini workflow ----------------------------------
-#-------------------------- start of updated gemini workflow ----------------------------------
-# --- PADDLEOCR LOCAL MODEL MANAGER SETUP (WITH HUGGING FACE SPACES FIX) ---
-# 1. Define the cache directory in a globally writable location like /tmp.
-# This is the key to fixing "Permission Denied" errors in containerized environments.
-CACHE_DIR = "/tmp/paddleocr_cache"
-# 2. Set the environment variable *before* any PaddleOCR functions are called.
-# NOTE(review): confirm that the installed PaddleOCR release actually reads
-# PADDLEOCR_HOME; if it uses a different variable this assignment has no effect.
-os.environ['PADDLEOCR_HOME'] = CACHE_DIR
-# 3. Create the directory when the script starts to ensure it exists.
-os.makedirs(CACHE_DIR, exist_ok=True)
-print(f"✅ PaddleOCR model cache is set to a writable directory: {CACHE_DIR}")
-# Per-language worker counts: one PaddleOCR model instance is created per worker,
-# so these numbers determine how many models are loaded at start-up.
-LANG_CONFIG = {
-    "ch": {"num_workers": 2},
-    "en": {"num_workers": 2},
-    "fr": {"num_workers": 1},
-    "german": {"num_workers": 1},
-    "korean": {"num_workers": 1},
-    "japan": {"num_workers": 1},
-}
-# NOTE(review): equals the sum of num_workers above (8); it is not referenced in
-# the portion of the file visible here — confirm where it is enforced.
-CONCURRENCY_LIMIT = 8
-class PaddleOCRModelManager(object):
-    """Pool of worker threads that each own one OCR model and share a request queue.
-
-    Every worker builds its own model with ``model_factory`` and then serves
-    requests from a common queue by calling ``model.ocr(*args, **kwargs)``.
-    Workers are started one at a time: the constructor waits until a worker has
-    finished building its model before starting the next one.
-
-    Args:
-        num_workers: Number of worker threads (and model instances) to create.
-        model_factory: Zero-argument callable returning an object that exposes
-            an ``ocr(*args, **kwargs)`` method.
-
-    Raises:
-        RuntimeError: If ``model_factory`` raises inside a worker. The original
-            exception is attached as ``__cause__``. Previously this situation
-            left the constructor waiting forever.
-    """
-
-    def __init__(self, num_workers, model_factory):
-        super().__init__()
-        self._model_factory = model_factory
-        self._queue = Queue()
-        self._workers = []
-        self._model_initialized_event = Event()
-        # Set by a worker whose model factory failed; read by __init__ only.
-        self._init_error = None
-        for _ in range(num_workers):
-            # Use daemon threads so they don't block app exit
-            worker = Thread(target=self._worker, daemon=True)
-            worker.start()
-            # Wait for this worker to finish (or fail) building its model.
-            self._model_initialized_event.wait()
-            self._model_initialized_event.clear()
-            if self._init_error is not None:
-                error = self._init_error
-                # Stop the workers that did start before reporting the failure.
-                self.close()
-                raise RuntimeError(
-                    "Failed to initialize PaddleOCR model in worker thread"
-                ) from error
-            self._workers.append(worker)
-
-    def infer(self, *args, **kwargs):
-        """Run ``model.ocr(*args, **kwargs)`` on a worker and return its result.
-
-        Blocks until a worker has processed the request. Any exception raised
-        by ``model.ocr`` is re-raised in the calling thread.
-        """
-        result_queue = Queue(maxsize=1)
-        self._queue.put((args, kwargs, result_queue))
-        success, payload = result_queue.get()
-        if success:
-            return payload
-        else:
-            raise payload
-
-    def close(self):
-        """Ask every worker to exit and wait for them; safe to call repeatedly."""
-        # One ``None`` sentinel per worker: each worker consumes exactly one.
-        for _ in self._workers:
-            self._queue.put(None)
-        for worker in self._workers:
-            worker.join()
-        # Forget joined workers so a second close() does not enqueue stale sentinels.
-        self._workers = []
-
-    def _worker(self):
-        """Thread body: build a model, then serve queued requests until a sentinel."""
-        print("Initializing PaddleOCR model in worker thread...")
-        try:
-            model = self._model_factory()
-        except Exception as e:
-            # Hand the failure to __init__ instead of leaving it blocked on wait().
-            self._init_error = e
-            print(f"PaddleOCR model initialization failed in worker: {e}")
-            return
-        finally:
-            # Always wake the constructor, whether the model was built or not.
-            self._model_initialized_event.set()
-        print("✅ PaddleOCR model initialized in worker.")
-        while True:
-            item = self._queue.get()
-            if item is None:
-                break
-            args, kwargs, result_queue = item
-            try:
-                result = model.ocr(*args, **kwargs)
-                result_queue.put((True, result))
-            except Exception as e:
-                # Ship the exception back so infer() can re-raise it in the caller.
-                result_queue.put((False, e))
-            finally:
-                self._queue.task_done()
-def create_model(lang):
-    """Build a CPU-only PaddleOCR engine with angle classification for ``lang``."""
-    print(f"Creating PaddleOCR model for language: {lang}")
-    # No cache path is passed here; the cache directory is configured globally.
-    ocr_engine = PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False)
-    return ocr_engine
-# --- Initialize Model Managers ---
-# Built eagerly at import time: each PaddleOCRModelManager constructor waits for
-# all of its workers to finish loading their models, so importing this module
-# blocks until every model for every configured language has been created.
-# Maps language code -> PaddleOCRModelManager.
-model_managers = {}
-for lang, config in LANG_CONFIG.items():
-    print(f"Setting up model manager for language: {lang}")
-    # functools.partial binds the language so the manager can call the factory
-    # with no arguments inside each worker thread.
-    model_manager = PaddleOCRModelManager(
-        config["num_workers"], functools.partial(create_model, lang=lang)
-    )
-    model_managers[lang] = model_manager
-def close_model_managers():
-    """Shut down the worker pool of every configured language."""
-    print("Closing all PaddleOCR model managers...")
-    for language in model_managers:
-        model_managers[language].close()
-# Run the shutdown hook when the interpreter exits.
-atexit.register(close_model_managers)
-def local_inference(img_bytes: bytes, lang: str) -> list[dict]:
-    """Run OCR on raw image bytes with the worker pool configured for ``lang``.
-
-    Falls back to the 'en' pool when ``lang`` has no configured manager.
-    Returns a list of ``{"text": ..., "box": ...}`` dictionaries; entries that
-    do not have the expected ``[box, (text, score)]`` shape are skipped.
-    """
-    ocr_manager = model_managers.get(lang)
-    if not ocr_manager:
-        print(f"Warning: Language '{lang}' not configured. Falling back to 'en'.")
-        ocr_manager = model_managers['en']
-    # Decode to RGB and hand the pixel array to the OCR worker.
-    pixels = np.array(Image.open(io.BytesIO(img_bytes)).convert("RGB"))
-    detections = ocr_manager.infer(pixels, cls=True)
-    # A one-element outer list is unwrapped to the list of detected lines.
-    if detections and isinstance(detections, list) and len(detections) == 1:
-        detections = detections[0]
-    if not detections:
-        return []
-    return [
-        {"text": entry[1][0], "box": entry[0]}
-        for entry in detections
-        if isinstance(entry, list)
-        and len(entry) == 2
-        and isinstance(entry[1], tuple)
-        and len(entry[1]) == 2
-    ]
-# --- GEMINI TRANSLATION FUNCTION ---
-async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
-    """Translate a list of texts with Gemini in a single batch API call.
-
-    Args:
-        texts: Strings to translate; order is preserved in the result.
-        target_language: Language name inserted into the prompt.
-
-    Returns:
-        A list with the same length as ``texts``. When ``texts`` is empty or
-        contains only blank strings, a list of empty strings is returned without
-        calling the API. On any failure (missing API key, request error,
-        unparsable or malformed response) the original ``texts`` are returned
-        unchanged.
-    """
-    if not texts or all(not s.strip() for s in texts):
-        return [""] * len(texts)
-    try:
-        api_key = os.getenv("GEMINI_API_KEY")
-        if not api_key:
-            raise ValueError("GEMINI_API_KEY not found in environment variables.")
-        if not genai:
-            raise ImportError("'google.generativeai' library is not available.")
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
-        prompt = f"""
-        Translate each string in the following JSON array of strings to {target_language}.
-        Return a single JSON array where each element is the translated string corresponding
-        to the original at the same index. Your output MUST be only the JSON array and nothing else.
-        Example Input:
-        ["Hello world", "How are you?"]
-        Example Output for target language 'Spanish':
-        ["Hola mundo", "¿Cómo estás?"]
-        Input for this task:
-        {json.dumps(texts)}
-        """
-        def do_request():
-            response = model.generate_content(prompt)
-            return response.text.strip()
-        # The SDK call is synchronous; run it in a thread to keep the event loop free.
-        response_text = await asyncio.to_thread(do_request)
-        # Tolerate extra text (e.g. code fences) around the array: take the span
-        # from the first "[" to the last "]".
-        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
-        if not json_response_match:
-            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
-            return texts
-        cleaned_json = json_response_match.group(0)
-        translated_texts = json.loads(cleaned_json)
-        # Callers insert these values into HTML/text fields, so every element
-        # must be a string; reject null, numbers or nested arrays.
-        if not isinstance(translated_texts, list) or not all(
-            isinstance(item, str) for item in translated_texts
-        ):
-            print("Warning: Gemini returned a JSON array containing non-string items.")
-            return texts
-        if len(translated_texts) != len(texts):
-            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
-            return texts
-        return translated_texts
-    except Exception as e:
-        # Deliberate best-effort: any failure falls back to the untranslated input.
-        print(f"An error occurred during Gemini translation: {e}")
-        return texts
-# --- OCR EXTRACTION FUNCTIONS ---
-async def get_hocr_from_image(image_bytes: bytes) -> str:
-    """Run Tesseract on the image bytes and return the hOCR markup as text.
-
-    Raises ValueError for empty input, and HTTPException (400) when opening the
-    image, running Tesseract or decoding its output fails.
-    """
-    if not image_bytes:
-        raise ValueError("Image bytes cannot be empty.")
-    try:
-        source_image = Image.open(io.BytesIO(image_bytes))
-        # Tesseract is blocking; run it off the event loop.
-        raw_hocr = await asyncio.to_thread(
-            pytesseract.image_to_pdf_or_hocr, source_image, extension="hocr"
-        )
-        hocr_markup = raw_hocr.decode("utf-8")
-        return hocr_markup
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Tesseract OCR failed. Error: {e}")
-async def extract_text_and_boxes_with_paddle(image_bytes: bytes, lang: str = "en") -> list[dict]:
-    """Run the local PaddleOCR pipeline in a thread and return its text/box items.
-
-    Any exception is logged with a traceback and an empty list is returned.
-    """
-    try:
-        # local_inference blocks on the worker pool, so keep it off the event loop.
-        ocr_items = await asyncio.to_thread(local_inference, image_bytes, lang)
-        if not ocr_items:
-            print("Warning: Local PaddleOCR returned no data.")
-        return ocr_items
-    except Exception as e:
-        print(f"An error occurred during local PaddleOCR processing: {e}")
-        traceback.print_exc()
-        return []
-# --- BATCH TRANSLATION FUNCTIONS ---
-async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
-    """Parses hOCR, translates all text, and injects translations back."""
-    soup = BeautifulSoup(hocr_html, "html.parser")
-    elements_to_translate = soup.find_all(class_="ocrx_word")
-    if not elements_to_translate:
-        elements_to_translate = soup.find_all(class_="ocr_line")
-    original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-    if not original_texts:
-        return str(soup)
-    translated_texts = await translate_texts_with_gemini(original_texts, target_language)
-    for i, element in enumerate(elements_to_translate):
-        if element.string and i < len(translated_texts):
-            element.string.replace_with(translated_texts[i])
-    return str(soup)
-async def translate_paddle_data_with_gemini(
-    paddle_data: list[dict], target_language: str
-) -> list[dict]:
-    """Translates the 'text' field of each item in the paddle_data list."""
-    original_texts = [item.get("text", "") for item in paddle_data]
-    if not original_texts:
-        return []
-    translated_texts = await translate_texts_with_gemini(original_texts, target_language)
-    translated_data = []
-    for i, item in enumerate(paddle_data):
-        translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
-        translated_data.append({"text": translated_text, "box": item.get("box")})
-    return translated_data
-# --- FINAL HTML GENERATION ---
-async def generate_html_from_dual_ocr(
-    translated_hocr_html: str, translated_paddle_data: list[dict]
-) -> str:
-    """Uses Gemini to generate a final, layout-aware HTML document.
-
-    Both translated OCR outputs are embedded in a single prompt sent to the
-    ``gemini-1.5-flash`` model. The API key is read from the
-    ``GEMINI_API_KEY`` environment variable.
-
-    Args:
-        translated_hocr_html: Translated hOCR markup, inserted into the prompt
-            as structural guidance.
-        translated_paddle_data: Translated PaddleOCR items, inserted into the
-            prompt via ``str()``.
-
-    Returns:
-        The model's response text with a leading ```html fence and a trailing
-        ``` fence stripped. On any failure (including a missing API key) a
-        small HTML error page with the escaped error message is returned
-        instead; this function does not raise.
-    """
-    try:
-        api_key = os.getenv("GEMINI_API_KEY")
-        if not api_key:
-            raise ValueError("GEMINI_API_KEY not found in environment variables.")
-        if not genai:
-            raise ImportError("'google.generativeai' library is not available.")
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
-        # NOTE: the prompt is runtime text sent to the model; its wording and
-        # whitespace are part of the program's behavior.
-        prompt = f"""
-                You are an expert web developer. Your task is to merge two translated OCR outputs into a single, clean, and well-styled HTML document.
-                Input 1: Translated hOCR HTML (for structural guidance).
-                --- HOCR START ---
-                {translated_hocr_html}
-                --- HOCR END ---
-                Input 2: Translated PaddleOCR data (the source of truth for text and position).
-                --- PADDLEOCR START ---
-                {str(translated_paddle_data)}
-                --- PADDLEOCR END ---
-                STRICT INSTRUCTIONS:
-                1.  **Output Raw HTML Only**: Your output must be a single block of HTML code, starting with `<!DOCTYPE html>` and ending with `</html>`. Do NOT use markdown fences (```html) or add any commentary.
-                2.  **Prioritize PaddleOCR Data**: ALL text from the PaddleOCR input MUST be included. Its bounding boxes are the ground truth for positioning.
-                3.  **Self-Contained HTML**: Embed all CSS in a `<style>` block in the `<head>`.
-                4.  **Layout Reconstruction**: Use absolute positioning for `<span>` or `<div>` elements containing the text. Use the bounding box coordinates from PaddleOCR to set the `top`, `left`, `width`, and `height` CSS properties for each element to reconstruct the original layout.
-                5.  **Coordinate System**: The bounding box format is [[top-left-x, top-left-y], [top-right-x, top-right-y], [bottom-right-x, bottom-right-y], [bottom-left-x, bottom-left-y]]. You MUST use `left: top-left-x`, `top: top-left-y`, `width: top-right-x - top-left-x`, and `height: bottom-left-y - top-left-y`.
-                FINAL OUTPUT: ONLY the complete, valid, self-contained HTML.
-                """
-        def do_request():
-            # Blocking model call; executed in a worker thread below.
-            response = model.generate_content(prompt)
-            # The model may wrap its answer in a markdown fence despite the
-            # instructions: drop a leading ```html (case-insensitive) ...
-            clean_text = re.sub(r'^```html\s*', '', response.text.strip(), flags=re.IGNORECASE)
-            # ... and a trailing ``` fence.
-            clean_text = re.sub(r'\s*```$', '', clean_text)
-            return clean_text
-        # Awaited inside the try block so errors raised in the worker thread
-        # are also converted into the error page below.
-        return await asyncio.to_thread(do_request)
-    except Exception as e:
-        error_message = f"An error occurred during HTML generation with Gemini: {str(e)}"
-        traceback.print_exc()
-        # Escape the message so error text cannot inject markup into the page.
-        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-# --- FASTAPI ENDPOINT ---
-@app.post("/api/translate_file_gemini_local", response_class=HTMLResponse)
-async def translate_document_dual_ocr(
-    target_language: str = Form(...),
-    source_language: str = Form("en"),
-    file: UploadFile = File(...)
-):
-    """
-    Processes a document using a dual OCR pipeline with local PaddleOCR.
-    """
-    content_type = file.content_type
-    if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
-        raise HTTPException(
-            status_code=400,
-            detail="Unsupported file type. Please use PNG, JPG, BMP or TIFF.",
-        )
-    try:
-        image_bytes = await file.read()
-        if not image_bytes:
-            raise HTTPException(status_code=400, detail="Uploaded file is empty.")
-        # === STEP 1: Run both OCR extractions concurrently ===
-        print("***** 1. Starting concurrent OCR (Tesseract & PaddleOCR) *****")
-        hocr_task = get_hocr_from_image(image_bytes)
-        paddle_task = extract_text_and_boxes_with_paddle(image_bytes, lang=source_language)
-        hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)
-        if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
-            raise HTTPException(
-                status_code=400,
-                detail="Neither Tesseract nor PaddleOCR could extract any data.",
-            )
-        print("***** 1. Finished OCR extraction *****")
-        # === STEP 2: Translate both OCR outputs concurrently ===
-        print("***** 2. Starting concurrent translation with Gemini *****")
-        translated_hocr_task = translate_hocr_html_with_gemini(hocr_html, target_language)
-        translated_paddle_task = translate_paddle_data_with_gemini(paddle_data, target_language)
-        translated_hocr, translated_paddle = await asyncio.gather(
-            translated_hocr_task, translated_paddle_task
-        )
-        print("***** 2. Finished translation *****")
-        # === STEP 3: Generate final HTML from both translated outputs ===
-        print("***** 3. Generating final HTML via Gemini *****")
-        final_html = await generate_html_from_dual_ocr(translated_hocr, translated_paddle)
-        print("***** 3. Generated final HTML *****")
-        return HTMLResponse(content=final_html)
-    except HTTPException:
-        raise
-    except Exception as e:
-        traceback.print_exc()
-        raise HTTPException(
-            status_code=500,
-            detail=f"An unexpected error occurred during processing: {str(e)}",
-        )
-# To run this application:
-# 1. Save the code as a Python file (e.g., `main.py`).
-# 2. Make sure you have a `requirements.txt` file with all dependencies.
-# 3. Set your GEMINI_API_KEY environment variable in your Hugging Face Space secrets.
-# 4. Run the command: uvicorn main:app --host 0.0.0.0 --port 7860
-#-------------------------- end of updated gemini workflow ----------------------------------

requirements.txt DELETED Viewed

@@ -1,100 +0,0 @@
-accelerate==1.9.0
-annotated-types==0.7.0
-anyio==4.9.0
-beautifulsoup4==4.13.4
-cachetools==5.5.2
-certifi==2025.7.14
-cffi==1.17.1
-charset-normalizer==3.4.2
-click==8.2.1
-colorama==0.4.6
-cryptography==45.0.5
-dnspython==2.7.0
-dotenv==0.9.9
-ecdsa==0.19.1
-email_validator==2.2.0
-exceptiongroup==1.3.0
-fastapi==0.116.1
-fastapi-cli==0.0.8
-fastapi-cloud-cli==0.1.4
-filelock==3.13.1
-fsspec==2024.6.1
-google-ai-generativelanguage==0.6.15
-google-api-core==2.25.1
-google-api-python-client==2.177.0
-google-auth==2.40.3
-google-auth-httplib2==0.2.0
-google-generativeai==0.8.5
-googleapis-common-protos==1.70.0
-grpcio==1.74.0
-grpcio-status==1.71.2
-h11==0.16.0
-httpcore==1.0.9
-httplib2==0.22.0
-httptools==0.6.4
-httpx==0.28.1
-huggingface-hub==0.34.3
-idna==3.10
-itsdangerous==2.2.0
-Jinja2==3.1.6
-langdetect==1.0.9
-markdown-it-py==3.0.0
-MarkupSafe==2.1.5
-mdurl==0.1.2
-mpmath==1.3.0
-networkx==3.3
-numpy==2.1.2
-orjson==3.11.0
-packaging==25.0
-pillow==11.0.0
-proto-plus==1.26.1
-protobuf==5.29.5
-psutil==7.0.0
-pyasn1==0.6.1
-pyasn1_modules==0.4.2
-pycparser==2.22
-pydantic==2.11.7
-pydantic-extra-types==2.10.5
-pydantic-settings==2.10.1
-pydantic_core==2.33.2
-Pygments==2.19.2
-PyMuPDF==1.26.3
-pyparsing==3.2.3
-python-dotenv==1.1.1
-python-jose==3.5.0
-python-multipart==0.0.20
-PyYAML==6.0.2
-regex==2025.7.31
-requests==2.32.4
-rich==14.0.0
-rich-toolkit==0.14.8
-rignore==0.6.4
-rsa==4.9.1
-safetensors==0.5.3
-sentry-sdk==2.33.2
-shellingham==1.5.4
-six==1.17.0
-sniffio==1.3.1
-soupsieve==2.7
-starlette==0.47.2
-sympy==1.13.3
-tokenizers==0.21.4
-torch==2.7.1
-torchaudio==2.7.1
-torchvision==0.22.1
-tqdm==4.67.1
-transformers==4.54.1
-typer==0.16.0
-typing-inspection==0.4.1
-typing_extensions==4.12.2
-ujson==5.10.0
-uritemplate==4.2.0
-urllib3==2.5.0
-uvicorn==0.35.0
-watchfiles==1.1.0
-websockets==15.0.1
-langextract
-gradio_client
-pytesseract
-paddlepaddle
-paddleocr==2.10.0