Spaces:
Sleeping
Sleeping
acharyaaditya26
committed on
Commit
•
aceb54a
1
Parent(s):
ac722ca
changes
Browse files- Dockerfile +31 -0
- app.py +123 -0
- requirements.txt +6 -0
- static/style.css +29 -0
- templates/index.html +15 -0
- templates/result.html +19 -0
Dockerfile
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04

# System packages.
# `apt-get update` runs in the same layer as every install so a cached, stale
# package index can never be paired with a newer install step. The package
# lists are removed afterwards because they are useless at runtime.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3-pip python3-dev \
        ffmpeg libsm6 libxext6 \
        git \
    && rm -rf /var/lib/apt/lists/*

# Build-time Python tooling, installed system-wide (still running as root).
RUN pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir packaging
# NOTE(review): requirements.txt lists an unpinned `numpy` and is installed
# below with --upgrade, which may replace this pinned version - confirm the
# pin is still intended.
RUN pip install --no-cache-dir numpy==1.23.5

# Run the application as an unprivileged user (uid 1000).
RUN useradd -m -u 1000 user
USER user
# Per-user pip installs put their entry points (e.g. uvicorn) here.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy only the requirements first so the dependency layers stay cached when
# just the application code changes.
COPY --chown=user ./requirements.txt requirements.txt

# Install torch first
RUN pip install --no-cache-dir torch==2.1.2

# Now install the rest of the packages
RUN pip install --no-cache-dir --upgrade -r requirements.txt
RUN pip install --no-cache-dir flash_attn

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
31 |
+
|
app.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
2 |
+
from fastapi.responses import HTMLResponse
|
3 |
+
from fastapi.staticfiles import StaticFiles
|
4 |
+
from fastapi.templating import Jinja2Templates
|
5 |
+
import fitz # PyMuPDF
|
6 |
+
from transformers import AutoModel, AutoTokenizer
|
7 |
+
from PIL import Image
|
8 |
+
import numpy as np
|
9 |
+
import os
|
10 |
+
import base64
|
11 |
+
import io
|
12 |
+
import uuid
|
13 |
+
import tempfile
|
14 |
+
import time
|
15 |
+
import shutil
|
16 |
+
from pathlib import Path
|
17 |
+
import json
|
18 |
+
from starlette.requests import Request
|
19 |
+
|
20 |
+
# ASGI application object that the routes below are registered on.
app = FastAPI()

# Load tokenizer and model once at import time; both come from the same
# checkpoint and need trust_remote_code because the repo ships its own code.
tokenizer = AutoTokenizer.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
    device_map='cuda',
    use_safetensors=True,
)
# Switch to inference mode, then make sure the weights are on the GPU.
model = model.eval()
model = model.cuda()
|
26 |
+
|
27 |
+
# Working directories: uploaded PDFs / page images, and rendered HTML results.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Ensure directories exist. exist_ok=True avoids the check-then-create race
# (two processes starting at once) that an explicit os.path.exists test has.
for folder in (UPLOAD_FOLDER, RESULTS_FOLDER):
    os.makedirs(folder, exist_ok=True)
|
34 |
+
|
35 |
+
def image_to_base64(image):
    """Return *image* encoded as PNG and then as a base64 text string.

    Args:
        image: Object exposing ``save(fp, format=...)`` (a PIL image in this
            file); it is asked to write itself as PNG into an in-memory buffer.

    Returns:
        The base64 encoding of the PNG bytes, decoded to ``str``.
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_bytes = base64.b64encode(png_buffer.getvalue())
    return encoded_bytes.decode()
|
39 |
+
|
40 |
+
def pdf_to_images(pdf_path):
    """Rasterize every page of a PDF into a PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one ``PIL.Image`` per page, in page order. The list is
        empty when the document has no pages.
    """
    images = []
    # The context manager closes the document (and its file handle) even when
    # rendering a page raises; the original never closed it.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap()
            # frombytes copies the pixel data, so the image stays valid after
            # the document is closed. "RGB" assumes the pixmap has no alpha
            # channel, which is what a plain get_pixmap() call is expected to
            # produce - TODO confirm against the installed PyMuPDF version.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images
|
49 |
+
|
50 |
+
def _remove_if_exists(path):
    """Delete *path*, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def run_GOT(pdf_file):
    """Run the OCR model over every page of a PDF.

    The PDF is copied into ``UPLOAD_FOLDER`` under a random name, each page is
    rasterized to a PNG, and ``model.chat_crop`` is asked to render the page to
    an HTML file in ``RESULTS_FOLDER``, which is then read back. All files
    created here are removed before returning, whether or not processing
    succeeded.

    Args:
        pdf_file: Path of the PDF to process.

    Returns:
        ``(json_text, html)`` on success, where ``json_text`` is a JSON array
        with ``page_number``, ``text`` and ``html`` per page and ``html`` is
        the concatenation of every page's rendered HTML.
        ``("Error: <message>", None)`` if any step fails.
    """
    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    results = []

    try:
        # Copying and rasterizing live inside the try so that an unreadable or
        # corrupt PDF is reported like any other failure and the copy is still
        # removed by the finally clause below.
        shutil.copy(pdf_file, pdf_path)
        images = pdf_to_images(pdf_path)

        for page_number, image in enumerate(images, start=1):
            image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{page_number}.png")
            result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{page_number}.html")

            try:
                image.save(image_path)

                res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)

                # Read the rendered HTML content
                with open(result_path, 'r') as f:
                    html_content = f.read()
            finally:
                # Per-page temporaries are removed even when this page fails,
                # so a mid-document error does not leave files behind.
                _remove_if_exists(image_path)
                _remove_if_exists(result_path)

            results.append({
                "page_number": page_number,
                "text": res,
                "html": html_content
            })
    except Exception as e:
        # Boundary for the web handler: report the failure as text instead of
        # propagating, matching the (message, None) contract callers render.
        return f"Error: {str(e)}", None
    finally:
        _remove_if_exists(pdf_path)

    html_output = "".join(result["html"] for result in results)
    return json.dumps(results, indent=4), html_output
|
89 |
+
|
90 |
+
def cleanup_old_files(max_age_seconds=3600, folders=None):
    """Delete regular files older than *max_age_seconds* from the work folders.

    Args:
        max_age_seconds: Files whose modification time is more than this many
            seconds in the past are removed. Defaults to one hour.
        folders: Iterable of directory paths to scan (top level only). When
            ``None``, ``UPLOAD_FOLDER`` and ``RESULTS_FOLDER`` are used.

    Sub-directories are left untouched, and a file that disappears while the
    scan is running is skipped instead of aborting the whole clean-up.
    """
    if folders is None:
        folders = (UPLOAD_FOLDER, RESULTS_FOLDER)

    cutoff = time.time() - max_age_seconds
    for folder in folders:
        for file_path in Path(folder).glob('*'):
            try:
                # unlink() cannot remove directories, so only plain files are
                # considered.
                if file_path.is_file() and file_path.stat().st_mtime < cutoff:
                    file_path.unlink()
            except FileNotFoundError:
                # Removed by someone else between listing and deletion.
                continue
|
96 |
+
|
97 |
+
# Purge stale upload/result files once, when the module is imported.
cleanup_old_files()

# Expose the ./static directory under the /static URL prefix.
app.mount("/static", app=StaticFiles(directory="static"), name="static")

# Jinja2 environment used by the route handlers; templates live in ./templates.
templates = Jinja2Templates(directory="templates")
|
104 |
+
|
105 |
+
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Render the upload form (``index.html``) for the site root."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
|
108 |
+
|
109 |
+
@app.post("/uploadfile/")
async def upload_file(request: Request, file: UploadFile = File(...)):
    """Accept an uploaded PDF, run OCR on it and render the result page.

    Args:
        request: Incoming request, passed through to the template context.
        file: The uploaded file from the multipart form field ``file``.

    Returns:
        The rendered ``result.html`` template with ``json_output`` and
        ``html_output`` taken from ``run_GOT``.
    """
    # The context manager removes the temporary directory even if writing the
    # upload or running the OCR raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use a fixed name: the client-supplied filename is untrusted (it may
        # be missing, absolute, or contain "..") and must not become part of a
        # filesystem path. run_GOT copies the file under its own name anyway.
        temp_pdf_path = os.path.join(temp_dir, "upload.pdf")
        with open(temp_pdf_path, "wb") as buffer:
            buffer.write(await file.read())

        json_output, html_output = run_GOT(temp_pdf_path)

    context = {
        "request": request,
        "json_output": json_output,
        # run_GOT returns None for the HTML on failure; render an empty frame
        # rather than the literal text "None".
        "html_output": html_output or "",
    }
    return templates.TemplateResponse("result.html", context)
|
120 |
+
|
121 |
+
# Direct execution (python app.py): start a uvicorn server on port 8000,
# listening on all interfaces.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, port=8000, host="0.0.0.0")
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PyMuPDF
|
2 |
+
transformers
|
3 |
+
pillow
|
4 |
+
numpy
|
5 |
+
fastapi
|
6 |
+
uvicorn
|
static/style.css
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* static/style.css */

/* Page-wide typography and outer spacing. */
body {
    font-family: Arial, sans-serif;
    margin: 20px;
}

/* Headings. */
h1, h2 {
    color: #333;
}

/* Upload form. */
form {
    margin-bottom: 20px;
}

/* Result sections: raw model output and rendered HTML. */
#json-output, #html-output {
    margin-bottom: 20px;
}

/* Preformatted output: shaded box that scrolls horizontally on long lines. */
pre {
    background-color: #f4f4f4;
    padding: 10px;
    border-radius: 5px;
    overflow-x: auto;
}

/* Frame holding the rendered result. */
iframe {
    border: 1px solid #ccc;
    border-radius: 5px;
}
|
templates/index.html
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- templates/index.html -->
<!-- Upload form: posts one PDF as multipart form data to /uploadfile/. -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>PDF OCR</title>
    <link href="/static/style.css" rel="stylesheet">
</head>
<body>
    <h1>Upload PDF for OCR</h1>
    <form action="/uploadfile/" enctype="multipart/form-data" method="post">
        <!-- "required" stops an empty submission in the browser; the field name must stay "file". -->
        <input name="file" type="file" accept=".pdf" required>
        <button type="submit">Upload</button>
    </form>
</body>
</html>
|
templates/result.html
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- templates/result.html -->
<!-- Result page: shows the json_output text and the html_output document passed in by the view. -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>OCR Result</title>
    <link href="/static/style.css" rel="stylesheet">
</head>
<body>
    <h1>OCR Result</h1>
    <div id="json-output">
        <h2>GOT Output</h2>
        <pre>{{ json_output }}</pre>
    </div>
    <div id="html-output">
        <h2>Rendered HTML</h2>
        <!-- The framed document is derived from an uploaded file, so it is sandboxed:
             without allow-same-origin it cannot reach this page's origin.
             allow-scripts is kept in case the rendered output relies on scripts - TODO confirm. -->
        <iframe srcdoc="{{ html_output }}" sandbox="allow-scripts" title="Rendered OCR output" width="100%" height="600px"></iframe>
    </div>
</body>
</html>
|