acharyaaditya26 committed on
Commit
aceb54a
1 Parent(s): ac722ca
Files changed (6) hide show
  1. Dockerfile +31 -0
  2. app.py +123 -0
  3. requirements.txt +6 -0
  4. static/style.css +29 -0
  5. templates/index.html +15 -0
  6. templates/result.html +19 -0
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04

# Refresh the package index and install system packages in ONE layer.
# Running `apt-get install` in a layer separate from `apt-get update` lets
# Docker reuse a cached, stale index and fail on packages that have moved.
# The apt lists are removed afterwards to keep the layer small.
RUN apt-get update && apt-get upgrade -y && \
    apt-get install -y python3-pip python3-dev ffmpeg libsm6 libxext6 git && \
    rm -rf /var/lib/apt/lists/*

# Bootstrap pip tooling and pin numpy system-wide before switching user.
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir packaging numpy==1.23.5

# Run as an unprivileged user (uid 1000); user-level pip installs land in
# ~/.local/bin, so put that directory on PATH.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy only the requirements first so the dependency layers stay cached
# when application code changes.
COPY --chown=user ./requirements.txt requirements.txt

# Install torch first
RUN pip install --no-cache-dir torch==2.1.2

# Now install the rest of the packages
RUN pip install --no-cache-dir --upgrade -r requirements.txt
RUN pip install --no-cache-dir flash_attn

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
31
+
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import HTMLResponse
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.templating import Jinja2Templates
5
+ import fitz # PyMuPDF
6
+ from transformers import AutoModel, AutoTokenizer
7
+ from PIL import Image
8
+ import numpy as np
9
+ import os
10
+ import base64
11
+ import io
12
+ import uuid
13
+ import tempfile
14
+ import time
15
+ import shutil
16
+ from pathlib import Path
17
+ import json
18
+ from starlette.requests import Request
19
+
20
+ app = FastAPI()
21
+
22
+ # Load tokenizer and model
23
+ tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
24
+ model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, device_map='cuda', use_safetensors=True)
25
+ model = model.eval().cuda()
26
+
27
# Scratch directories: uploaded PDFs / page images, and rendered OCR output.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Ensure directories exist. ``exist_ok=True`` makes creation idempotent and
# avoids the check-then-create race when several workers start at once.
for folder in (UPLOAD_FOLDER, RESULTS_FOLDER):
    os.makedirs(folder, exist_ok=True)
34
+
35
def image_to_base64(image):
    """Encode *image* as PNG and return the bytes as a base64 text string.

    ``image`` must provide a ``save(fp, format=...)`` method that writes the
    encoded picture into the given binary stream.
    """
    png_stream = io.BytesIO()
    image.save(png_stream, format="PNG")
    encoded = base64.b64encode(png_stream.getvalue())
    return encoded.decode()
39
+
40
def pdf_to_images(pdf_path):
    """Rasterize every page of a PDF into a list of RGB PIL images.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list containing one ``PIL.Image.Image`` per page, in page order.
    """
    images = []
    # Open through the context manager so the document handle is released
    # even when rendering one of the pages raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # alpha=False keeps three bytes per pixel, matching the "RGB"
            # mode handed to Image.frombytes below.
            pix = page.get_pixmap(alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
49
+
50
def run_GOT(pdf_file):
    """Run GOT-OCR on every page of a PDF and collect the per-page output.

    The PDF is copied into ``UPLOAD_FOLDER`` under a fresh UUID name, each
    page is rasterized to a PNG, passed to ``model.chat_crop`` with
    ``render=True``, and the rendered HTML file is read back.

    Args:
        pdf_file: Path of the PDF to process.

    Returns:
        ``(json_text, html_output)`` on success, where ``json_text`` is the
        JSON dump of the per-page results (``page_number``, ``text``,
        ``html``) and ``html_output`` is all page HTML concatenated.
        ``("Error: <message>", None)`` if any step fails. This now also
        covers failures while copying or rasterizing the PDF, which
        previously escaped as an exception and left the copied PDF behind.
    """
    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    results = []

    try:
        shutil.copy(pdf_file, pdf_path)

        for page_number, image in enumerate(pdf_to_images(pdf_path), start=1):
            image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{page_number}.png")
            result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{page_number}.html")

            try:
                image.save(image_path)

                res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)

                # Read the rendered HTML content
                with open(result_path, 'r') as f:
                    html_content = f.read()

                results.append({
                    "page_number": page_number,
                    "text": res,
                    "html": html_content
                })
            finally:
                # Remove this page's scratch files whether or not OCR
                # succeeded, so a failing page does not leak them.
                for scratch_path in (image_path, result_path):
                    if os.path.exists(scratch_path):
                        os.remove(scratch_path)
    except Exception as e:
        return f"Error: {str(e)}", None
    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    html_output = "".join([result["html"] for result in results])
    return json.dumps(results, indent=4), html_output
89
+
90
def cleanup_old_files(folders=None, max_age_seconds=3600):
    """Delete stale regular files from the scratch folders.

    Args:
        folders: Iterable of directory paths to scan. Defaults to
            ``UPLOAD_FOLDER`` and ``RESULTS_FOLDER``.
        max_age_seconds: Files whose modification time is more than this
            many seconds in the past are removed. Defaults to one hour.

    Subdirectories and missing folders are skipped, and a file that
    disappears between listing and deletion is ignored, so a stray entry
    cannot abort the sweep.
    """
    if folders is None:
        folders = (UPLOAD_FOLDER, RESULTS_FOLDER)

    cutoff = time.time() - max_age_seconds
    for folder in folders:
        directory = Path(folder)
        if not directory.is_dir():
            continue
        for file_path in directory.glob('*'):
            try:
                # Only plain files are deleted; unlink() raises on directories.
                if file_path.is_file() and file_path.stat().st_mtime < cutoff:
                    file_path.unlink()
            except FileNotFoundError:
                # Removed by someone else in the meantime; nothing to do.
                continue
96
+
97
+ cleanup_old_files()
98
+
99
+ # Mount static files
100
+ app.mount("/static", StaticFiles(directory="static"), name="static")
101
+
102
+ # Set up Jinja2 templates
103
+ templates = Jinja2Templates(directory="templates")
104
+
105
+ @app.get("/", response_class=HTMLResponse)
106
+ async def read_root(request: Request):
107
+ return templates.TemplateResponse("index.html", {"request": request})
108
+
109
@app.post("/uploadfile/")
async def upload_file(request: Request, file: UploadFile = File(...)):
    """Accept an uploaded PDF, run OCR on it and render the result page.

    Args:
        request: Incoming request, passed to the template context.
        file: The uploaded PDF from the multipart form.

    Returns:
        The rendered ``result.html`` template with ``json_output`` and
        ``html_output`` taken from ``run_GOT``.
    """
    # The context manager removes the temporary directory even when writing
    # the upload or running OCR raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use a fixed file name: the client-supplied ``file.filename`` is
        # untrusted (it may be missing, absolute, or contain ``..``) and
        # must not be joined into a filesystem path.
        temp_pdf_path = os.path.join(temp_dir, "upload.pdf")
        with open(temp_pdf_path, "wb") as buffer:
            buffer.write(await file.read())

        json_output, html_output = run_GOT(temp_pdf_path)

    return templates.TemplateResponse("result.html", {"request": request, "json_output": json_output, "html_output": html_output})
120
+
121
+ if __name__ == "__main__":
122
+ import uvicorn
123
+ uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
PyMuPDF
transformers
pillow
numpy
fastapi
uvicorn
# Needed by FastAPI to parse multipart form uploads (UploadFile / File).
python-multipart
# Needed by fastapi.templating.Jinja2Templates.
jinja2
static/style.css ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* static/style.css */
2
+ body {
3
+ font-family: Arial, sans-serif;
4
+ margin: 20px;
5
+ }
6
+
7
+ h1, h2 {
8
+ color: #333;
9
+ }
10
+
11
+ form {
12
+ margin-bottom: 20px;
13
+ }
14
+
15
+ #json-output, #html-output {
16
+ margin-bottom: 20px;
17
+ }
18
+
19
+ pre {
20
+ background-color: #f4f4f4;
21
+ padding: 10px;
22
+ border-radius: 5px;
23
+ overflow-x: auto;
24
+ }
25
+
26
+ iframe {
27
+ border: 1px solid #ccc;
28
+ border-radius: 5px;
29
+ }
templates/index.html ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- templates/index.html -->
2
+ <!DOCTYPE html>
3
+ <html>
4
+ <head>
5
+ <title>PDF OCR</title>
6
+ <link href="/static/style.css" rel="stylesheet">
7
+ </head>
8
+ <body>
9
+ <h1>Upload PDF for OCR</h1>
10
+ <form action="/uploadfile/" enctype="multipart/form-data" method="post">
11
+ <input name="file" type="file" accept=".pdf">
12
+ <button type="submit">Upload</button>
13
+ </form>
14
+ </body>
15
+ </html>
templates/result.html ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- templates/result.html -->
2
+ <!DOCTYPE html>
3
+ <html>
4
+ <head>
5
+ <title>OCR Result</title>
6
+ <link href="/static/style.css" rel="stylesheet">
7
+ </head>
8
+ <body>
9
+ <h1>OCR Result</h1>
10
+ <div id="json-output">
11
+ <h2>GOT Output</h2>
12
+ <pre>{{ json_output }}</pre>
13
+ </div>
14
+ <div id="html-output">
15
+ <h2>Rendered HTML</h2>
16
+ <iframe srcdoc="{{ html_output }}" width="100%" height="600px"></iframe>
17
+ </div>
18
+ </body>
19
+ </html>