Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,572 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
2 |
+
import gradio as gr
|
3 |
+
from PyPDF2 import PdfReader
|
4 |
+
import docx
|
5 |
+
import os
|
6 |
+
import re
|
7 |
+
import torch
|
8 |
+
from datetime import datetime
|
9 |
+
import pytz
|
10 |
+
from io import BytesIO
|
11 |
+
from docx import Document
|
12 |
+
import tempfile
|
13 |
+
|
14 |
+
# Load translation model
|
15 |
+
def load_translation_model():
    """Load the NLLB-200 (distilled, 600M) tokenizer and seq2seq model.

    Returns:
        A ``(tokenizer, model)`` tuple. If loading raises, the error is
        printed and ``(None, None)`` is returned instead of propagating.
    """
    checkpoint = "facebook/nllb-200-distilled-600M"
    try:
        nllb_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        nllb_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    except Exception as exc:
        # Best-effort load: report the failure and let the caller decide.
        print(f"Error loading model: {str(exc)}")
        return None, None
    return nllb_tokenizer, nllb_model
|
24 |
+
|
25 |
+
# Initialize models
|
26 |
+
# Load the model once at import time; MODELS stays None when loading failed,
# which translate_text reports as a translation error.
tokenizer, model = load_translation_model()
if tokenizer and model:
    MODELS = {"nllb": (tokenizer, model)}
else:
    MODELS = None
|
28 |
+
|
29 |
+
# Extract text from documents
|
30 |
+
def extract_text(file):
    """Return the stripped text content of a PDF, DOCX, or TXT document.

    Args:
        file: Either a filesystem path (``str``) or an open file-like object
            exposing ``.name`` and ``.read()``. The extension of the path (or
            of ``file.name``) selects the parser.

    Returns:
        The extracted text with leading and trailing whitespace removed.

    Raises:
        Exception: On any failure, including an unsupported extension. The
            message is prefixed with ``"Error extracting text: "`` and the
            original exception is chained as ``__cause__``.
    """
    try:
        is_path = isinstance(file, str)
        source_name = file if is_path else file.name
        ext = os.path.splitext(source_name)[1].lower()

        if ext == ".pdf":
            try:
                if is_path:
                    with open(file, 'rb') as f:
                        raw = f.read()
                else:
                    raw = file.read()

                # The context manager closes the buffer on every exit path.
                with BytesIO(raw) as buffer:
                    reader = PdfReader(buffer)
                    # Defensive: treat a missing (None) page text as empty
                    # instead of failing on `None + "\n"`.
                    page_texts = [
                        (page.extract_text() or "") + "\n"
                        for page in reader.pages
                    ]
                return "".join(page_texts).strip()
            except Exception as e:
                raise Exception(f"PDF extraction error: {str(e)}") from e

        if ext == ".docx":
            # docx.Document receives the path or the file object unchanged.
            doc = docx.Document(file)
            return "".join(para.text + "\n" for para in doc.paragraphs).strip()

        if ext == ".txt":
            if is_path:
                with open(file, 'r', encoding='utf-8') as f:
                    return f.read().strip()
            data = file.read()
            # A file object opened in text mode already yields str; only
            # binary content needs decoding.
            if isinstance(data, (bytes, bytearray)):
                data = data.decode("utf-8")
            return data.strip()

        raise ValueError("Unsupported file format")
    except Exception as e:
        raise Exception(f"Error extracting text: {str(e)}") from e
|
78 |
+
|
79 |
+
# Preprocess idioms
|
80 |
+
# Idiom tables keyed by (source code, target code). Keys are lower-case
# English phrases; values are the replacement inserted before translation.
_IDIOM_MAPS = {
    ("en", "hi"): {
        "no piece of cake": "कोई आसान काम नहीं",
        "piece of cake": "बहुत आसान काम",
        "bite the bullet": "दांतों तले उंगली दबाना",
        "tackle it head-on": "सीधे मुकाबला करना",
        "fell into place": "सब कुछ ठीक हो गया",
        "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
        "with a little perseverance": "थोड़े से धैर्य से",

        # Additional common idioms
        "break a leg": "बहुत बहुत शुभकामनाएं",
        "hit the nail on the head": "बिल्कुल सही बात कहना",
        "once in a blue moon": "बहुत कम, कभी-कभार",
        "under the weather": "तबीयत ठीक नहीं",
        "cost an arm and a leg": "बहुत महंगा",
        "beating around the bush": "इधर-उधर की बात करना",
        "call it a day": "काम समाप्त करना",
        "burn the midnight oil": "रात-रात भर जागकर काम करना",
        "get the ball rolling": "शुरुआत करना",
        "pull yourself together": "खुद को संभालो",
        "shoot yourself in the foot": "अपना ही नुकसान करना",
        "take it with a grain of salt": "संदेह से लेना",
        "the last straw": "सहनशीलता की आखिरी सीमा",
        "time flies": "समय पंख लगाकर उड़ता है",
        "wrap your head around": "समझने की कोशिश करना",
        "cut corners": "काम में छोटा रास्ता अपनाना",
        "back to square one": "फिर से शुरू से",
        "blessing in disguise": "छिपा हुआ वरदान",
        "cry over spilled milk": "बीती बात पर पछताना",
        "keep your chin up": "हिम्मत रखना",

        # Work-related idioms
        "think outside the box": "नए तरीके से सोचना",
        "raise the bar": "मानक ऊंचा करना",
        "learning curve": "सीखने की प्रक्रिया",
        "up and running": "चालू और कार्यरत",
        "back to the drawing board": "फिर से योजना बनाना",

        # Project-related phrases
        "running into issues": "समस्याओं का सामना करना",
        "iron out the bugs": "खामियां दूर करना",
        "in the pipeline": "विचाराधीन",
        "moving forward": "आगे बढ़ते हुए",
        "touch base": "संपर्क में रहना",

        # Technical phrases
        "user-friendly": "उपयोगकर्ता के अनुकूल",
        "cutting-edge": "अत्याधुनिक",
        "state of the art": "अत्याधुनिक तकनीक",
        "proof of concept": "व्यवहार्यता का प्रमाण",
        "game changer": "खेल बदलने वाला",

        "a blessing in disguise": "छुपा हुआ वरदान",
        "actions speak louder than words": "कर्म शब्दों से अधिक प्रभावी होते हैं",
        "add fuel to the fire": "आग में घी डालना",
        "barking up the wrong tree": "गलत दिशा में प्रयास करना",
        "best of both worlds": "दोनों चीजों का लाभ",
        "cut to the chase": "मुद्दे पर आना",
        "don't judge a book by its cover": "किसी को उसके रूप से मत आंकिए",
        "easy does it": "धीरे-धीरे करो",
        "every cloud has a silver lining": "हर मुश्किल में आशा की किरण होती है",
        "get a taste of your own medicine": "जैसा किया वैसा भुगतो",
        "hit the sack": "सोने जाना",
        "let the cat out of the bag": "राज़ खोल देना",
        "miss the boat": "मौका चूक जाना",
        "no pain no gain": "बिना मेहनत के कुछ नहीं मिलता",
        "on the ball": "सचेत और सतर्क",
        "pull the plug": "काम रोक देना",
        "spill the beans": "राज़ खोलना",
        "the ball is in your court": "अब निर्णय तुम्हारे हाथ में है",
        "through thick and thin": "हर परिस्थिति में",
        "you can't have your cake and eat it too": "दोनों फायदे एक साथ नहीं हो सकते",
    },
    ("en", "mr"): {
        "no piece of cake": "सोपं काम नाही",
        "piece of cake": "अतिशय सोपं काम",
        "bite the bullet": "कठीण निर्णय घेणे",
        "tackle it head-on": "समस्येला थेट सामोरे जाणे",
        "fell into place": "सगळं व्यवस्थित झालं",
        "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
        "with a little perseverance": "थोड्या धीराने",
        "break a leg": "खूप शुभेच्छा",
        "hit the nail on the head": "अगदी बरोबर बोललात",
        "once in a blue moon": "क्वचितच, कधीतरी",
        "under the weather": "तब्येत ठीक नसणे",
        "cost an arm and a leg": "खूप महाग",
        "beating around the bush": "गोल गोल फिरवणे",
        "call it a day": "दिवसाचं काम संपवणे",
        "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
        "get the ball rolling": "सुरुवात करणे",
        "pull yourself together": "स्वतःला सावरा",
        "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
        "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
        "the last straw": "सहनशक्तीची शेवटची मर्यादा",
        "time flies": "वेळ पंख लावून उडतो",
        "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
        "cut corners": "कमी वेळात काम उरकणे",
        "back to square one": "पुन्हा सुरुवातीला",
        "blessing in disguise": "आशीर्वाद लपलेला",
        "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
        "keep your chin up": "धीर धरा",

        # Work-related idioms
        "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
        "raise the bar": "पातळी उंचावणे",
        "learning curve": "शिकण्याची प्रक्रिया",
        "up and running": "सुरू आणि कार्यरत",
        "back to the drawing board": "पुन्हा नव्याने योजना आखणे",

        # Project-related phrases
        "running into issues": "अडचणींना सामोरे जाणे",
        "iron out the bugs": "त्रुटी दूर करणे",
        "in the pipeline": "विचाराधीन",
        "moving forward": "पुढे जाताना",
        "touch base": "संपर्कात राहणे",

        # Technical phrases
        "user-friendly": "वापरकर्त्यास सोयीस्कर",
        "cutting-edge": "अत्याधुनिक",
        "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
        "proof of concept": "संकल्पनेची सिद्धता",
        "game changer": "खेळ बदलणारी गोष्ट",

        "a blessing in disguise": "छुपलेले वरदान",
        "actions speak louder than words": "कृती शब्दांपेक्षा प्रभावी असतात",
        "add fuel to the fire": "आग ला फुंकर घालणे",
        "barking up the wrong tree": "चुकीच्या गोष्टीकडे लक्ष देणे",
        "best of both worlds": "दोनही गोष्टींचा लाभ",
        "cut to the chase": "थेट मुद्द्यावर येणे",
        "don't judge a book by its cover": "फक्त बाह्यरूप पाहून अंदाज लावू नका",
        "easy does it": "हळूहळू करा",
        "every cloud has a silver lining": "प्रत्येक संकटात संधी असते",
        "get a taste of your own medicine": "जसे कराल तसे भराल",
        "hit the sack": "झोपायला जाणे",
        "let the cat out of the bag": "गुपित उघड करणे",
        "miss the boat": "संधी गमावणे",
        "no pain no gain": "कष्टाशिवाय यश नाही",
        "on the ball": "सतर्क असणे",
        "pull the plug": "काम बंद करणे",
        "spill the beans": "गुपित सांगणे",
        "the ball is in your court": "निर्णय तुमच्या हाती आहे",
        "through thick and thin": "संकटसमयीही साथ देणे",
        "you can't have your cake and eat it too": "सगळं काही मिळवता येत नाही",
    },
}


def _compile_idiom_matcher(idiom_map):
    """Return ``(regex, replacements)`` for one idiom table.

    Each idiom gets its own capturing group, so the replacement is selected
    by group index rather than by lower-casing the matched text.
    """
    # Longest first so overlapping phrases prefer the more specific idiom.
    ordered = sorted(idiom_map, key=len, reverse=True)
    alternatives = "|".join(f"({re.escape(idiom)})" for idiom in ordered)
    # The lookarounds stop an idiom from matching inside a longer word,
    # e.g. "on the ball" inside "Upon the ball".
    regex = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", flags=re.IGNORECASE)
    return regex, [idiom_map[idiom] for idiom in ordered]


# Built once at import time instead of on every call.
_IDIOM_MATCHERS = {
    pair: _compile_idiom_matcher(table) for pair, table in _IDIOM_MAPS.items()
}


def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with a target-language rendering.

    Args:
        text: The text to scan.
        src_lang: Two-letter source code; only ``"en"`` has idiom tables.
        tgt_lang: Two-letter target code; ``"hi"`` and ``"mr"`` are supported.

    Returns:
        ``text`` with each whole-phrase, case-insensitive idiom occurrence
        replaced. For any other language pair ``text`` is returned unchanged.
    """
    matcher = _IDIOM_MATCHERS.get((src_lang, tgt_lang))
    if matcher is None:
        return text

    regex, replacements = matcher
    # lastindex is the group of the alternative that matched (1-based).
    return regex.sub(lambda m: replacements[m.lastindex - 1], text)
|
241 |
+
|
242 |
+
# Translation function
|
243 |
+
def translate_text(text, src_lang, tgt_lang):
    """Translate ``text`` between English, Hindi, and Marathi with NLLB.

    Args:
        text: The text to translate.
        src_lang: Source language display name (``"English"``, ``"Hindi"``,
            or ``"Marathi"``).
        tgt_lang: Target language display name, same choices.

    Returns:
        The translated text. ``text`` is returned unchanged when both
        languages are equal. Failures are reported as a string rather than
        raised: ``"Error: Unsupported language combination"`` for unknown
        language names, or ``"Error during translation: ..."`` for anything
        that goes wrong while translating.
    """
    if src_lang == tgt_lang:
        return text

    # Display name -> (NLLB language code, short code for preprocess_idioms).
    # The short code is explicit because slicing the name gives "ma" for
    # Marathi, which never matched the "mr" idiom table.
    languages = {
        "English": ("eng_Latn", "en"),
        "Hindi": ("hin_Deva", "hi"),
        "Marathi": ("mar_Deva", "mr"),
    }
    source = languages.get(src_lang)
    target = languages.get(tgt_lang)
    if not source or not target:
        return "Error: Unsupported language combination"

    src_lang_code, src_short = source
    tgt_lang_code, tgt_short = target

    try:
        # First apply idiom preprocessing
        preprocessed_text = preprocess_idioms(text, src_short, tgt_short)

        if not MODELS:
            raise RuntimeError("translation model is not loaded")
        nllb_tokenizer, nllb_model = MODELS["nllb"]

        # Tag the input with its real source language; otherwise the
        # tokenizer's default source language is used for every input.
        nllb_tokenizer.src_lang = src_lang_code
        # Loop-invariant: the target-language token forced as first output.
        tgt_lang_id = nllb_tokenizer.convert_tokens_to_ids(tgt_lang_code)

        # Split on sentence punctuation (delimiters kept by the capture
        # group) and pack the pieces into chunks of under 450 characters.
        chunks = []
        current_chunk = ""
        for sentence in re.split('([.!?।]+)', preprocessed_text):
            if not sentence.strip():
                continue
            if len(current_chunk) + len(sentence) < 450:
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:
            chunks.append(current_chunk)

        # Translate each chunk
        translated_chunks = []
        for chunk in chunks:
            if not chunk.strip():
                continue
            inputs = nllb_tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            translated = nllb_model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )
            translated_chunks.append(
                nllb_tokenizer.decode(translated[0], skip_special_tokens=True)
            )

        return " ".join(translated_chunks).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
|
298 |
+
|
299 |
+
# Document translation function
|
300 |
+
def _translate_preserving_lines(text, source_lang, target_lang):
    """Translate ``text`` line by line, keeping blank lines and paragraph breaks."""
    translated_paragraphs = []
    for paragraph in text.split('\n\n'):
        translated_lines = [
            # Blank lines are carried over untouched.
            translate_text(line, source_lang, target_lang) if line.strip() else ''
            for line in paragraph.split('\n')
        ]
        translated_paragraphs.append('\n'.join(translated_lines))
    return '\n\n'.join(translated_paragraphs)


def _translate_pdf_pages(file, source_lang, target_lang):
    """Translate every non-empty page of a PDF and join the pages with blank lines."""
    if isinstance(file, str):
        with open(file, 'rb') as f:
            raw = f.read()
    else:
        raw = file.read()

    translated_pages = []
    # The context manager closes the buffer on every exit path.
    with BytesIO(raw) as buffer:
        reader = PdfReader(buffer)
        for page in reader.pages:
            # Defensive: treat a missing (None) page text as empty.
            page_text = page.extract_text() or ""
            if not page_text.strip():
                continue
            translated_pages.append(
                _translate_preserving_lines(page_text, source_lang, target_lang)
            )
    return '\n\n'.join(translated_pages)


def _translate_docx(file, source_lang, target_lang):
    """Build a new DOCX whose paragraphs are translations of those in ``file``.

    Paragraph styles are reused by name, and each original non-blank run's
    formatting is applied to an approximately equal share of the translated
    words.
    """
    doc = Document(file)
    new_doc = Document()

    # Copy styles from original document
    for style in doc.styles:
        if style.name not in new_doc.styles:
            new_doc.styles.add_style(
                style.name,
                style.type,
                True if style.base_style else False
            )

    for para in doc.paragraphs:
        if not para.text.strip():
            # Preserve empty paragraphs
            new_doc.add_paragraph()
            continue

        new_para = new_doc.add_paragraph(style=para.style.name if para.style else None)

        # Collect the text and formatting of every non-blank run.
        runs_buffer = []
        formatting_map = []
        for run in para.runs:
            if not run.text.strip():
                continue
            runs_buffer.append(run.text)
            formatting_map.append({
                'bold': run.bold,
                'italic': run.italic,
                'underline': run.underline,
                'font_size': run.font.size if run.font.size else None,
                'font_name': run.font.name if run.font.name else None,
                'color': run.font.color.rgb if run.font.color and run.font.color.rgb else None
            })

        if not runs_buffer:
            continue

        # Translate the paragraph as a whole, then split the words evenly
        # across the original runs.
        translated_text = translate_text(" ".join(runs_buffer), source_lang, target_lang)
        translated_parts = translated_text.split()
        avg_len = len(translated_parts) // len(formatting_map)

        current_index = 0
        last_run = len(formatting_map) - 1
        for i, format_info in enumerate(formatting_map):
            if i == last_run:
                # Last run gets all remaining text
                end_index = len(translated_parts)
            else:
                end_index = min(current_index + avg_len, len(translated_parts))

            chunk_text = " ".join(translated_parts[current_index:end_index])
            current_index = end_index

            new_run = new_para.add_run(chunk_text + " ")
            new_run.bold = format_info['bold']
            new_run.italic = format_info['italic']
            new_run.underline = format_info['underline']
            if format_info['font_size']:
                new_run.font.size = format_info['font_size']
            if format_info['font_name']:
                new_run.font.name = format_info['font_name']
            if format_info['color']:
                new_run.font.color.rgb = format_info['color']

    return new_doc


def translate_document(file, source_lang, target_lang):
    """Translate an uploaded PDF, DOCX, or TXT document.

    Args:
        file: The uploaded file; must expose ``.name`` (used for the extension
            and the output filename). ``None`` means nothing was uploaded.
        source_lang: Source language display name passed to ``translate_text``.
        target_lang: Target language display name passed to ``translate_text``.

    Returns:
        ``(text, output_path)``. ``text`` is the translated content and
        ``output_path`` is a file in the system temp directory named
        ``translated_<original name>``: ``.txt`` for PDF input, otherwise the
        input's own extension. On any failure the result is
        ``(message, None)``; exceptions are not propagated.
    """
    try:
        if file is None:
            return "Please upload a file", None

        input_ext = os.path.splitext(file.name)[1].lower()
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        # PDF output is written as plain text.
        output_ext = '.txt' if input_ext == '.pdf' else input_ext
        output_path = os.path.join(
            tempfile.gettempdir(), f"translated_{base_name}{output_ext}"
        )

        if input_ext == '.pdf':
            try:
                final_text = _translate_pdf_pages(file, source_lang, target_lang)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(final_text)
                return final_text, output_path
            except Exception as e:
                raise Exception(f"PDF processing error: {str(e)}") from e

        if input_ext == '.docx':
            new_doc = _translate_docx(file, source_lang, target_lang)
            new_doc.save(output_path)
            text_content = "\n".join(
                para.text for para in new_doc.paragraphs if para.text.strip()
            )
            return text_content, output_path

        if input_ext == '.txt':
            input_text = extract_text(file)
            if not input_text:
                return "Could not extract text from the document", None

            final_text = _translate_preserving_lines(input_text, source_lang, target_lang)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(final_text)
            return final_text, output_path

        # Any other extension: extract whatever extract_text supports and
        # translate it in one call.
        input_text = extract_text(file)
        if input_text is None:
            return "Could not extract text from the document", None

        translated_text = translate_text(input_text, source_lang, target_lang)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(translated_text)
        return translated_text, output_path

    except Exception as e:
        return f"Error: {str(e)}", None
|
506 |
+
|
507 |
+
# Direct text translation function
|
508 |
+
def translate_text_direct(text, source_lang, target_lang):
    """Translate text typed directly into the UI.

    Returns the prompt ``"Please enter some text"`` when ``text`` is empty or
    ``None``; otherwise returns the result of ``translate_text``.
    """
    if text:
        return translate_text(text, source_lang, target_lang)
    return "Please enter some text"
|
512 |
+
|
513 |
+
# Get current time in UTC
|
514 |
+
def get_current_time():
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return datetime.now(pytz.UTC).strftime(timestamp_format)
|
517 |
+
|
518 |
+
# Create Gradio interface
|
519 |
+
def create_interface():
    """Build the Gradio UI: a header above document- and text-translation tabs.

    The header timestamp comes from ``get_current_time()`` when this function
    runs, so it shows the build time of the interface.

    Returns:
        The assembled ``gr.Blocks`` app; the caller launches it.
    """
    language_choices = ["English", "Hindi", "Marathi"]

    def language_dropdowns():
        # Fresh source/target dropdown components for one interface.
        return [
            gr.Dropdown(choices=list(language_choices), label="Source Language", value="English"),
            gr.Dropdown(choices=list(language_choices), label="Target Language", value="Hindi"),
        ]

    # Header with timestamp and user info
    banner = gr.Markdown(
        f"""
        # Document Translation Toolkit
        *Current Date and Time (UTC):* {get_current_time()}
        *Current User's Login:* gauravchand
        """
    )

    # Document translation tab
    document_tab = gr.Interface(
        fn=translate_document,
        inputs=[
            gr.File(label="Upload Document (PDF, DOCX, or TXT)"),
            *language_dropdowns(),
        ],
        outputs=[
            gr.Textbox(label="Translation", lines=10),
            gr.File(label="Download Translation"),
        ],
        title="Document Translation",
        description="Upload a document to translate",
    )

    # Text translation tab
    text_tab = gr.Interface(
        fn=translate_text_direct,
        inputs=[
            gr.Textbox(lines=5, label="Enter text to translate"),
            *language_dropdowns(),
        ],
        outputs=gr.Textbox(label="Translation", lines=5),
        title="Text Translation",
        description="Enter text directly to translate",
    )

    # Combine the header and both tabs in one Blocks app
    app = gr.Blocks()
    with app:
        banner.render()
        gr.TabbedInterface(
            [document_tab, text_tab],
            tab_names=["Document Translation", "Text Translation"],
        )

    return app
|
568 |
+
|
569 |
+
# Launch the app
|
570 |
+
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    demo = create_interface()
    demo.launch()
|