Update app.py
app.py CHANGED
@@ -4,6 +4,14 @@ from PyPDF2 import PdfReader
 import docx
 import os
 import re
+from datetime import datetime
+
+# Page config
+st.set_page_config(
+    page_title="Document Translator (NLLB-200)",
+    page_icon="📄",
+    layout="wide"
+)
 
 # Load NLLB model and tokenizer
 @st.cache_resource
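A note on the hunk above: Streamlit requires st.set_page_config() to be the first Streamlit call the script executes, so it has to sit right after the imports as it does here; calling any other st.* function first raises StreamlitAPIException. A minimal sketch of the required ordering (illustrative only, not part of the diff):

import streamlit as st

# set_page_config must run before any other st.* call in the script
st.set_page_config(
    page_title="Document Translator (NLLB-200)",
    page_icon="📄",
    layout="wide"
)
st.title("Document Translator (NLLB-200)")  # safe only after set_page_config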
@@ -19,7 +27,92 @@ def initialize_models():
     tokenizer, model = load_translation_model()
     return {"nllb": (tokenizer, model)}
 
-
+def split_long_sentence(sentence, max_length=200):
+    """Split long sentences into smaller chunks at appropriate break points."""
+    if len(sentence) <= max_length:
+        return [sentence]
+
+    chunks = []
+    current_chunk = ""
+    words = sentence.split()
+
+    for word in words:
+        if len(current_chunk) + len(word) + 1 <= max_length:
+            current_chunk += (" " + word if current_chunk else word)
+        else:
+            chunks.append(current_chunk)
+            current_chunk = word
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+def preprocess_idioms(text, src_lang, tgt_lang):
+    if src_lang == "en" and tgt_lang == "hi":
+        idiom_map = {
+            # Common English-Hindi idiom mappings
+            "no piece of cake": "कोई आसान काम नहीं",
+            "bite the bullet": "दांतों तले उंगली दबाना",
+            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
+            "fell into place": "ठीक हो गया",
+            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
+            "with a little perseverance": "थोड़े से धैर्य से",
+            "break the ice": "बातचीत की शुरुआत करना",
+            "on cloud nine": "सातवें आसमान पर होना",
+            "once in a blue moon": "कभी-कभार",
+            "beating around the bush": "इधर-उधर की बात करना",
+            "burning the midnight oil": "रात-रात भर जागकर काम करना",
+            "calm before the storm": "तूफान से पहले की शांति",
+            "cost an arm and a leg": "बहुत महंगा होना",
+            "blessing in disguise": "छुपा हुआ वरदान",
+            "kill two birds with one stone": "एक पंथ दो काज",
+            "a piece of cake": "बहुत आसान काम",
+            "under the weather": "तबीयत ठीक न होना",
+            "pull yourself together": "खुद को संभालो",
+            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
+            "time flies": "समय पंख लगाकर उड़ता है",
+            "actions speak louder than words": "कथनी से करनी बड़ी",
+            "all ears": "पूरा ध्यान से सुन रहा हूं",
+            "back to square one": "वापस शुरुआत में",
+            "better late than never": "देर आये दुरुस्त आये",
+            "cry over spilled milk": "बीती बात पर पछताना",
+            "down to earth": "सरल स्वभाव का",
+            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
+            "food for thought": "सोचने वाली बात",
+            "give someone the benefit of the doubt": "शक का फायदा देना",
+            "hit the nail on the head": "सटीक बात कहना",
+            "in hot water": "मुसीबत में होना"
+        }
+
+        # Sort idioms by length (longest first) to handle overlapping phrases
+        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
+
+        # Replace idioms with their translations
+        for idiom in sorted_idioms:
+            pattern = r'\b' + re.escape(idiom) + r'\b'
+            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
+
+    elif src_lang == "en" and tgt_lang == "mr":
+        idiom_map = {
+            "no piece of cake": "सोपं काम नाही",
+            "bite the bullet": "कठीण निर्णय घेणे",
+            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
+            "fell into place": "सगळं व्यवस्थित झालं",
+            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
+            "with a little perseverance": "थोड्या धीराने",
+            "break the ice": "संभाषणाची सुरुवात करणे",
+            "on cloud nine": "आनंदात असणे",
+            "once in a blue moon": "क्वचितच",
+            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
+            "better late than never": "उशीर का होईना पण योग्य वेळी"
+        }
+        for idiom, translation in idiom_map.items():
+            pattern = r'\b' + re.escape(idiom) + r'\b'
+            text = re.sub(pattern, translation, text, flags=re.IGNORECASE)
+
+    return text
+
 def extract_text(file):
     ext = os.path.splitext(file.name)[1].lower()
 
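It may help to see what the two helpers added in this hunk actually do. A small sanity check against the functions exactly as defined above (the expected results follow directly from the shown implementation):

# Assumes split_long_sentence and preprocess_idioms are in scope as defined above.
sentence = ("word " * 60).strip()            # 299 characters
chunks = split_long_sentence(sentence, max_length=200)
assert all(len(c) <= 200 for c in chunks)    # greedy word packing respects the cap
assert " ".join(chunks) == sentence          # no words lost or reordered

text = "Getting started was no piece of cake."
print(preprocess_idioms(text, "en", "hi"))
# -> "Getting started was कोई आसान काम नहीं."
# Longest idioms are substituted first, case-insensitively, on \b word boundaries.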
@@ -43,7 +136,6 @@ def extract_text(file):
     else:
         raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
 
-# Translation function
 def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text
@@ -61,86 +153,16 @@ def translate_text(text, src_lang, tgt_lang, models):
     # Preprocess for idioms
     preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
 
-    # Split text into
-    sentences =
-    translated_text =
+    # Split text into smaller chunks (sentences)
+    sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
+    translated_text = []
 
     for sentence in sentences:
         if sentence.strip():
-
-            # Use lang_code_to_id instead of get_lang_id
-            translated = model.generate(
-                **inputs,
-                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang_code],
-                max_length=512
-            )
-            translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)
-            translated_text += translated_sentence + "\n"
-
-    return translated_text
-
-# Function to save text as a file
-def save_text_to_file(text, original_filename, prefix="translated"):
-    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
-    with open(output_filename, "w", encoding="utf-8") as f:
-        f.write(text)
-    return output_filename
-
-# Main processing function
-def process_document(file, source_lang, target_lang, models):
-    try:
-        # Extract text from uploaded file
-        text = extract_text(file)
-
-        # Translate the text
-        translated_text = translate_text(text, source_lang, target_lang, models)
-
-        # Save the result (success or error) to a file
-        if translated_text.startswith("Error:"):
-            output_file = save_text_to_file(translated_text, file.name, prefix="error")
-        else:
-            output_file = save_text_to_file(translated_text, file.name)
-
-        return output_file, translated_text
-    except Exception as e:
-        # Save error message to a file
-        error_message = f"Error: {str(e)}"
-        output_file = save_text_to_file(error_message, file.name, prefix="error")
-        return output_file, error_message
-
-# Streamlit interface
-def main():
-    st.title("Document Translator (NLLB-200)")
-    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
-
-    # Initialize models
-    models = initialize_models()
-
-    # File uploader
-    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
-
-    # Language selection
-    col1, col2 = st.columns(2)
-    with col1:
-        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
-    with col2:
-        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
-
-    if uploaded_file is not None and st.button("Translate"):
-        with st.spinner("Translating..."):
-            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
-
-            # Display result
-            st.text_area("Translated Text", result_text, height=300)
+            chunks = split_long_sentence(sentence, max_length=200)
 
-
-
-
-
-
-                file_name=os.path.basename(output_file),
-                mime="text/plain"
-            )
-
-if __name__ == "__main__":
-    main()
+            for chunk in chunks:
+                try:
+                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
+                    translated = model.generate(
+                        **inputs
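The diff view is cut off after **inputs. Going by the code this hunk removes, the loop presumably completes the same way the old per-sentence version did, just per chunk and accumulating into the translated_text list. A hedged reconstruction (the tgt_lang_code variable, the except body, and the final join are assumptions carried over from the removed code, not visible in the truncated hunk):

def translate_chunks(sentences, tokenizer, model, tgt_lang_code):
    """Reconstruction of the presumed new loop; not the confirmed app.py code."""
    translated_text = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        for chunk in split_long_sentence(sentence, max_length=200):
            try:
                inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                                   truncation=True, max_length=512)
                translated = model.generate(
                    **inputs,
                    # NLLB selects the output language via the forced BOS token,
                    # e.g. "hin_Deva"; newer transformers versions replace
                    # tokenizer.lang_code_to_id[code] with
                    # tokenizer.convert_tokens_to_ids(code).
                    forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang_code],
                    max_length=512,
                )
                translated_text.append(
                    tokenizer.decode(translated[0], skip_special_tokens=True))
            except Exception as e:
                # Assumed error handling for the open try block above.
                translated_text.append(f"Error: {e}")
    return "\n".join(translated_text)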