gauravchand11 commited on
Commit
77a6efe
·
verified ·
1 Parent(s): c124a1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -81
app.py CHANGED
@@ -4,6 +4,14 @@ from PyPDF2 import PdfReader
4
  import docx
5
  import os
6
  import re
 
 
 
 
 
 
 
 
7
 
8
  # Load NLLB model and tokenizer
9
  @st.cache_resource
@@ -19,7 +27,92 @@ def initialize_models():
19
  tokenizer, model = load_translation_model()
20
  return {"nllb": (tokenizer, model)}
21
 
22
- # Function to extract text from different file types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def extract_text(file):
24
  ext = os.path.splitext(file.name)[1].lower()
25
 
@@ -43,7 +136,6 @@ def extract_text(file):
43
  else:
44
  raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
45
 
46
- # Translation function
47
  def translate_text(text, src_lang, tgt_lang, models):
48
  if src_lang == tgt_lang:
49
  return text
@@ -61,86 +153,16 @@ def translate_text(text, src_lang, tgt_lang, models):
61
  # Preprocess for idioms
62
  preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
63
 
64
- # Split text into manageable chunks
65
- sentences = preprocessed_text.split("\n")
66
- translated_text = ""
67
 
68
  for sentence in sentences:
69
  if sentence.strip():
70
- inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
71
- # Use lang_code_to_id instead of get_lang_id
72
- translated = model.generate(
73
- **inputs,
74
- forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang_code],
75
- max_length=512
76
- )
77
- translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)
78
- translated_text += translated_sentence + "\n"
79
-
80
- return translated_text
81
-
82
def save_text_to_file(text, original_filename, prefix="translated"):
    """Persist *text* to a UTF-8 text file and return the file's name.

    The output name is built as ``<prefix>_<basename>.txt`` from the
    uploaded file's basename, so results land next to the app regardless
    of where the original came from.
    """
    out_name = f"{prefix}_{os.path.basename(original_filename)}.txt"
    with open(out_name, "w", encoding="utf-8") as handle:
        handle.write(text)
    return out_name
88
-
89
def process_document(file, source_lang, target_lang, models):
    """Extract, translate, and save a document; return (output_path, text).

    Never raises: any failure (extraction, translation, …) is converted
    into an "Error: …" message that is itself written to a file with the
    ``error`` prefix, so the caller always has something to download.
    """
    try:
        raw_text = extract_text(file)
        translated = translate_text(raw_text, source_lang, target_lang, models)

        # translate_text signals soft failures via an "Error:" prefix
        # rather than raising — route those to an error-named file.
        prefix = "error" if translated.startswith("Error:") else "translated"
        output_path = save_text_to_file(translated, file.name, prefix=prefix)
        return output_path, translated
    except Exception as exc:
        message = f"Error: {str(exc)}"
        output_path = save_text_to_file(message, file.name, prefix="error")
        return output_path, message
110
-
111
def main():
    """Streamlit entry point: upload a document, pick languages, translate."""
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")

    # Cached by @st.cache_resource, so this is cheap on reruns.
    models = initialize_models()

    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])

    # Side-by-side language pickers; defaults give an en -> hi translation.
    left, right = st.columns(2)
    with left:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with right:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)

    if uploaded_file is not None and st.button("Translate"):
        with st.spinner("Translating..."):
            output_file, result_text = process_document(
                uploaded_file, source_lang, target_lang, models
            )

            # Show the result (which may be an "Error: ..." message).
            st.text_area("Translated Text", result_text, height=300)

            # Offer the saved file for download.
            with open(output_file, "rb") as payload:
                st.download_button(
                    label="Download Translated Document",
                    data=payload,
                    file_name=os.path.basename(output_file),
                    mime="text/plain",
                )


if __name__ == "__main__":
    main()
 
4
  import docx
5
  import os
6
  import re
7
+ from datetime import datetime
8
+
9
+ # Page config
10
+ st.set_page_config(
11
+ page_title="Document Translator (NLLB-200)",
12
+ page_icon="📄",
13
+ layout="wide"
14
+ )
15
 
16
  # Load NLLB model and tokenizer
17
  @st.cache_resource
 
27
  tokenizer, model = load_translation_model()
28
  return {"nllb": (tokenizer, model)}
29
 
30
def split_long_sentence(sentence, max_length=200):
    """Split a long sentence into chunks of at most *max_length* characters.

    Splitting happens only at whitespace, so a single word longer than
    *max_length* is kept intact as its own (oversized) chunk rather than
    being cut mid-word.  A sentence that already fits is returned as a
    single-element list.

    Fixes over the previous version: no empty-string chunk is emitted when
    the very first word is oversized, and the joining-space character is
    only counted when the chunk is non-empty (so a word of exactly
    max_length characters no longer forces a premature split).
    """
    if len(sentence) <= max_length:
        return [sentence]

    chunks = []
    current = ""
    for word in sentence.split():
        # +1 accounts for the space that would join `word` onto `current`;
        # the first word in a chunk needs no separator.
        needed = len(word) + (1 if current else 0)
        if len(current) + needed <= max_length:
            current = f"{current} {word}" if current else word
        else:
            if current:  # never emit an empty chunk for an oversized word
                chunks.append(current)
            current = word

    if current:
        chunks.append(current)

    return chunks
50
+
51
def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with Hindi/Marathi equivalents before MT.

    Only the en->hi and en->mr directions have idiom tables; every other
    language pair returns *text* unchanged.  Matching is case-insensitive
    and anchored at word boundaries.  Idioms are always applied longest
    phrase first so overlapping entries resolve correctly (e.g.
    "no piece of cake" must win over "a piece of cake") — previously only
    the Hindi branch did this; the Marathi branch now behaves consistently.
    """
    def _replace_idioms(source, idiom_map):
        # Longest-first keeps a shorter idiom from clobbering part of a
        # longer one that contains it.
        for idiom in sorted(idiom_map, key=len, reverse=True):
            pattern = r'\b' + re.escape(idiom) + r'\b'
            source = re.sub(pattern, idiom_map[idiom], source, flags=re.IGNORECASE)
        return source

    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Common English-Hindi idiom mappings
            "no piece of cake": "कोई आसान काम नहीं",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
            "fell into place": "ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
            "with a little perseverance": "थोड़े से धैर्य से",
            "break the ice": "बातचीत की शुरुआत करना",
            "on cloud nine": "सातवें आसमान पर होना",
            "once in a blue moon": "कभी-कभार",
            "beating around the bush": "इधर-उधर की बात करना",
            "burning the midnight oil": "रात-रात भर जागकर काम करना",
            "calm before the storm": "तूफान से पहले की शांति",
            "cost an arm and a leg": "बहुत महंगा होना",
            "blessing in disguise": "छुपा हुआ वरदान",
            "kill two birds with one stone": "एक पंथ दो काज",
            "a piece of cake": "बहुत आसान काम",
            "under the weather": "तबीयत ठीक न होना",
            "pull yourself together": "खुद को संभालो",
            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
            "time flies": "समय पंख लगाकर उड़ता है",
            "actions speak louder than words": "कथनी से करनी बड़ी",
            "all ears": "पूरा ध्यान से सुन रहा हूं",
            "back to square one": "वापस शुरुआत में",
            "better late than never": "देर आये दुरुस्त आये",
            "cry over spilled milk": "बीती बात पर पछताना",
            "down to earth": "सरल स्वभाव का",
            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
            "food for thought": "सोचने वाली बात",
            "give someone the benefit of the doubt": "शक का फायदा देना",
            "hit the nail on the head": "सटीक बात कहना",
            "in hot water": "मुसीबत में होना"
        }
        text = _replace_idioms(text, idiom_map)

    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
            "with a little perseverance": "थोड्या धीराने",
            "break the ice": "संभाषणाची सुरुवात करणे",
            "on cloud nine": "आनंदात असणे",
            "once in a blue moon": "क्वचितच",
            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
            "better late than never": "उशीर का होईना पण योग्य वेळी"
        }
        text = _replace_idioms(text, idiom_map)

    return text
115
+
116
  def extract_text(file):
117
  ext = os.path.splitext(file.name)[1].lower()
118
 
 
136
  else:
137
  raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
138
 
 
139
  def translate_text(text, src_lang, tgt_lang, models):
140
  if src_lang == tgt_lang:
141
  return text
 
153
  # Preprocess for idioms
154
  preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
155
 
156
+ # Split text into smaller chunks (sentences)
157
+ sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
158
+ translated_text = []
159
 
160
  for sentence in sentences:
161
  if sentence.strip():
162
+ chunks = split_long_sentence(sentence, max_length=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
+ for chunk in chunks:
165
+ try:
166
+ inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
167
+ translated = model.generate(
168
+ **inputs