Update app.py
app.py
CHANGED
@@ -1,22 +1,39 @@
-from transformers import …
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import streamlit as st
 from PyPDF2 import PdfReader
 import docx
 import os
+import re

-# Load …
+# Load NLLB model and tokenizer
 @st.cache_resource
 def load_translation_model():
-    model_name = "facebook/…
-    tokenizer = …
-    model = …
+    model_name = "facebook/nllb-200-distilled-600M"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
     return tokenizer, model

 # Initialize model
 @st.cache_resource
 def initialize_models():
     tokenizer, model = load_translation_model()
-    return {"…
+    return {"nllb": (tokenizer, model)}
+
+# Preprocess text to handle idioms (basic mapping for demonstration)
+def preprocess_idioms(text, src_lang, tgt_lang):
+    if src_lang == "en" and tgt_lang == "hi":
+        idiom_map = {
+            "no piece of cake": "कोई आसान काम नहीं",
+            "bite the bullet": "दांतों तले उंगली दबाना",
+            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
+            "fell into place": "ठीक हो गया",
+            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
+            "with a little perseverance": "थोड़े से धैर्य से"
+        }
+        for idiom, translation in idiom_map.items():
+            text = re.sub(r'\b' + idiom + r'\b', translation, text, flags=re.IGNORECASE)
+    # Add more mappings for other language pairs (e.g., en-mr) as needed
+    return text

 # Function to extract text from different file types
 def extract_text(file):
@@ -47,8 +64,8 @@ def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text

-    # Language codes for …
-    lang_map = {"en": "…
+    # Language codes for NLLB
+    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

     if src_lang not in lang_map or tgt_lang not in lang_map:
         return "Error: Unsupported language combination"
@@ -56,19 +73,19 @@ def translate_text(text, src_lang, tgt_lang, models):
     src_lang_code = lang_map[src_lang]
     tgt_lang_code = lang_map[tgt_lang]

-    tokenizer, model = models["…
+    tokenizer, model = models["nllb"]

-    # …
-    …
+    # Preprocess for idioms
+    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)

     # Split text into manageable chunks
-    sentences = …
+    sentences = preprocessed_text.split("\n")
     translated_text = ""

     for sentence in sentences:
         if sentence.strip():
-            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
-            translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang_code))
+            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512, src_lang=src_lang_code)
+            translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang_code), max_length=512)
             translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)
             translated_text += translated_sentence + "\n"

@@ -105,7 +122,7 @@ def process_document(file, source_lang, target_lang, models):

 # Streamlit interface
 def main():
-    st.title("Document Translator (…
+    st.title("Document Translator (NLLB-200)")
     st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")

     # Initialize models
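A note on preprocess_idioms: the pattern is built by raw string concatenation, which is safe for the phrases currently in the table but would misfire on any idiom containing regex metacharacters. A hedged sketch of a safer variant, assuming the same idiom_map shape as in the diff:

import re

def apply_idiom_map(text, idiom_map):
    # re.escape guards against idioms containing metacharacters such as '?' or '('
    for idiom, translation in idiom_map.items():
        pattern = r"\b" + re.escape(idiom) + r"\b"
        text = re.sub(pattern, translation, text, flags=re.IGNORECASE)
    return text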
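A note on the generation call: tokenizer.get_lang_id is the M2M100Tokenizer API, and src_lang is normally fixed on the tokenizer rather than passed per call, so the committed call may not run as written against an NLLB tokenizer. A minimal sketch of the pattern the transformers docs use for NLLB checkpoints — the model name is taken from the diff, the sample sentence is arbitrary:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/nllb-200-distilled-600M"
# The source language is set on the tokenizer itself, not passed per call
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

inputs = tokenizer("The project was no piece of cake.", return_tensors="pt")
# The target language is forced via its language-code token id
generated = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("hin_Deva"),
    max_length=512,
)
print(tokenizer.decode(generated[0], skip_special_tokens=True))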