gauravchand11 committed · verified
Commit b317132 · 1 Parent(s): ae0d447

Update app.py

Files changed (1): app.py +32 -12
app.py CHANGED
@@ -1,22 +1,39 @@
-from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import streamlit as st
 from PyPDF2 import PdfReader
 import docx
 import os
+import re

-# Load M2M100 model and tokenizer
+# Load NLLB model and tokenizer
 @st.cache_resource
 def load_translation_model():
-    model_name = "facebook/m2m100_418M"
-    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
-    model = M2M100ForConditionalGeneration.from_pretrained(model_name)
+    model_name = "facebook/nllb-200-distilled-600M"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
     return tokenizer, model

 # Initialize model
 @st.cache_resource
 def initialize_models():
     tokenizer, model = load_translation_model()
-    return {"m2m100": (tokenizer, model)}
+    return {"nllb": (tokenizer, model)}
+
+# Preprocess text to handle idioms (basic mapping for demonstration)
+def preprocess_idioms(text, src_lang, tgt_lang):
+    if src_lang == "en" and tgt_lang == "hi":
+        idiom_map = {
+            "no piece of cake": "कोई आसान काम नहीं",
+            "bite the bullet": "दांतों तले उंगली दबाना",
+            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
+            "fell into place": "ठीक हो गया",
+            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
+            "with a little perseverance": "थोड़े से धैर्य से"
+        }
+        for idiom, translation in idiom_map.items():
+            text = re.sub(r'\b' + idiom + r'\b', translation, text, flags=re.IGNORECASE)
+    # Add more mappings for other language pairs (e.g., en-mr) as needed
+    return text

 # Function to extract text from different file types
 def extract_text(file):
@@ -47,8 +64,8 @@ def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text

-    # Language codes for M2M100 (simplified to match user input)
-    lang_map = {"en": "en", "hi": "hi", "mr": "mr"}
+    # Language codes for NLLB
+    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

     if src_lang not in lang_map or tgt_lang not in lang_map:
         return "Error: Unsupported language combination"
@@ -56,19 +73,22 @@ def translate_text(text, src_lang, tgt_lang, models):
     src_lang_code = lang_map[src_lang]
     tgt_lang_code = lang_map[tgt_lang]

-    tokenizer, model = models["m2m100"]
+    tokenizer, model = models["nllb"]

     # Set source language
     tokenizer.src_lang = src_lang_code

+    # Preprocess for idioms
+    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
+
     # Split text into manageable chunks
-    sentences = text.split("\n")
+    sentences = preprocessed_text.split("\n")
     translated_text = ""

     for sentence in sentences:
         if sentence.strip():
             inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
-            translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang_code))
+            translated = model.generate(**inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang_code), max_length=512)
             translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)
             translated_text += translated_sentence + "\n"

@@ -105,7 +125,7 @@ def process_document(file, source_lang, target_lang, models):

 # Streamlit interface
 def main():
-    st.title("Document Translator (M2M100)")
+    st.title("Document Translator (NLLB-200)")
     st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")

     # Initialize models
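
Note on the NLLB API used above: unlike M2M100Tokenizer, the NLLB tokenizer has no get_lang_id helper. The source language is set as an attribute on the tokenizer before encoding, and the target-language token id is looked up in the vocabulary with convert_tokens_to_ids, since NLLB language codes are ordinary vocabulary tokens. A minimal standalone sketch of the call pattern, using the same checkpoint and language codes as the diff (the sample sentence is illustrative only):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Source language is set on the tokenizer before encoding, not per call
tokenizer.src_lang = "eng_Latn"
inputs = tokenizer("The project was no piece of cake.", return_tensors="pt")

# Force the target language code as the first generated token
translated = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("hin_Deva"),
    max_length=512,
)
print(tokenizer.decode(translated[0], skip_special_tokens=True))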
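
The idiom pass is a literal, case-insensitive phrase substitution applied before the text reaches the model. A quick check of preprocess_idioms as defined above (the sample input is illustrative only):

# Both idioms sit at word boundaries, so both are replaced
text = "Translating it was no piece of cake, but everything fell into place."
print(preprocess_idioms(text, "en", "hi"))
# Translating it was कोई आसान काम नहीं, but everything ठीक हो गया.

The mapping keys are currently plain phrases; if an entry ever contains regex metacharacters, it should be wrapped in re.escape() before being concatenated into the pattern.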