sarahai committed
Commit 1b6a167 · verified · 1 Parent(s): 94cbf8c

Update app.py

Files changed (1)
  1. app.py +46 -59
app.py CHANGED
@@ -1,69 +1,56 @@
  import streamlit as st
- import torch
- from transformers import AutoModelForSeq2SeqLM, NllbTokenizer, T5Tokenizer, T5ForConditionalGeneration
- from transformers import pipeline

- # Load translation model and tokenizer (same as before)
- model_load_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
- model = AutoModelForSeq2SeqLM.from_pretrained(model_load_name)
- tokenizer = NllbTokenizer.from_pretrained(model_load_name)


- # Define translation function (same as before)
- def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl', a=16, b=1.5, max_input_length=1024):
      tokenizer.src_lang = src_lang
      tokenizer.tgt_lang = tgt_lang
      inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
-     result = model.generate(
-         **inputs,
-         forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
-         max_new_tokens=int(a + b * inputs.input_ids.shape[1])
-
      )
-     return tokenizer.batch_decode(result, skip_special_tokens=True)
-
- # Load summarization model and tokenizer
- model_name = "sarahai/ruT5-base-summarizer"
- summarizer_tokenizer = T5Tokenizer.from_pretrained(model_name)
- summarizer_model = T5ForConditionalGeneration.from_pretrained(model_name)
-
- # Define summarization function (using model and tokenizer)
- def summarize(russian_text):
-     try:
-         input_ids = summarizer_tokenizer(text, return_tensors="pt", padding="max_length").input_ids
-         outputs = summarizer_model.generate(
-             input_ids,
-             max_length=250,
-             min_length=150,
-             length_penalty=2.0,
-             num_beams=4,
-             early_stopping=True,
-         )
-         summary = summarizer_tokenizer.decode(outputs[0], skip_special_tokens=True)
-         return summary
-     except Exception as e:
-         return f"Ошибка: {str(e)}"
-
- # Streamlit interface
- st.title("Перевод и аннотация текста")
- text = st.text_area("Введите текст на узбекском языке", height=200)
-
- if st.button("Перевести и аннотировать"):
-     if text:
-         try:
-             # Translate Uzbek text to Russian
-             russian_text = translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl', a=16, b=1.5, max_input_length=1024)
-
-             # Summarize the translated Russian text
-             summary = summarize(russian_text)
-
-             # Display results
-             st.success("Перевод на русский:")
-             st.write(russian_text)
-             st.success("Аннотация русского текста:")
-             st.write(summary)
-         except Exception as e:
-             st.error(f"Ошибка: {e}")
      else:
-         st.warning("Пожалуйста, введите текст на узбекском языке.")

  import streamlit as st
+ from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration
+ from transformers import NllbTokenizer, T5Tokenizer

+ # Load translation model and tokenizer
+ translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
+ translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
+ translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)

+ # Load summarization model and tokenizer
+ summarization_model_name = 'sarahai/ruT5-base-summarizer'
+ summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
+ summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)

+ def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl', max_input_length=1024):
      tokenizer.src_lang = src_lang
      tokenizer.tgt_lang = tgt_lang
      inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
+     outputs = model.generate(
+         inputs['input_ids'],
+         forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
+         max_length=512
      )
+     translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return translated_text
+
+ def summarize(translated_text, model, tokenizer, max_length=250, min_length=150):
+     input_ids = tokenizer.encode("summarize: " + translated_text, return_tensors="pt", max_length=1024, truncation=True)
+     summary_ids = model.generate(
+         input_ids,
+         max_length=max_length,
+         min_length=min_length,
+         length_penalty=2.0,
+         num_beams=4,
+         early_stopping=True
+     )
+     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     return summary
+
+ # Streamlit app setup
+ st.title("Uzbek to Russian Translation and Summarization")
+ input_text = st.text_area("Enter text in Uzbek (Cyrillic):", height=200)
+
+ if st.button("Translate and Summarize"):
+     if input_text:
+         with st.spinner('Translating...'):
+             translated_text = translate(input_text, translation_model, translation_tokenizer)
+             st.text_area("Translated Text (Russian):", value=translated_text, height=200)
+
+         with st.spinner('Summarizing...'):
+             summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250, min_length=150)
+             st.text_area("Summary (Russian):", value=summary_text, height=100)
      else:
+         st.warning("Please enter text in Uzbek (Cyrillic) to proceed.")
+
56