"""Streamlit app: translate Uzbek (Cyrillic) text to Russian, then summarize it."""

import streamlit as st
from transformers import (
    AutoModelForSeq2SeqLM,
    NllbTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
summarization_model_name = 'sarahai/ruT5-base-summarizer'

# Tokens reserved per chunk for the special tokens the tokenizer adds when the
# chunk is re-encoded, plus slack for the token count drifting slightly when a
# chunk is converted to a string and tokenized again.
_CHUNK_TOKEN_HEADROOM = 16


@st.cache_resource
def _load_translation_pipeline(model_name):
    """Load the translation model and tokenizer once per server process."""
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


@st.cache_resource
def _load_summarization_pipeline(model_name):
    """Load the summarization model and tokenizer once per server process."""
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer


# Streamlit re-executes this script on every widget interaction; the cached
# loaders keep the weights from being reloaded on each rerun.
translation_model, translation_tokenizer = _load_translation_pipeline(
    translation_model_name
)
summarization_model, summarization_tokenizer = _load_summarization_pipeline(
    summarization_model_name
)


def split_into_chunks(text, tokenizer, max_length=150):
    """Split *text* into strings of at most *max_length* tokens each.

    The text is tokenized with ``tokenizer.tokenize``, the tokens are grouped
    into consecutive runs of ``max_length``, and every run is converted back
    to a string with ``tokenizer.convert_tokens_to_string``.

    Args:
        text: The text to split.
        tokenizer: Tokenizer providing ``tokenize`` and
            ``convert_tokens_to_string``.
        max_length: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        A list of chunk strings in original order (empty if there are no
        tokens).

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    tokens = tokenizer.tokenize(text)
    # NOTE: boundaries fall on token counts, not sentence boundaries, so a
    # chunk may start or end mid-sentence.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]


def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl',
              max_input_tokens=128):
    """Translate *text* chunk by chunk and join the results with spaces.

    Args:
        text: Source text.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        src_lang: Source language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Target language code; its token id is forced as the first
            generated token.
        max_input_tokens: Upper bound on the encoded length of one chunk,
            special tokens included.

    Returns:
        The translated chunks joined by single spaces.

    Raises:
        KeyError: If ``tgt_lang`` is not a token known to the tokenizer.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # `lang_code_to_id` is deprecated in recent transformers releases; the
    # language codes are ordinary vocabulary tokens, so look the id up directly.
    tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if tgt_lang_id is None or tgt_lang_id == tokenizer.unk_token_id:
        raise KeyError(tgt_lang)

    # Chunks must be shorter than the encoder limit; otherwise
    # `truncation=True` below would silently drop the tail of every chunk.
    chunk_tokens = max(1, max_input_tokens - _CHUNK_TOKEN_HEADROOM)

    translated_chunks = []
    for chunk in split_into_chunks(text, tokenizer, max_length=chunk_tokens):
        inputs = tokenizer(
            chunk,
            return_tensors='pt',
            truncation=True,  # safety net only; chunks are sized to fit
            max_length=max_input_tokens,
        )
        # Pass the whole encoding so the attention mask reaches `generate`.
        outputs = model.generate(**inputs, forced_bos_token_id=tgt_lang_id)
        translated_chunks.append(
            tokenizer.decode(outputs[0], skip_special_tokens=True)
        )
    return ' '.join(translated_chunks)


def summarize(text, model, tokenizer, max_length=250):
    """Generate a summary of *text* using beam search.

    Args:
        text: Text to summarize; it is prefixed with ``"summarize: "``.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The decoded summary, or an empty string when *text* is blank.
    """
    if not text.strip():
        # Nothing to summarize; skip generation on a bare prompt prefix.
        return ""

    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=2048,
        truncation=True,
    )
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Streamlit UI
st.title("Перевод с узбекского на русский и суммаризация")

text = st.text_area("Введите текст на узбекском:", height=200)

if st.button("Перевести и суммаризировать"):
    # Treat whitespace-only input the same as empty input.
    if text and text.strip():
        with st.spinner('Переводим...'):
            translated_text = translate(
                text, translation_model, translation_tokenizer
            )
        st.text_area(
            "Переведенный текст (на русском):",
            value=translated_text,
            height=200,
        )

        with st.spinner('Суммаризируем...'):
            summary_text = summarize(
                translated_text,
                summarization_model,
                summarization_tokenizer,
                max_length=250,
            )
        st.text_area(
            "Суммаризация (на русском):",
            value=summary_text,
            height=100,
        )
    else:
        st.warning("Пожалуйста, введите текст на узбекском языке для перевода.")