"""Streamlit app: OCR an image, translate Uzbek (Cyrillic) to Russian, summarize.

Pipeline: EasyOCR extracts text from the uploaded image, an NLLB model
translates it to Russian, and a ruT5 model summarizes the translation.
"""

import streamlit as st
from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration, NllbTokenizer, T5Tokenizer
import easyocr
from PIL import Image
import numpy as np

# Encoder-side token limit applied when a chunk is tokenized for translation.
_TRANSLATION_MAX_INPUT_TOKENS = 128
# Chunk size used when splitting text for translation. It is deliberately
# smaller than the encoder limit: the tokenizer adds special tokens, and
# re-tokenizing a detokenized chunk is not guaranteed to give the same count.
# Chunks larger than the limit would be silently truncated.
_TRANSLATION_CHUNK_TOKENS = 100


@st.cache_resource
def _load_translation_model(model_name):
    """Load the translation model and tokenizer once per server process."""
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


@st.cache_resource
def _load_summarization_model(model_name):
    """Load the summarization model and tokenizer once per server process."""
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer


@st.cache_resource
def _get_ocr_reader(lang):
    """Return a cached EasyOCR reader for a single language code."""
    return easyocr.Reader([lang])


# Streamlit re-executes this script on every widget interaction, so the heavy
# objects are obtained through cached loaders instead of being rebuilt.
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
translation_model, translation_tokenizer = _load_translation_model(translation_model_name)

summarization_model_name = 'sarahai/ruT5-base-summarizer'
summarization_model, summarization_tokenizer = _load_summarization_model(summarization_model_name)


def extract_text(image_path, lang='uzb_Cyrl'):
    """Run OCR on an image and return the recognized text with its confidence.

    Args:
        image_path: Despite the name, an image object convertible with
            ``np.array`` (the UI passes a PIL image).
        lang: EasyOCR language code for the reader.
            NOTE(review): the default looks like an NLLB language code rather
            than an EasyOCR one; the UI below always passes 'tjk'. Confirm
            before relying on the default.

    Returns:
        A ``(text, confidence)`` tuple: the recognized fragments joined by
        single spaces, and the mean per-fragment confidence (0 when nothing
        was recognized).
    """
    # Palette-mode and other non-RGB PIL images do not convert to a plain
    # pixel array (mode "P" yields palette indices), so normalize first.
    if isinstance(image_path, Image.Image) and image_path.mode != 'RGB':
        image_path = image_path.convert('RGB')

    reader = _get_ocr_reader(lang)
    results = reader.readtext(np.array(image_path))

    texts = [text for (_bbox, text, _prob) in results]
    confidences = [prob for (_bbox, _text, prob) in results]

    final_confidence = sum(confidences) / len(confidences) if confidences else 0
    return ' '.join(texts).strip(), final_confidence


def split_into_chunks(text, tokenizer, max_length=150):
    """Split text into pieces of at most ``max_length`` tokenizer tokens.

    Args:
        text: Text to split.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``.
        max_length: Maximum number of tokens per chunk.

    Returns:
        A list of chunk strings in original order; empty if the text yields
        no tokens.
    """
    tokens = tokenizer.tokenize(text)
    # A non-positive limit degrades to one token per chunk.
    step = max(1, max_length)
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + step])
        for start in range(0, len(tokens), step)
    ]


def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
    """Translate text chunk by chunk and join the results with spaces.

    Args:
        text: Source text.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; its ``src_lang`` and
            ``tgt_lang`` attributes are set by this function.
        src_lang: Source language code.
        tgt_lang: Target language code, forced as the first generated token.

    Returns:
        The translated text; an empty string if ``text`` yields no tokens.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Resolve the target-language token id through the generic vocabulary
    # lookup; the dedicated ``lang_code_to_id`` mapping is not available in
    # recent transformers releases.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    # Chunks must fit under the encoder limit used below, otherwise the tail
    # of each chunk is truncated and never translated.
    chunks = split_into_chunks(text, tokenizer, max_length=_TRANSLATION_CHUNK_TOKENS)

    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(
            chunk,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=_TRANSLATION_MAX_INPUT_TOKENS,
        )
        # Pass the whole encoding so the attention mask reaches the model.
        outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    return ' '.join(translated_chunks)


def summarize(text, model, tokenizer, max_length=250):
    """Summarize text with a T5-style model using beam search.

    Args:
        text: Text to summarize; it is prefixed with ``"summarize: "``.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum length of the generated summary in tokens.

    Returns:
        The decoded summary string.
    """
    # Input longer than 2048 tokens is truncated before generation.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=2048,
        truncation=True,
    )
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Streamlit UI setup
st.title('Текстовая обработка изображений, перевод с узбекского на русский и суммаризация')

uploaded_file = st.file_uploader("Загрузите изображение с узбекским текстом...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image, caption='Загруженное изображение', use_column_width=True)

    st.write("Процесс извлечения текста...")
    extracted_text, confidence = extract_text(image, 'tjk')  # Adjust the language code if necessary
    st.write("Извлеченный текст:")
    st.text_area("Результат", extracted_text, height=150)
    st.write(f"Точность распознавания: {confidence*100:.2f}%")

    if st.button("Перевести и суммаризировать"):
        if extracted_text:
            with st.spinner('Переводим...'):
                translated_text = translate(extracted_text, translation_model, translation_tokenizer)
            st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)

            with st.spinner('Суммаризируем...'):
                summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
            st.text_area("Суммаризация (на русском):", value=summary_text, height=100)
        else:
            st.warning("Текст для перевода не найден.")