import streamlit as st
from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration, NllbTokenizer, T5Tokenizer
import easyocr
from PIL import Image
import numpy as np
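# Runtime dependencies (an assumption inferred from the imports; normally
# pinned in the Space's requirements.txt): streamlit, transformers,
# sentencepiece, torch, easyocr, pillow, numpy.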
# Load the NLLB translation model (Uzbek Cyrillic -> Russian).
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)

# Load the ruT5 summarization model (Russian).
summarization_model_name = 'sarahai/ruT5-base-summarizer'
summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
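# Note: Streamlit reruns this whole script on every interaction, so the loads
# above repeat on each rerun. A minimal sketch of the idiomatic fix, assuming
# Streamlit >= 1.18 (where st.cache_resource exists); load_models is a
# hypothetical helper, not part of the original app:
#
# @st.cache_resource
# def load_models():
#     model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
#     tokenizer = NllbTokenizer.from_pretrained(translation_model_name)
#     return model, tokenizer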
def extract_text(image, lang='tjk'):
    """Run EasyOCR on a PIL image and return (text, mean confidence)."""
    # EasyOCR uses its own language codes, not NLLB codes: 'tjk' (Tajik)
    # shares the Cyrillic script with Uzbek, and EasyOCR has no dedicated
    # Uzbek Cyrillic model.
    reader = easyocr.Reader([lang])
    results = reader.readtext(np.array(image))
    all_text = ''
    confidences = []
    for (bbox, text, prob) in results:
        all_text += ' ' + text
        confidences.append(prob)
    final_confidence = sum(confidences) / len(confidences) if confidences else 0
    return all_text.strip(), final_confidence
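# Note: easyocr.Reader reloads its detection/recognition weights on every
# call. A minimal caching sketch (assumes the Streamlit runtime; get_reader
# is a hypothetical helper, not part of the original app):
#
# @st.cache_resource
# def get_reader(lang='tjk'):
#     return easyocr.Reader([lang])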
def split_into_chunks(text, tokenizer, max_length=150):
    """Split text into chunks of at most max_length tokens."""
    tokens = tokenizer.tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for token in tokens:
        current_chunk.append(token)
        current_length += 1
        if current_length >= max_length:
            chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
    return chunks
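# Illustration (hypothetical numbers): a 320-token input produces chunks of
# 150, 150, and 20 tokens, which translate() below re-joins after translation.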
def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
    """Translate text chunk by chunk with the NLLB model."""
    tokenizer.src_lang = src_lang  # the target language is set via forced_bos_token_id below
    chunks = split_into_chunks(text, tokenizer)
    translated_chunks = []
    for chunk in chunks:
        # max_length=160 leaves headroom above the 150-token chunks, so the
        # tokenizer's truncation never cuts them off (the original 128 did).
        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=160)
        # NLLB language codes are tokens in the vocabulary, so
        # convert_tokens_to_ids works across transformers versions
        # (lang_code_to_id was removed in newer releases).
        outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang))
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)
def summarize(text, model, tokenizer, max_length=250):
    """Summarize Russian text with ruT5 using beam search."""
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=2048, truncation=True)
    summary_ids = model.generate(input_ids, max_length=max_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
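# Usage sketch (hypothetical invocation, not from the original app):
#   summary = summarize(translated_text, summarization_model,
#                       summarization_tokenizer, max_length=120)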
# Streamlit UI setup
st.title('Текстовая обработка изображений, перевод с узбекского на русский и суммаризация')

uploaded_file = st.file_uploader("Загрузите изображение с узбекским текстом...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image, caption='Загруженное изображение', use_column_width=True)

    # OCR step: 'tjk' (Tajik) is the closest EasyOCR Cyrillic model to Uzbek.
    st.write("Процесс извлечения текста...")
    extracted_text, confidence = extract_text(image, 'tjk')
    st.write("Извлеченный текст:")
    st.text_area("Результат", extracted_text, height=150)
    st.write(f"Точность распознавания: {confidence*100:.2f}%")

    if st.button("Перевести и суммаризировать"):
        if extracted_text:
            with st.spinner('Переводим...'):
                translated_text = translate(extracted_text, translation_model, translation_tokenizer)
                st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)
            with st.spinner('Суммаризируем...'):
                summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
                st.text_area("Суммаризация (на русском):", value=summary_text, height=100)
        else:
            st.warning("Текст для перевода не найден.")