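"""Streamlit app: extract Uzbek (Cyrillic) text from an uploaded image with EasyOCR,
translate it to Russian with an NLLB model, and summarize the translation with a ruT5 model."""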
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration, NllbTokenizer, T5Tokenizer
import easyocr
from PIL import Image
import numpy as np
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)
summarization_model_name = 'sarahai/ruT5-base-summarizer'
summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
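# NOTE: the models above are loaded at import time, so every Streamlit rerun reloads them;
# wrapping the loads in a function decorated with st.cache_resource would cache them across reruns.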
def extract_text(image, lang='tjk'):
    # 'uzb_Cyrl' is an NLLB code, not a valid EasyOCR language; 'tjk' (Tajik, Cyrillic script)
    # is used here as a Cyrillic-capable substitute, matching the call site below.
    reader = easyocr.Reader([lang])
    results = reader.readtext(np.array(image))
    all_text = ''
    confidences = []
    for (bbox, text, prob) in results:
        all_text += ' ' + text
        confidences.append(prob)
    # Average confidence over all detected text boxes (0 if nothing was found).
    final_confidence = sum(confidences) / len(confidences) if confidences else 0
    return all_text.strip(), final_confidence
def split_into_chunks(text, tokenizer, max_length=150):
    tokens = tokenizer.tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for token in tokens:
        current_chunk.append(token)
        current_length += 1
        if current_length >= max_length:
            chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
    return chunks
def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    # Chunk to at most 126 tokens so each chunk, plus the special tokens the tokenizer adds,
    # fits within the 128-token encoder limit used below instead of being silently truncated.
    chunks = split_into_chunks(text, tokenizer, max_length=126)
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=128)
        # The target language is enforced by forcing its language token as the first generated token.
        # (Recent transformers releases drop lang_code_to_id; tokenizer.convert_tokens_to_ids(tgt_lang)
        # is the equivalent there.)
        outputs = model.generate(inputs['input_ids'], forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)
def summarize(text, model, tokenizer, max_length=250):
    # Prepend the T5-style task prefix and cap the input length before generating.
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=2048, truncation=True)
    summary_ids = model.generate(input_ids, max_length=max_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
# Streamlit UI setup (labels are in Russian; English glosses in the comments)
st.title('Текстовая обработка изображений, перевод с узбекского на русский и суммаризация')  # Image text extraction, Uzbek-to-Russian translation and summarization
uploaded_file = st.file_uploader("Загрузите изображение с узбекским текстом...", type=["jpg", "jpeg", "png"])  # Upload an image with Uzbek text

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image, caption='Загруженное изображение', use_column_width=True)  # Uploaded image
    st.write("Процесс извлечения текста...")  # Extracting text...
    extracted_text, confidence = extract_text(image, 'tjk')  # Adjust the language code if necessary
    st.write("Извлеченный текст:")  # Extracted text:
    st.text_area("Результат", extracted_text, height=150)  # Result
    st.write(f"Точность распознавания: {confidence*100:.2f}%")  # Recognition confidence
    if st.button("Перевести и суммаризировать"):  # Translate and summarize
        if extracted_text:
            with st.spinner('Переводим...'):  # Translating...
                translated_text = translate(extracted_text, translation_model, translation_tokenizer)
                st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)  # Translated text (Russian)
            with st.spinner('Суммаризируем...'):  # Summarizing...
                summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
                st.text_area("Суммаризация (на русском):", value=summary_text, height=100)  # Summary (Russian)
        else:
            st.warning("Текст для перевода не найден.")  # No text found to translate.