Spaces:

sarahai
/

uzbek-russian-summarize

Running

App Files Files Community

sarahai committed on Apr 15, 2024

Commit

ff08925

verified ·

1 Parent(s): 50c79fe

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -5

app.py CHANGED Viewed

@@ -13,8 +13,52 @@ summarization_model_name = 'sarahai/ruT5-base-summarizer'
 summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
 summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
-# Define functions
-# Your existing functions here ...
 # Custom CSS styles
 st.markdown("""
@@ -32,19 +76,20 @@ st.markdown("""
 # Sidebar
 st.sidebar.markdown('## Навигация')
 uploaded_file = st.sidebar.file_uploader("Загрузите изображение с узбекским текстом...", type=["jpg", "jpeg", "png"])
 if uploaded_file:
     image = Image.open(uploaded_file)
     st.sidebar.image(image, caption='Загруженное изображение', use_column_width=True)
     process_btn = st.sidebar.button("Перевести и суммаризировать")
-# Use HTML tags to apply styles
 st.markdown('<h1 class="big-font">Текстовая обработка изображений</h1>', unsafe_allow_html=True)
 st.markdown('<div class="big-font">Перевод с узбекского на русский и суммаризация</div>', unsafe_allow_html=True)
-if process_btn:
     st.write("Процесс извлечения текста...")
-    extracted_text, confidence = extract_text(image, 'tjk')  # Adjust the language code if necessary
     st.write("Извлеченный текст:")
     st.text_area("Результат", extracted_text, height=150)
     st.write(f"Точность распознавания: {confidence*100:.2f}%")

 summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
 summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
+# EasyOCR uses its own short language codes. 'uzb_Cyrl' is the NLLB-style code
+# used for translation in this app and is not an EasyOCR code, so map it to
+# 'tjk', the Cyrillic-script code this app passed to OCR before.
+# NOTE(review): confirm 'tjk' is the best available model for Uzbek Cyrillic.
+_EASYOCR_LANG_ALIASES = {'uzb_Cyrl': 'tjk'}
+def extract_text(image_path, lang='uzb_Cyrl'):
+    """Run EasyOCR on an image and return ``(text, mean_confidence)``.
+    Args:
+        image_path: Despite the name, this is handed to ``np.array`` and so
+            must be an image object convertible to an array (the caller
+            passes the result of ``Image.open``), not a filesystem path.
+        lang: Language code. Codes listed in ``_EASYOCR_LANG_ALIASES`` are
+            translated to their EasyOCR equivalent; any other value is
+            passed to EasyOCR unchanged.
+    Returns:
+        A tuple of the recognised fragments joined by single spaces and the
+        arithmetic mean of their confidences (0 when nothing was recognised).
+    """
+    ocr_lang = _EASYOCR_LANG_ALIASES.get(lang, lang)
+    # NOTE(review): a Reader is built on every call; consider reusing one if
+    # this turns out to be slow.
+    reader = easyocr.Reader([ocr_lang])
+    results = reader.readtext(np.array(image_path))
+    # Each result is a (bbox, text, prob) triple; the bounding box is unused.
+    texts = [text for (_bbox, text, _prob) in results]
+    confidences = [prob for (_bbox, _text, prob) in results]
+    final_confidence = sum(confidences) / len(confidences) if confidences else 0
+    return ' '.join(texts).strip(), final_confidence
+def split_into_chunks(text, tokenizer, max_length=150):
+    """Split ``text`` into strings of at most ``max_length`` tokenizer tokens.
+    The text is tokenized with ``tokenizer.tokenize``, the token list is cut
+    into consecutive runs, and each run is turned back into a string with
+    ``tokenizer.convert_tokens_to_string``. Returns a list of those strings;
+    an input that yields no tokens gives an empty list.
+    """
+    pieces = list(tokenizer.tokenize(text))
+    # A non-positive max_length degenerates to one token per chunk.
+    step = max(max_length, 1)
+    return [
+        tokenizer.convert_tokens_to_string(pieces[start:start + step])
+        for start in range(0, len(pieces), step)
+    ]
+def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
+    """Translate ``text`` chunk by chunk and return the joined translation.
+    Args:
+        text: Source text to translate.
+        model: Seq2seq model exposing ``generate``.
+        tokenizer: Matching tokenizer; its ``src_lang`` and ``tgt_lang``
+            attributes are overwritten by this call.
+        src_lang: Source language code assigned to the tokenizer.
+        tgt_lang: Target language code; its token id is forced as the first
+            generated token.
+    Returns:
+        The translated chunks joined by single spaces ('' for empty input).
+    """
+    tokenizer.src_lang = src_lang
+    tokenizer.tgt_lang = tgt_lang
+    max_input_tokens = 128
+    # Chunks must be smaller than the encoder limit used below. Otherwise
+    # truncation silently drops the tail of every full chunk. The margin
+    # leaves room for the special tokens added on encoding and for small
+    # differences when the chunk string is re-tokenized.
+    chunk_tokens = max_input_tokens - 8
+    # Older tokenizers expose a lang_code_to_id mapping; where it is absent,
+    # fall back to looking the language code up as an ordinary token.
+    lang_ids = getattr(tokenizer, 'lang_code_to_id', None)
+    if lang_ids is not None:
+        forced_bos = lang_ids[tgt_lang]
+    else:
+        forced_bos = tokenizer.convert_tokens_to_ids(tgt_lang)
+    translated_chunks = []
+    for chunk in split_into_chunks(text, tokenizer, max_length=chunk_tokens):
+        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=max_input_tokens)
+        outputs = model.generate(inputs['input_ids'], forced_bos_token_id=forced_bos)
+        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
+    return ' '.join(translated_chunks)
+def summarize(text, model, tokenizer, max_length=250):
+    """Return a summary of ``text`` generated by ``model``.
+    The text is prefixed with "summarize: ", encoded with truncation at 2048
+    tokens, and decoded from the first sequence produced by a 4-beam search
+    capped at ``max_length`` tokens.
+    """
+    prompt = "summarize: " + text
+    encoded = tokenizer.encode(prompt, return_tensors="pt", max_length=2048, truncation=True)
+    generation_options = dict(
+        max_length=max_length,
+        length_penalty=2.0,
+        num_beams=4,
+        early_stopping=True,
+    )
+    generated = model.generate(encoded, **generation_options)
+    return tokenizer.decode(generated[0], skip_special_tokens=True)
 # Custom CSS styles
 st.markdown("""
 # Sidebar
 st.sidebar.markdown('## Навигация')
+# The uploader only accepts JPG/JPEG/PNG images.
 uploaded_file = st.sidebar.file_uploader("Загрузите изображение с узбекским текстом...", type=["jpg", "jpeg", "png"])
+# Default to False so the check further down never reads an unassigned name
+# when no file has been uploaded yet.
+process_btn = False  # Define button state here
 if uploaded_file:
+    # Preview the uploaded image in the sidebar and only then offer the
+    # action button.
     image = Image.open(uploaded_file)
     st.sidebar.image(image, caption='Загруженное изображение', use_column_width=True)
     process_btn = st.sidebar.button("Перевести и суммаризировать")
+# Title and Description
 st.markdown('<h1 class="big-font">Текстовая обработка изображений</h1>', unsafe_allow_html=True)
 st.markdown('<div class="big-font">Перевод с узбекского на русский и суммаризация</div>', unsafe_allow_html=True)
+# Run OCR only when the button was pressed for an uploaded image.
+if process_btn and uploaded_file:
     st.write("Процесс извлечения текста...")
+    # NOTE(review): 'uzb_Cyrl' is the same code translate() uses as its source
+    # language; confirm the OCR backend accepts it as a language code.
+    extracted_text, confidence = extract_text(image, 'uzb_Cyrl')  # Adjust the language code if necessary
     st.write("Извлеченный текст:")
     st.text_area("Результат", extracted_text, height=150)
+    # confidence is a fraction; display it as a percentage.
     st.write(f"Точность распознавания: {confidence*100:.2f}%")