Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,7 @@ import easyocr
|
|
4 |
from PIL import Image
|
5 |
import numpy as np
|
6 |
|
7 |
-
|
8 |
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
|
9 |
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
|
10 |
translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)
|
@@ -13,75 +13,48 @@ summarization_model_name = 'sarahai/ruT5-base-summarizer'
|
|
13 |
summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
|
14 |
summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
current_chunk = []
|
41 |
-
current_length = 0
|
42 |
-
if current_chunk:
|
43 |
-
chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
|
44 |
-
return chunks
|
45 |
-
|
46 |
-
def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
|
47 |
-
tokenizer.src_lang = src_lang
|
48 |
-
tokenizer.tgt_lang = tgt_lang
|
49 |
-
chunks = split_into_chunks(text, tokenizer)
|
50 |
-
translated_chunks = []
|
51 |
-
for chunk in chunks:
|
52 |
-
inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=128)
|
53 |
-
outputs = model.generate(inputs['input_ids'], forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
|
54 |
-
translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
55 |
-
return ' '.join(translated_chunks)
|
56 |
-
|
57 |
-
def summarize(text, model, tokenizer, max_length=250):
|
58 |
-
input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=2048, truncation=True)
|
59 |
-
summary_ids = model.generate(input_ids, max_length=max_length, length_penalty=2.0, num_beams=4, early_stopping=True)
|
60 |
-
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
61 |
-
return summary
|
62 |
|
63 |
-
|
64 |
-
st.
|
65 |
|
66 |
-
|
67 |
-
if uploaded_file is not None:
|
68 |
-
image = Image.open(uploaded_file)
|
69 |
-
st.image(image, caption='Загруженное изображение', use_column_width=True)
|
70 |
st.write("Процесс извлечения текста...")
|
71 |
extracted_text, confidence = extract_text(image, 'tjk') # Adjust the language code if necessary
|
72 |
st.write("Извлеченный текст:")
|
73 |
st.text_area("Результат", extracted_text, height=150)
|
74 |
st.write(f"Точность распознавания: {confidence*100:.2f}%")
|
75 |
|
76 |
-
if
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)
|
81 |
-
|
82 |
-
with st.spinner('Суммаризируем...'):
|
83 |
-
summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
|
84 |
-
st.text_area("Суммаризация (на русском):", value=summary_text, height=100)
|
85 |
-
else:
|
86 |
-
st.warning("Текст для перевода не найден.")
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from PIL import Image
|
5 |
import numpy as np
|
6 |
|
7 |
+
# Load models and tokenizers
|
8 |
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
|
9 |
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
|
10 |
translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)
|
|
|
13 |
summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
|
14 |
summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
|
15 |
|
16 |
+
# Define functions
|
17 |
+
# Your existing functions here ...
|
18 |
+
|
19 |
+
# Custom CSS styles
|
20 |
+
st.markdown("""
|
21 |
+
<style>
|
22 |
+
.big-font {
|
23 |
+
font-size:30px !important;
|
24 |
+
font-weight: bold;
|
25 |
+
}
|
26 |
+
.small-font {
|
27 |
+
font-size:18px !important;
|
28 |
+
}
|
29 |
+
</style>
|
30 |
+
""", unsafe_allow_html=True)
|
31 |
+
|
32 |
+
# Sidebar
|
33 |
+
st.sidebar.markdown('## Навигация')
|
34 |
+
uploaded_file = st.sidebar.file_uploader("Загрузите изображение с узбекским текстом...", type=["jpg", "jpeg", "png"])
|
35 |
+
|
36 |
+
if uploaded_file:
|
37 |
+
image = Image.open(uploaded_file)
|
38 |
+
st.sidebar.image(image, caption='Загруженное изображение', use_column_width=True)
|
39 |
+
process_btn = st.sidebar.button("Перевести и суммаризировать")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
+
# st.title() accepts no className parameter (passing one raises TypeError), so the kwarg is dropped.
st.title('Текстовая обработка изображений')
|
42 |
+
# st.markdown() accepts no className parameter; to apply the .big-font CSS class
# the text is wrapped in an HTML element and raw HTML rendering is enabled.
st.markdown('<p class="big-font">Перевод с узбекского на русский и суммаризация</p>', unsafe_allow_html=True)
|
43 |
|
44 |
+
if process_btn:
|
|
|
|
|
|
|
45 |
st.write("Процесс извлечения текста...")
|
46 |
extracted_text, confidence = extract_text(image, 'tjk') # Adjust the language code if necessary
|
47 |
st.write("Извлеченный текст:")
|
48 |
st.text_area("Результат", extracted_text, height=150)
|
49 |
st.write(f"Точность распознавания: {confidence*100:.2f}%")
|
50 |
|
51 |
+
if extracted_text:
|
52 |
+
with st.spinner('Переводим...'):
|
53 |
+
translated_text = translate(extracted_text, translation_model, translation_tokenizer)
|
54 |
+
st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
+
with st.spinner('Суммаризируем...'):
|
57 |
+
summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
|
58 |
+
st.text_area("Суммаризация (на русском):", value=summary_text, height=100)
|
59 |
+
else:
|
60 |
+
st.error("Текст для перевода не найден.")
|