Spaces:
Sleeping
Sleeping
| from PyPDF2 import PdfReader | |
| from transformers import pipeline, MarianMTModel, MarianTokenizer | |
| import torch | |
| import gradio as gr | |
| from gtts import gTTS | |
# Pick the compute device: GPU when CUDA is available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Summarization pipeline. The pipeline receives a device index rather than
# a torch.device: 0 when running on CUDA, -1 when running on the CPU.
summarizer = pipeline(
    "summarization",
    model="philschmid/bart-large-cnn-samsum",
    device=0 if device.type == "cuda" else -1,
)

# English -> Russian translation model and its tokenizer.
translation_model_name = "Helsinki-NLP/opus-mt-en-ru"
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name)
translation_model = translation_model.to(device)
def parse_pdf(pdf_file):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file: Path or file object accepted by ``PdfReader``.

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` returns nothing are skipped, so a PDF with no
        extractable text yields an empty string.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() or "" for page in reader.pages)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; join() also avoids repeated string
    # concatenation in a loop.
    return "\n".join(page_text for page_text in page_texts if page_text)
def summarize(text, max_length=130, min_length=30, max_chunk_size=1024):
    """Summarize text chunk by chunk and join the partial summaries.

    The text is split into chunks of at most ``max_chunk_size`` characters.
    Chunks are cut at whitespace so words are not split in half (only a
    single word longer than the limit is broken).

    Args:
        text: Text to summarize.
        max_length: ``max_length`` passed to the summarizer for each chunk.
        min_length: ``min_length`` passed to the summarizer for each chunk.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        The chunk summaries joined with single spaces, or an empty string
        when ``text`` is empty or contains only whitespace (the model is not
        invoked in that case).
    """
    import textwrap  # local import keeps this function self-contained

    if not text:
        return ""

    # textwrap.wrap breaks on whitespace, replaces newlines/tabs with spaces
    # and returns [] for whitespace-only input, so blank chunks never reach
    # the model.
    chunks = textwrap.wrap(text, width=max_chunk_size)

    summaries = []
    for chunk in chunks:
        result = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
        summaries.append(result[0]["summary_text"])
    return " ".join(summaries)
def translate(text, batch_size=8):
    """Translate text into Russian with the module-level translation model.

    The text is split into sentences that are translated in batches. Passing
    the whole text as a single sequence with ``truncation=True`` would
    silently drop everything beyond the tokenizer's maximum input length.

    Args:
        text: Text to translate.
        batch_size: Number of sentences sent to the model per call.

    Returns:
        The translated sentences joined with single spaces, or an empty
        string when ``text`` is empty or contains only whitespace.
    """
    import re  # local import keeps this function self-contained

    if not text or not text.strip():
        return ""

    # Split after sentence-ending punctuation followed by whitespace.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]

    translated = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # truncation=True now only guards a single over-long sentence.
        inputs = translation_tokenizer(
            batch, return_tensors="pt", truncation=True, padding=True
        ).to(device)
        translated_tokens = translation_model.generate(**inputs)
        translated.extend(
            translation_tokenizer.batch_decode(
                translated_tokens, skip_special_tokens=True
            )
        )
    return " ".join(translated)
def text_to_speech(text, language='ru'):
    """Convert text to speech with gTTS and save it as an MP3 file.

    Each call writes to its own temporary file. A fixed file name would be
    shared by every request, so concurrent users could overwrite or receive
    each other's audio.

    Args:
        text: Text to be spoken.
        language: Language code passed to gTTS as ``lang``.

    Returns:
        Path of the generated MP3 file.
    """
    import os
    import tempfile

    # Build the gTTS object first so invalid arguments fail before a file
    # is created.
    tts = gTTS(text=text, lang=language)

    # Reserve a unique path; the handle is closed so gTTS can reopen it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_file = tmp.name

    try:
        tts.save(audio_file)
    except Exception:
        # Do not leave an empty placeholder behind when saving fails.
        os.unlink(audio_file)
        raise
    return audio_file
def process_pdf(pdf_file, language):
    """Run the full pipeline: extract, summarize, optionally translate, speak.

    Args:
        pdf_file: The uploaded PDF, passed on to ``parse_pdf``.
        language: ``"rus"`` to translate the summary into Russian; any other
            value keeps the English summary.

    Returns:
        A ``(text, audio_path)`` pair. When there is no input, or no text
        could be extracted from the PDF, ``text`` is a message and
        ``audio_path`` is ``None``, so the function always returns one value
        per wired output component.
    """
    if not pdf_file:
        # Must return two values: the click handler has two outputs.
        return "No input provided.", None

    # Extract the text from the PDF.
    extracted_text = parse_pdf(pdf_file)
    print(f"Extracted Text: {extracted_text}")

    if not extracted_text.strip():
        # Nothing to summarize (e.g. no text layer); speech synthesis would
        # fail on empty text, so stop here.
        return "No extractable text found in the PDF.", None

    # Summarize the text.
    summary = summarize(extracted_text)
    print(f"Summary: {summary}")

    # Translate only when Russian output was requested.
    if language == "rus":
        translated_text = translate(summary)
        print(f"Translated Text: {translated_text}")
        final_text = translated_text
        audio_language = 'ru'
    else:
        final_text = summary
        audio_language = 'en'

    # Convert the final text to audio.
    audio_file = text_to_speech(final_text, language=audio_language)
    return final_text, audio_file
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer, Translator, and Text-to-Speech")
    gr.Markdown("Upload a PDF file to summarize, translate (if needed), and convert to audio.")

    # Inputs: output language selector and the PDF upload.
    language = gr.Radio(label="Output Language", choices=["eng", "rus"], value="rus")
    pdf_input = gr.File(type="filepath", label="Upload PDF File")

    # Outputs: processed text and the generated audio file.
    text_output = gr.Textbox(lines=10, label="Processed Text")
    audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # Wire the button to the processing pipeline.
    process_button = gr.Button("Process PDF")
    process_button.click(
        fn=process_pdf,
        inputs=[pdf_input, language],
        outputs=[text_output, audio_output],
    )

# Start the application.
demo.launch(debug=True)