book_summarizer / app.py
Awell00's picture
Update app.py
2f8e19b verified
import gradio as gr
import warnings
import requests
from bs4 import BeautifulSoup
import subprocess
import io
import ebooklib
from ebooklib import epub
from huggingface_hub import InferenceClient
from epubsplit import SplitEpub
import re
import os
import spaces
def install_calibre():
try:
subprocess.run(["apt-get", "update"], check=True)
subprocess.run(["apt-get", "install", "-y", "calibre"], check=True),
subprocess.run(["calibre-customize", "-a", "EpubSplit.zip"], check=True)
print("Calibre installed successfully.")
except subprocess.CalledProcessError as e:
print(f"Error installing calibre: {e}")
install_calibre()
# Suppress specific warnings
warnings.filterwarnings("ignore", message="In the future version we will turn default option ignore_ncx to True.")
warnings.filterwarnings("ignore", message="This search incorrectly ignores the root element, and will be fixed in a future version.")
# Constants
EPUB_PATH = 'book.epub'
OUTPUT_EPUB_PATH = 'output.epub'
OUTPUT_PDF_PATH = 'output.pdf'
LIBRARY_URL = os.getenv("LIBRARY_URL")
COOKIE_CONFIG = {
'remix_userkey': os.getenv("LIBRARY_KEY"),
'remix_userid': '14009766',
'selectedSiteMode': 'books',
'domainsNotWorking': os.getenv("NOT_WORKING")
}
def fetch_library_search_url():
try:
response = requests.get(LIBRARY_URL)
soup = BeautifulSoup(response.content, 'html5lib')
library_div = soup.find('div', attrs={'class': 'plainlist'})
if library_div:
links = library_div.find_all('a', class_='external text')
return next((link.get('href') for link in links if link.get('href').startswith('https')), "")
except Exception as e:
print(f"Error fetching library URL: {e}")
return ""
SEARCH_URL = fetch_library_search_url()
def fetch_book_details(isbn):
if not SEARCH_URL:
print("Search URL not available.")
return
search_endpoint = f"{SEARCH_URL}/s/{isbn}"
try:
response = requests.get(search_endpoint)
soup = BeautifulSoup(response.content, 'html5lib')
bookcards = soup.find_all('z-bookcard')
book_url = next((SEARCH_URL + card.get('href') for card in bookcards if card.get('href')), None)
if not book_url:
print("No book URL found.")
return
download_book(book_url)
except Exception as e:
print(f"Error fetching book details: {e}")
def download_book(book_url):
try:
response = requests.get(book_url, cookies=COOKIE_CONFIG)
soup = BeautifulSoup(response.content, 'html5lib')
download_link = soup.find('a', class_='addDownloadedBook')
if download_link and download_link.has_attr('href'):
download_url = SEARCH_URL + download_link['href']
download_and_convert_epub(download_url)
else:
print("Download link not found or invalid.")
except Exception as e:
print(f"Error downloading book: {e}")
def download_and_convert_epub(download_url):
try:
response = requests.get(download_url, cookies=COOKIE_CONFIG)
if response.status_code == 200:
with open(EPUB_PATH, 'wb') as epub_file:
epub_file.write(response.content)
print("EPUB downloaded successfully.")
else:
print(f"Failed to download EPUB. Status code: {response.status_code}")
except Exception as e:
print(f"Error downloading EPUB: {e}")
def extract_chapter_text(input_epub_path, chapter_indices):
print(f"Extracting chapter text for indices: {chapter_indices}")
try:
with open(input_epub_path, 'rb') as epub_file:
split_epub = SplitEpub(epub_file)
output_io = io.BytesIO()
split_epub.write_split_epub(output_io, chapter_indices)
with open(OUTPUT_EPUB_PATH, 'wb') as output_file:
output_file.write(output_io.getvalue())
return read_text_from_epub(OUTPUT_EPUB_PATH)
except Exception as e:
print(f"Error extracting chapter text: {e}")
return ""
def read_text_from_epub(output_epub_path):
try:
book = epub.read_epub(output_epub_path)
text_content = []
for item in book.get_items():
if item.get_type() == ebooklib.ITEM_DOCUMENT:
soup = BeautifulSoup(item.get_body_content(), 'html.parser')
paragraphs = soup.find_all('p')
text_content.extend(para.get_text() for para in paragraphs)
return '\n'.join(text_content)
except Exception as e:
print(f"Error reading text from EPUB: {e}")
return ""
def generate_table_of_contents():
try:
result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
print(result)
return {int(line_number): title for line_number, title in pattern.findall(result.stdout)}
except Exception as e:
print(f"Error generating table of contents: {e}")
return {}
def summarize_chapter(chapter_index):
if chapter_index < 0:
return "Invalid chapter selection."
result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
chapter_text = ""
for i in range(chapter_index, chapter[chapter.index(chapter_index)+1]):
chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
if chapter_to_summarize and len(chapter_to_summarize) > 50:
chapter_text += generate_summary(chapter_to_summarize)
chapter_text += "\n\n"
if not chapter_text:
chapter_to_summarize = extract_chapter_text(EPUB_PATH, [chapter_index+1])
if chapter_to_summarize and len(chapter_to_summarize) > 50:
chapter_text += generate_summary(chapter_to_summarize)
chapter_text += "\n\n"
return chapter_text if chapter_text else "No content found for the selected chapter."
@spaces.GPU(duration=100)
def generate_summary(text):
try:
client = InferenceClient(api_key=TOKEN)
user_prompt = (
"Provide a clear and concise summary of the chapter, emphasizing key events, themes, and character developments. "
"Do not include introductory or concluding remarks, just focus on the main points."
f"\n\nChapter Text:\n{text}"
)
system_message = {
"role": "system",
"content": (
"You are an expert at summarizing book chapters. Your task is to condense the chapter into a focused, "
"informative summary that highlights the most important events, themes, and character developments. "
"Avoid filler and ensure the summary is succinct yet comprehensive."
)
}
messages = [
system_message,
{"role": "user", "content": user_prompt}
]
stream = client.chat.completions.create(
model=MODEL,
messages=messages,
temperature=0.5,
max_tokens=2048,
top_p=0.7,
stream=True
)
out = ""
for chunk in stream:
if chunk.choices and len(chunk.choices) > 0:
new_content = chunk.choices[0].delta.content
out += new_content
yield out
return out
except Exception as e:
print(f"Error generating summary: {e}")
return "Error generating summary."
# Model Initialization
MODEL = "meta-llama/Llama-3.3-70B-Instruct"
TOKEN = os.getenv("TOKEN")
# Gradio App
with gr.Blocks() as app:
isbn_input = gr.Textbox(label="Enter ISBN")
chapter_dropdown = gr.Dropdown(label="Select Chapter", choices=[])
summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)
def update_chapter_dropdown(isbn):
fetch_book_details(isbn)
chapters = generate_table_of_contents()
print(chapters)
return gr.update(choices=[(title.strip('\''), line_number) for line_number, title in chapters.items()])
def stream_summarize_chapter(chapter_index):
if chapter_index < 0:
yield "Invalid chapter selection."
return
result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
if not chapter:
yield "No content found for the selected chapter."
return
for i in range(chapter_index, chapter[chapter.index(chapter_index) + 1]):
chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
if chapter_to_summarize and len(chapter_to_summarize) > 50:
for text_chunk in generate_summary(chapter_to_summarize):
yield text_chunk
else:
yield "No significant content found for this chapter."
isbn_input.change(update_chapter_dropdown, inputs=[isbn_input], outputs=[chapter_dropdown])
chapter_dropdown.change(
stream_summarize_chapter, inputs=[chapter_dropdown], outputs=[summary_output]
)
app.launch()