import gradio as gr
import warnings
import requests
from bs4 import BeautifulSoup
import subprocess
import io
import ebooklib
from ebooklib import epub
from huggingface_hub import InferenceClient
from epubsplit import SplitEpub
import re
import os
import spaces

def install_calibre():
    try:
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "calibre"], check=True)
        subprocess.run(["calibre-customize", "-a", "EpubSplit.zip"], check=True)
        print("Calibre installed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error installing calibre: {e}")

install_calibre()
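# Assumption: EpubSplit.zip (the calibre EpubSplit plugin archive) is bundled alongside this
# script in the Space repository, so calibre-customize can register it and the
# `calibre-debug --run-plugin EpubSplit` calls below can find the plugin.
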
# Suppress specific warnings
warnings.filterwarnings("ignore", message="In the future version we will turn default option ignore_ncx to True.")
warnings.filterwarnings("ignore", message="This search incorrectly ignores the root element, and will be fixed in a future version.")

# Constants
EPUB_PATH = 'book.epub'
OUTPUT_EPUB_PATH = 'output.epub'
OUTPUT_PDF_PATH = 'output.pdf'
LIBRARY_URL = os.getenv("LIBRARY_URL")
COOKIE_CONFIG = {
    'remix_userkey': os.getenv("LIBRARY_KEY"),
    'remix_userid': '14009766',
    'selectedSiteMode': 'books',
    'domainsNotWorking': os.getenv("NOT_WORKING")
}
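# The Space expects these secrets/environment variables to be configured:
# LIBRARY_URL, LIBRARY_KEY, NOT_WORKING, and TOKEN (the Hugging Face inference token used below).
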
def fetch_library_search_url():
    try:
        response = requests.get(LIBRARY_URL)
        soup = BeautifulSoup(response.content, 'html5lib')
        library_div = soup.find('div', attrs={'class': 'plainlist'})
        if library_div:
            links = library_div.find_all('a', class_='external text')
            return next((link.get('href') for link in links if link.get('href', '').startswith('https')), "")
    except Exception as e:
        print(f"Error fetching library URL: {e}")
    return ""

SEARCH_URL = fetch_library_search_url()

def fetch_book_details(isbn):
    if not SEARCH_URL:
        print("Search URL not available.")
        return
    search_endpoint = f"{SEARCH_URL}/s/{isbn}"
    try:
        response = requests.get(search_endpoint)
        soup = BeautifulSoup(response.content, 'html5lib')
        bookcards = soup.find_all('z-bookcard')
        book_url = next((SEARCH_URL + card.get('href') for card in bookcards if card.get('href')), None)
        if not book_url:
            print("No book URL found.")
            return
        download_book(book_url)
    except Exception as e:
        print(f"Error fetching book details: {e}")

def download_book(book_url):
    try:
        response = requests.get(book_url, cookies=COOKIE_CONFIG)
        soup = BeautifulSoup(response.content, 'html5lib')
        download_link = soup.find('a', class_='addDownloadedBook')
        if download_link and download_link.has_attr('href'):
            download_url = SEARCH_URL + download_link['href']
            download_and_convert_epub(download_url)
        else:
            print("Download link not found or invalid.")
    except Exception as e:
        print(f"Error downloading book: {e}")

def download_and_convert_epub(download_url):
    try:
        response = requests.get(download_url, cookies=COOKIE_CONFIG)
        if response.status_code == 200:
            with open(EPUB_PATH, 'wb') as epub_file:
                epub_file.write(response.content)
            print("EPUB downloaded successfully.")
        else:
            print(f"Failed to download EPUB. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading EPUB: {e}")
def extract_chapter_text(input_epub_path, chapter_indices):
    print(f"Extracting chapter text for indices: {chapter_indices}")
    try:
        with open(input_epub_path, 'rb') as epub_file:
            split_epub = SplitEpub(epub_file)
            output_io = io.BytesIO()
            split_epub.write_split_epub(output_io, chapter_indices)
        with open(OUTPUT_EPUB_PATH, 'wb') as output_file:
            output_file.write(output_io.getvalue())
        return read_text_from_epub(OUTPUT_EPUB_PATH)
    except Exception as e:
        print(f"Error extracting chapter text: {e}")
        return ""
def read_text_from_epub(output_epub_path):
    try:
        book = epub.read_epub(output_epub_path)
        text_content = []
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                soup = BeautifulSoup(item.get_body_content(), 'html.parser')
                paragraphs = soup.find_all('p')
                text_content.extend(para.get_text() for para in paragraphs)
        return '\n'.join(text_content)
    except Exception as e:
        print(f"Error reading text from EPUB: {e}")
        return ""

def generate_table_of_contents():
    try:
        result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
        print(result)
        return {int(line_number): title for line_number, title in pattern.findall(result.stdout)}
    except Exception as e:
        print(f"Error generating table of contents: {e}")
        return {}
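# The regex above assumes EpubSplit prints entries roughly of the form:
#   Line Number: 12
#       ...tab-indented metadata...
#       toc: ['Chapter One']
# Only entries that expose a toc title are kept, keyed by their "Line Number" value.
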
def summarize_chapter(chapter_index):
    """Non-streaming variant: return the full summary for the selected chapter."""
    if chapter_index < 0:
        return "Invalid chapter selection."
    result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
    pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
    chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
    chapter_text = ""
    # Summarize every split section from this chapter up to (but not including) the next one.
    position = chapter.index(chapter_index)
    end = chapter[position + 1] if position + 1 < len(chapter) else chapter_index + 1
    for i in range(chapter_index, end):
        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
        if chapter_to_summarize and len(chapter_to_summarize) > 50:
            # generate_summary is a generator yielding cumulative text; keep its final value.
            summary = ""
            for partial in generate_summary(chapter_to_summarize):
                summary = partial
            chapter_text += summary + "\n\n"
    if not chapter_text:
        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [chapter_index + 1])
        if chapter_to_summarize and len(chapter_to_summarize) > 50:
            summary = ""
            for partial in generate_summary(chapter_to_summarize):
                summary = partial
            chapter_text += summary + "\n\n"
    return chapter_text if chapter_text else "No content found for the selected chapter."

def generate_summary(text):
    """Stream a chapter summary from the inference API, yielding the cumulative text so far."""
    try:
        client = InferenceClient(api_key=TOKEN)
        user_prompt = (
            "Provide a clear and concise summary of the chapter, emphasizing key events, themes, and character developments. "
            "Do not include introductory or concluding remarks, just focus on the main points."
            f"\n\nChapter Text:\n{text}"
        )
        system_message = {
            "role": "system",
            "content": (
                "You are an expert at summarizing book chapters. Your task is to condense the chapter into a focused, "
                "informative summary that highlights the most important events, themes, and character developments. "
                "Avoid filler and ensure the summary is succinct yet comprehensive."
            )
        }
        messages = [
            system_message,
            {"role": "user", "content": user_prompt}
        ]
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.5,
            max_tokens=2048,
            top_p=0.7,
            stream=True
        )
        out = ""
        for chunk in stream:
            if chunk.choices and len(chunk.choices) > 0:
                new_content = chunk.choices[0].delta.content
                if new_content:  # delta.content can be None on some chunks
                    out += new_content
                    yield out
    except Exception as e:
        print(f"Error generating summary: {e}")
        yield "Error generating summary."
# Model Initialization
MODEL = "meta-llama/Llama-3.3-70B-Instruct"
TOKEN = os.getenv("TOKEN")

# Gradio App
with gr.Blocks() as app:
    isbn_input = gr.Textbox(label="Enter ISBN")
    chapter_dropdown = gr.Dropdown(label="Select Chapter", choices=[])
    summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)

    def update_chapter_dropdown(isbn):
        fetch_book_details(isbn)
        chapters = generate_table_of_contents()
        print(chapters)
        return gr.update(choices=[(title.strip('\''), line_number) for line_number, title in chapters.items()])

    def stream_summarize_chapter(chapter_index):
        if chapter_index is None or chapter_index < 0:
            yield "Invalid chapter selection."
            return
        result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
        chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
        if not chapter:
            yield "No content found for the selected chapter."
            return
        position = chapter.index(chapter_index)
        end = chapter[position + 1] if position + 1 < len(chapter) else chapter_index + 1
        accumulated = ""
        for i in range(chapter_index, end):
            chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
            if chapter_to_summarize and len(chapter_to_summarize) > 50:
                # Stream the cumulative summary, prefixed with summaries of earlier sections.
                latest = ""
                for text_chunk in generate_summary(chapter_to_summarize):
                    latest = text_chunk
                    yield accumulated + latest
                accumulated += latest + "\n\n"
        if not accumulated:
            yield "No significant content found for this chapter."

    isbn_input.change(update_chapter_dropdown, inputs=[isbn_input], outputs=[chapter_dropdown])
    chapter_dropdown.change(
        stream_summarize_chapter, inputs=[chapter_dropdown], outputs=[summary_output]
    )

app.launch()