import gradio as gr
import json
import time
import requests
from groq import Groq
import google.generativeai as genai
from datetime import datetime, timedelta
import pytz
from playwright.async_api import async_playwright
import asyncio
import random
from fake_useragent import UserAgent
from urllib.parse import urlparse, urljoin
from tenacity import retry, stop_after_attempt, wait_exponential
import os
import subprocess

# Install Playwright and browsers
# subprocess.run(["playwright", "install"], check=True)
# subprocess.run(["playwright", "install-deps"], check=True)
# os.system("apt-get update")

# Install the system libraries required by Playwright's Chromium
os.system(
    "apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 "
    "libxcomposite1 libxdamage1 libatspi2.0-0 libxrandr2 libgbm1 libpango-1.0-0 "
    "libasound2 libxshmfence1 libwayland-server0 libwayland-client0 "
    "libgdk-pixbuf2.0-0"
)

# Install Playwright browsers
# subprocess.run(["sudo", "playwright", "install"], check=True)
# Install Playwright dependencies
# subprocess.run(["sudo", "playwright", "install-deps"], check=True)
# print("Playwright and its dependencies have been installed successfully!")

# Constants
CHAPTERS_FILE = 'scraped_chapters.json'
SPLIT_CHAPTERS_FILE = 'split_scraped_chapters.json'
TRANSLATIONS_FILE = 'chapter_translated.json'
GLOSSARY_FILE = 'chapter_glossary.json'

ua = UserAgent()


# Function to scrape chapters from xbanxia
async def scrape_xbanxia(first_chapter_url, final_url=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            page = await fetch_page(page, first_chapter_url)
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0
            while next_url and (not final_url or next_url != final_url):
                try:
                    current_url = next_url
                    # Rotate the user agent every 5 chapters
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})
                    page = await fetch_page(page, current_url)

                    # Wait for content to load
                    await page.wait_for_selector('#nr_title', state='visible', timeout=60000)
                    await page.wait_for_selector('#nr1', state='visible', timeout=60000)

                    # Extract title
                    title_element = await page.query_selector('#nr_title')
                    title = await title_element.inner_text() if title_element else None

                    # Extract content
                    content_element = await page.query_selector('#nr1')
                    content = await content_element.inner_text() if content_element else None

                    # Extract next URL
                    next_link = await page.query_selector('.nav2 .next a')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if next_url and not next_url.startswith('http'):
                        base_url = '/'.join(first_chapter_url.split('/')[:3])
                        next_url = base_url + next_url

                    if title and content:
                        # Clean up the content
                        content_lines = content.split('\n')
                        clean_content = '\n'.join(
                            line.strip() for line in content_lines
                            if line.strip() and not line.strip().startswith('第')
                        )
                        chapters.append({
                            'title': title.strip(),
                            'content': clean_content,
                            'url': current_url  # URL of the chapter that was just scraped
                        })
                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1

                    # Random delay between requests
                    await asyncio.sleep(random.uniform(2, 5))
                except Exception as e:
                    print(f"Error scraping chapter at {next_url}: {str(e)}")
                    await asyncio.sleep(60)  # Wait for 1 minute before retrying

            await browser.close()
            return chapters
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            await browser.close()
            return None


# Function to scrape chapters from 69shuba.cx
async def scrape_69shu(first_chapter_url, final_url=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            # Navigate to the first chapter
            page = await fetch_page(page, first_chapter_url)
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0
            while next_url and (not final_url or next_url != final_url):
                try:
                    current_url = next_url
                    # Change the user agent every 5 chapters
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})
                    page = await fetch_page(page, current_url)
                    await page.wait_for_selector('.txtnav', state='visible', timeout=60000)

                    # Extract title
                    title_element = await page.query_selector('.txtnav h1')
                    title = await title_element.inner_text() if title_element else None
                    if not title:
                        title_element = await page.query_selector('.txtnav')
                        if title_element:
                            title_text = await title_element.inner_text()
                            title = title_text.split('\n')[0].strip()

                    # Extract content
                    content_element = await page.query_selector('.txtnav')
                    content = await content_element.inner_text() if content_element else None

                    # Extract next URL
                    next_link = await page.query_selector('.page1 a:nth-child(4)')
                    next_url = await next_link.get_attribute('href') if next_link else None

                    if title and content:
                        # Clean up the content
                        content_lines = content.split('\n')
                        clean_content = '\n'.join(
                            line.strip() for line in content_lines
                            if line.strip() and not line.strip().startswith('Chapter')
                        )
                        chapters.append({
                            'title': title,
                            'content': clean_content,
                            'url': current_url  # URL of the chapter that was just scraped
                        })
                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1

                    # Add a random delay between requests
                    await asyncio.sleep(random.uniform(2, 5))
                except Exception as e:
                    print(f"Error scraping chapter at {next_url}: {str(e)}")
                    await asyncio.sleep(60)  # Wait for 1 minute before trying the next chapter

            await browser.close()
            return chapters
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            await browser.close()
            return None


# Function to fetch a page, retrying up to 3 times with exponential backoff
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_page(page, url):
    await page.goto(url)
    await page.wait_for_load_state('networkidle')
    return page


# Function to scrape chapters based on the domain
async def scrape_task(first_chapter_url, final_url=None):
    """Scrape chapters and save them to a JSON file."""
    domain = urlparse(first_chapter_url).netloc
    if 'xbanxia' in domain:
        result = await scrape_xbanxia(first_chapter_url, final_url)
    elif '69shuba.cx' in domain:
        result = await scrape_69shu(first_chapter_url, final_url)
    else:
        print(f"Unsupported domain: {domain}")
        return

    if result:
        with open(CHAPTERS_FILE, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f'Scraping completed. Data saved to {CHAPTERS_FILE}')
    else:
        print('Scraping failed or was interrupted.')


# Function to split long chapters
def split_long_chapter(title, content, max_length=2000):
    """
    Split a long chapter into multiple parts while preserving paragraph and sentence integrity.
    Splits occur at a newline or at a sentence-ending symbol (。).
""" # Count only Chinese characters for length check chinese_char_count = sum(1 for char in content if '\u4e00' <= char <= '\u9fff') if chinese_char_count <= max_length: return [{"title": title, "content": content}] parts = [] current_part = [] current_chinese_count = 0 part_number = 1 # First split by paragraphs (newlines) paragraphs = content.split('\n') for paragraph in paragraphs: if not paragraph.strip(): continue # Split paragraph into sentences sentences = paragraph.split('。') sentences = [s.strip() + '。' for s in sentences if s.strip()] for sentence in sentences: sentence_chinese_count = sum(1 for char in sentence if '\u4e00' <= char <= '\u9fff') # If adding this sentence would exceed the limit if current_chinese_count + sentence_chinese_count > max_length and current_part: # Save current part part_content = '\n'.join(current_part) parts.append({ "title": f"{title} Part {part_number}", "content": part_content }) # Start new part current_part = [sentence] current_chinese_count = sentence_chinese_count part_number += 1 else: current_part.append(sentence) current_chinese_count += sentence_chinese_count # Save the last part if there's anything remaining if current_part: part_content = '\n'.join(current_part) parts.append({ "title": f"{title} Part {part_number}", "content": part_content }) return parts # Function to process chapters def process_chapters(input_file, output_file, max_length=5000): """ Process chapters from an input JSON file, splitting long chapters if necessary, and save the result to an output JSON file. """ try: # Check if the input file exists if not os.path.exists(input_file): raise FileNotFoundError(f"Input file '{input_file}' not found. Please ensure the scraping process runs first.") with open(input_file, 'r', encoding='utf-8') as f: chapters = json.load(f) processed_chapters = [] for chapter in chapters: title = chapter['title'] content = chapter['content'] split_chapters = split_long_chapter(title, content, max_length) processed_chapters.extend(split_chapters) with open(output_file, 'w', encoding='utf-8') as f: json.dump(processed_chapters, f, ensure_ascii=False, indent=2) return len(processed_chapters) except Exception as e: print(f"Error processing chapters: {str(e)}") raise def create_glossary(gemini_api_key, groq_api_key=None): """Create a glossary from random chapters using Groq or Gemini API.""" with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f: book_data = json.load(f) # book_data is a list of chapters # Select 20 random chapters from the first 100 chapters random_chapters = random.sample(book_data, min(2, len(book_data))) preliminary_glossary = [] for i, chapter in enumerate(random_chapters): max_retries = 3 retry_count = 0 while retry_count < max_retries: try: prompt = f"""Analyze the following Chinese web novel chapter and create a glossary of 5 important terms or names. Each entry should include the Chinese term and its English equivalent or explanation. Translate character names, locations names, unique concepts, cultivation levels, power levels, power techniques, or culturally specific terms to English. The target audience are people from USA that don't know much about Chinese language and culture. Very important Note: Only Use Pinyin for Character's Name. 
Chinese chapter: {chapter['content']}

Create a glossary of 5 terms in the following format:
Chinese Term: English Equivalent

for example:
朱士久 : Zhu Shijiu
"""
                if groq_api_key:
                    # Use the Groq API if a key is provided
                    client = Groq(api_key=groq_api_key)
                    chat_completion = client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    chapter_glossary = chat_completion.choices[0].message.content
                else:
                    # Fall back to the Gemini API if no Groq key is provided
                    genai.configure(api_key=gemini_api_key)
                    gemini_model = genai.GenerativeModel(
                        model_name="gemini-1.5-flash",
                        generation_config={
                            "temperature": 1,
                            "top_p": 0.95,
                            "top_k": 64,
                            "max_output_tokens": 8192,
                        }
                    )
                    gemini_response = gemini_model.generate_content(prompt)
                    chapter_glossary = gemini_response.text

                preliminary_glossary.extend(chapter_glossary.split('\n'))
                print(f"Created glossary entries for chapter: {chapter['title']}")
                break
            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    print(f"Error processing chapter {chapter['title']}: {str(e)}")
                    print(f"Retrying in 60 seconds... (Attempt {retry_count + 1} of {max_retries})")
                    time.sleep(60)
                else:
                    print(f"Failed to process chapter {chapter['title']} after {max_retries} attempts: {str(e)}")

        time.sleep(5)  # Pause between chapters

    # Refine the glossary
    refine_prompt = """Refine the following glossary for a Chinese web novel. Remove duplicates, redundant entries, and irrelevant words. Ensure consistency in naming and explanations. Provide the output in JSON format.

Preliminary Glossary:
{}

Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.

Provide the refined glossary in the following format:
Chinese Characters: English Equivalent (No Explanations)

for example:
朱士久 : Zhu Shijiu
白家 : Bai Family
成长系统 : Growth System (not "Chengzhang Xitong")
""".format('\n'.join(preliminary_glossary))

    try:
        if groq_api_key:
            # Use the Groq API for refinement if a key is provided
            client = Groq(api_key=groq_api_key)
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": refine_prompt}],
                model="llama3-70b-8192",
                timeout=60
            )
            refined_glossary = chat_completion.choices[0].message.content
        else:
            # Fall back to the Gemini API for refinement (reuses the model configured in the loop above)
            gemini_response = gemini_model.generate_content(refine_prompt)
            refined_glossary = gemini_response.text

        # Save the refined glossary
        with open(GLOSSARY_FILE, 'w', encoding='utf-8') as f:
            json.dump(refined_glossary.split('\n'), f, ensure_ascii=False, indent=2)
        print(f'Glossary creation completed. Glossary saved to {GLOSSARY_FILE}')
    except Exception as e:
        print(f"Error refining glossary: {str(e)}")
        raise


# Function to translate chapters
def translate_task(gemini_api_key, groq_api_key):
    # Configure Gemini
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )

    # Load data and configuration
    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)
    with open(GLOSSARY_FILE, 'r', encoding='utf-8') as f:
        glossary = json.load(f)
    formatted_glossary = "\n".join(glossary)

    # Configure Groq
    groq_client = Groq(api_key=groq_api_key) if groq_api_key else None

    translations = []
    for i, chapter in enumerate(book_data):
        prompt = f"""Translate the following Chinese web novel chapter to English.
Maintain the original tone and style of the novel. Preserve any cultural references or idioms, providing brief explanations in parentheses if necessary. If paragraphs are stuck together, split them.

Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers. You should translate every Chinese character to English. The chapter should be fully translated.

Glossary:
{formatted_glossary}

Chinese chapter:
{chapter['content']}

Note: No introductory sentences nor concluding sentences. Just directly provide the translation.

Translate the above text to English, using the glossary for consistent translations of key terms:"""

        translation = None

        # Try Gemini first
        print("Trying Gemini...")
        for attempt in range(2):
            try:
                gemini_response = gemini_model.generate_content(prompt)
                translation = gemini_response.text
                break
            except Exception as e:
                print(f"Gemini error (attempt {attempt + 1}): {str(e)}")
                if attempt == 1:
                    print("Gemini failed. Falling back to Groq LLaMA model")
                else:
                    time.sleep(30)

        # If Gemini failed, try Groq
        if not translation and groq_client:
            for attempt in range(2):
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    translation = chat_completion.choices[0].message.content
                    break
                except Exception as e:
                    print(f"Groq error (attempt {attempt + 1}): {str(e)}")
                    if attempt == 1:
                        print(f"Failed to translate Chapter {i + 1} after all attempts")
                        translation = f"TRANSLATION FAILED: {chapter['title']}"
                    else:
                        time.sleep(30)

        # If both providers failed (or no Groq key was supplied), record the failure
        if not translation:
            translation = f"TRANSLATION FAILED: {chapter['title']}"

        # Add the complete chapter translation to the results
        translations.append({
            'title': chapter['title'],
            'translated_content': translation
        })
        print(f"Completed translation of Chapter {i + 1}")
        print("First 500 characters of translation:")
        print(translation[:500] + "...")
        print('=======================================')
        time.sleep(5)  # Sleep between requests

    # Save all translations
    with open(TRANSLATIONS_FILE, 'w', encoding='utf-8') as f:
        json.dump(translations, f, ensure_ascii=False, indent=2)
    print(f'Translation completed. Translations saved to {TRANSLATIONS_FILE}')


# Full pipeline invoked by the Gradio interface
def process_novel(first_chapter_url, final_url, novel_name, gemini_api_key, groq_api_key):
    # Scrape chapters
    asyncio.run(scrape_task(first_chapter_url, final_url))
    # Process chapters (split long chapters)
    process_chapters(CHAPTERS_FILE, SPLIT_CHAPTERS_FILE)
    # Build the glossary
    create_glossary(gemini_api_key, groq_api_key)
    # Translate chapters
    translate_task(gemini_api_key, groq_api_key)
    return "Scraping, Processing, and Translation Completed!"


# Gradio Interface
iface = gr.Interface(
    fn=process_novel,
    inputs=[
        gr.Textbox(label="First Chapter URL"),
        gr.Textbox(label="Final Chapter URL (optional)"),
        gr.Textbox(label="Novel Name"),
        gr.Textbox(label="Gemini API Key"),
        gr.Textbox(label="Groq API Key (optional)"),
    ],
    outputs="text",
    title="Novel Scraper and Translator",
    description="Input the first chapter URL, final chapter URL (optional), novel name, and API keys to scrape and translate the novel."
)

iface.launch()
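# Local setup sketch (an assumption about the runtime environment, not part of the deployed app):
# the script expects the imported packages to be installed and a Playwright Chromium browser
# to be available, e.g.
#   pip install gradio groq google-generativeai playwright fake-useragent tenacity pytz requests
#   playwright install chromium
# The os.system() call at the top only installs the system libraries that Chromium needs.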