|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
import re
|
|
from datetime import datetime
|
|
import logging
|
|
|
|
import ebooklib
|
|
from bs4 import BeautifulSoup
|
|
from ebooklib import epub
|
|
|
|
|
|
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_epub(file_path):
|
|
"""Read and extract text from an EPUB file."""
|
|
book = epub.read_epub(file_path)
|
|
chapters = []
|
|
for item in book.get_items():
|
|
if item.get_type() == ebooklib.ITEM_DOCUMENT:
|
|
chapters.append(item.get_content())
|
|
|
|
text = ""
|
|
for html_content in chapters:
|
|
soup = BeautifulSoup(html_content, 'html.parser')
|
|
text += soup.get_text() + "\n\n"
|
|
return text
|
|
|
|
|
|
|
|
def extract_epub_metadata(content):
|
|
title_match = re.search(r'Title:\s*(.*?)\n', content)
|
|
author_match = re.search(r'Author:\s*(.*?)\n', content)
|
|
|
|
title = title_match.group(1) if title_match else None
|
|
author = author_match.group(1) if author_match else None
|
|
|
|
return title, author
|
|
|
|
|
|
def ingest_text_file(file_path, title=None, author=None, keywords=None):
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as file:
|
|
content = file.read()
|
|
|
|
|
|
if 'epub_converted' in (keywords or ''):
|
|
extracted_title, extracted_author = extract_epub_metadata(content)
|
|
title = title or extracted_title
|
|
author = author or extracted_author
|
|
|
|
|
|
if not title:
|
|
title = os.path.splitext(os.path.basename(file_path))[0]
|
|
|
|
|
|
if not author:
|
|
author = 'Unknown'
|
|
|
|
|
|
if not keywords:
|
|
keywords = 'text_file,epub_converted'
|
|
else:
|
|
keywords = f'text_file,epub_converted,{keywords}'
|
|
|
|
|
|
add_media_with_keywords(
|
|
url=file_path,
|
|
title=title,
|
|
media_type='document',
|
|
content=content,
|
|
keywords=keywords,
|
|
prompt='No prompt for text files',
|
|
summary='No summary for text files',
|
|
transcription_model='None',
|
|
author=author,
|
|
ingestion_date=datetime.now().strftime('%Y-%m-%d')
|
|
)
|
|
|
|
return f"Text file '{title}' by {author} ingested successfully."
|
|
except Exception as e:
|
|
logging.error(f"Error ingesting text file: {str(e)}")
|
|
return f"Error ingesting text file: {str(e)}"
|
|
|
|
|
|
def ingest_folder(folder_path, keywords=None):
|
|
results = []
|
|
for filename in os.listdir(folder_path):
|
|
if filename.lower().endswith('.txt'):
|
|
file_path = os.path.join(folder_path, filename)
|
|
result = ingest_text_file(file_path, keywords=keywords)
|
|
results.append(result)
|
|
|
|
|
|
def epub_to_markdown(epub_path):
|
|
book = epub.read_epub(epub_path)
|
|
markdown_content = "# Table of Contents\n\n"
|
|
chapters = []
|
|
|
|
|
|
toc = book.toc
|
|
for item in toc:
|
|
if isinstance(item, tuple):
|
|
section, children = item
|
|
level = 1
|
|
markdown_content += format_toc_item(section, level)
|
|
for child in children:
|
|
markdown_content += format_toc_item(child, level + 1)
|
|
else:
|
|
markdown_content += format_toc_item(item, 1)
|
|
|
|
markdown_content += "\n---\n\n"
|
|
|
|
|
|
for item in book.get_items():
|
|
if item.get_type() == ebooklib.ITEM_DOCUMENT:
|
|
chapter_content = item.get_content().decode('utf-8')
|
|
soup = BeautifulSoup(chapter_content, 'html.parser')
|
|
|
|
|
|
title = soup.find(['h1', 'h2', 'h3'])
|
|
if title:
|
|
chapter_title = title.get_text()
|
|
markdown_content += f"# {chapter_title}\n\n"
|
|
|
|
|
|
for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
|
|
if elem.name.startswith('h'):
|
|
level = int(elem.name[1])
|
|
markdown_content += f"{'#' * level} {elem.get_text()}\n\n"
|
|
elif elem.name == 'p':
|
|
markdown_content += f"{elem.get_text()}\n\n"
|
|
elif elem.name in ['ul', 'ol']:
|
|
for li in elem.find_all('li'):
|
|
markdown_content += f"- {li.get_text()}\n"
|
|
markdown_content += "\n"
|
|
|
|
markdown_content += "---\n\n"
|
|
|
|
return markdown_content
|
|
|
|
|
|
def format_toc_item(item, level):
|
|
return f"{' ' * (level - 1)}- [{item.title}](#{slugify(item.title)})\n"
|
|
|
|
|
|
def slugify(text):
|
|
return re.sub(r'[\W_]+', '-', text.lower())
|
|
|
|
|
|
|
|
|
|
|