import io
import os
import re
import sys
import time

from docx import Document
from nylon import ChatDatabase
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from pptx import Presentation
from tqdm import tqdm
def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF using pdfminer's layout analysis."""
    output_string = io.StringIO()
    with open(pdf_path, 'rb') as fin:
        laparams = LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)
        extract_text_to_fp(fin, output_string, laparams=laparams,
                           output_type='text', codec='utf-8')
    return output_string.getvalue()
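
# Note on LAParams above: line_margin, char_margin, and word_margin control how
# pdfminer groups characters into lines and text boxes. line_margin=1.5 is
# looser than pdfminer's default of 0.5, so vertically spread lines still merge
# into a single block before the paragraph splitting done below.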
def extract_text_from_docx(docx_path):
    """Extract plain text from a .docx file, one line per paragraph."""
    document = Document(docx_path)
    return '\n'.join(para.text for para in document.paragraphs)
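
# Note: Document(...).paragraphs yields only body paragraphs; text inside
# .docx tables, headers, and footers is not captured by this function.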
def extract_text_from_pptx(pptx_path):
    """Extract plain text from a .pptx file, one line per text-bearing shape."""
    presentation = Presentation(pptx_path)
    slides_text = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                slides_text.append(shape.text)
    return '\n'.join(slides_text)
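
# Note: only shapes with a text frame are read; text in .pptx tables, charts,
# and grouped shapes is not captured by this walk over slide.shapes.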
def process_text_into_paragraphs(text):
    """Split raw extracted text into cleaned, de-duplicated paragraphs."""
    # Remove bare page numbers and header/footer lines
    text = re.sub(r'\n\d+\n', '\n', text)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Split on blank lines, dropping duplicate paragraphs while preserving order
    paragraphs = list(dict.fromkeys(re.split(r'\n{2,}', text)))
    # Clean up each paragraph
    cleaned_paragraphs = []
    for para in paragraphs:
        # Collapse runs of whitespace (including line breaks) into single spaces
        cleaned_para = re.sub(r'\s+', ' ', para).strip()
        # Re-join words hyphenated across line breaks, e.g. "Intro- duction"
        cleaned_para = re.sub(r'(\w+)-\s*(\w+)', r'\1\2', cleaned_para)
        if cleaned_para:  # Only keep non-empty paragraphs
            cleaned_paragraphs.append(cleaned_para)
    return cleaned_paragraphs
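
# Worked example (hypothetical input): for the extracted text
#   "Intro-\nduction\n\n1\n\nSecond para"
# the page-number line "1" is removed, the blank lines split the text into two
# paragraphs, and the hyphenated line break is re-joined, yielding
#   ["Introduction", "Second para"]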
def process_files(directory, db):
    """Extract text from every supported file in `directory` and store its paragraphs."""
    fixed_timestamp = "2024-10-22 12:00:00"
    sender = "Arcana"  # Set sender to "Arcana" for all messages
    files = [f for f in os.listdir(directory) if f.endswith(('.pdf', '.docx', '.pptx'))]
    with tqdm(total=len(files), desc="Processing Files", unit="file") as pbar:
        for filename in files:
            file_path = os.path.join(directory, filename)
            tag = os.path.splitext(filename)[0]  # Use filename without extension as tag
            if filename.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                text = extract_text_from_docx(file_path)
            elif filename.endswith('.pptx'):
                text = extract_text_from_pptx(file_path)
            else:
                continue  # Unreachable given the filter above; kept as a guard
            for paragraph in process_text_into_paragraphs(text):
                db.add_message(sender, fixed_timestamp, paragraph, tag)
            pbar.set_postfix({"Current File": filename})
            pbar.update(1)
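
# Note: add_message(sender, timestamp, text, tag) is nylon's storage call as
# used by this script; the fixed timestamp exists only to satisfy the chat-log
# schema, since document paragraphs have no natural send time.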
def main(foldername):
    """Build a ChatDatabase text file from all supported documents in `foldername`."""
    db_filename = foldername + '.txt'
    if os.path.exists(db_filename):
        print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
    else:
        print(f"Creating new database '{db_filename}'...")
    db = ChatDatabase(db_filename)
    start_time = time.time()
    process_files(foldername, db)
    total_time = time.time() - start_time
    print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(f"Usage: python {sys.argv[0]} <folder-of-documents>")
    main(sys.argv[1])
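
# Example invocation, assuming a folder named "docs" (name is hypothetical)
# containing .pdf/.docx/.pptx files sits next to this script:
#   python this_script.py docs
# This writes every extracted paragraph into a ChatDatabase file named "docs.txt".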