import io
import os
import re
import sys
import time

from docx import Document
from nylon import ChatDatabase
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from pptx import Presentation
from tqdm import tqdm
def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF using pdfminer's layout analysis."""
    output_string = io.StringIO()
    with open(pdf_path, 'rb') as fin:
        laparams = LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)
        extract_text_to_fp(fin, output_string, laparams=laparams,
                           output_type='text', codec='utf-8')
    return output_string.getvalue()
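
# Note on LAParams above: line_margin, char_margin, and word_margin control how
# pdfminer groups characters into lines and text boxes. line_margin=1.5 is
# looser than pdfminer's default of 0.5, so vertically spread lines still merge
# into a single block before the paragraph splitting done below.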
def extract_text_from_docx(docx_path):
    """Extract plain text from a .docx file, one line per paragraph."""
    document = Document(docx_path)
    return '\n'.join(para.text for para in document.paragraphs)
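
# Note: Document(...).paragraphs yields only body paragraphs; text inside
# .docx tables, headers, and footers is not captured by this function.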
def extract_text_from_pptx(pptx_path):
    """Extract plain text from a .pptx file, one line per text-bearing shape."""
    presentation = Presentation(pptx_path)
    slides_text = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                slides_text.append(shape.text)
    return '\n'.join(slides_text)
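
# Note: only shapes with a text frame are read; text in .pptx tables, charts,
# and grouped shapes is not captured by this walk over slide.shapes.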
def process_text_into_paragraphs(text):
    """Split raw extracted text into cleaned, de-duplicated paragraphs."""
    # Remove bare page numbers and header/footer lines
    text = re.sub(r'\n\d+\n', '\n', text)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Split on blank lines, dropping duplicate paragraphs while preserving order
    paragraphs = list(dict.fromkeys(re.split(r'\n{2,}', text)))
    # Clean up each paragraph
    cleaned_paragraphs = []
    for para in paragraphs:
        # Collapse runs of whitespace (including line breaks) into single spaces
        cleaned_para = re.sub(r'\s+', ' ', para).strip()
        # Re-join words hyphenated across line breaks, e.g. "Intro- duction"
        cleaned_para = re.sub(r'(\w+)-\s*(\w+)', r'\1\2', cleaned_para)
        if cleaned_para:  # Only keep non-empty paragraphs
            cleaned_paragraphs.append(cleaned_para)
    return cleaned_paragraphs
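
# Worked example (hypothetical input): for the extracted text
#   "Intro-\nduction\n\n1\n\nSecond para"
# the page-number line "1" is removed, the blank lines split the text into two
# paragraphs, and the hyphenated line break is re-joined, yielding
#   ["Introduction", "Second para"]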
def process_files(directory, db):
    """Extract text from every supported file in `directory` and store its paragraphs."""
    fixed_timestamp = "2024-10-22 12:00:00"
    sender = "Arcana"  # Set sender to "Arcana" for all messages
    files = [f for f in os.listdir(directory) if f.endswith(('.pdf', '.docx', '.pptx'))]
    with tqdm(total=len(files), desc="Processing Files", unit="file") as pbar:
        for filename in files:
            file_path = os.path.join(directory, filename)
            tag = os.path.splitext(filename)[0]  # Use filename without extension as tag
            if filename.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                text = extract_text_from_docx(file_path)
            elif filename.endswith('.pptx'):
                text = extract_text_from_pptx(file_path)
            else:
                continue  # Unreachable given the filter above; kept as a guard
            for paragraph in process_text_into_paragraphs(text):
                db.add_message(sender, fixed_timestamp, paragraph, tag)
            pbar.set_postfix({"Current File": filename})
            pbar.update(1)
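
# Note: add_message(sender, timestamp, text, tag) is nylon's storage call as
# used by this script; the fixed timestamp exists only to satisfy the chat-log
# schema, since document paragraphs have no natural send time.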
def main(foldername):
    """Build a ChatDatabase text file from all supported documents in `foldername`."""
    db_filename = foldername + '.txt'
    if os.path.exists(db_filename):
        print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
    else:
        print(f"Creating new database '{db_filename}'...")
    db = ChatDatabase(db_filename)
    start_time = time.time()
    process_files(foldername, db)
    total_time = time.time() - start_time
    print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(f"Usage: python {sys.argv[0]} <folder-of-documents>")
    main(sys.argv[1])
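
# Example invocation, assuming a folder named "docs" (name is hypothetical)
# containing .pdf/.docx/.pptx files sits next to this script:
#   python this_script.py docs
# This writes every extracted paragraph into a ChatDatabase file named "docs.txt".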