Spaces:

Ocillus
/

Arcana

Sleeping

App Files Files Community

Ocillus committed on Jan 21

Commit

3f56112

verified ·

1 Parent(s): 3857cd0

Now Arcana can render PDFs, Word documents, and PPTs into the DBMS

Browse files

Files changed (1) hide show

Arcana.py +42 -21

Arcana.py CHANGED Viewed

@@ -6,6 +6,8 @@ import re
 from tqdm import tqdm
 import time
 from nylon import ChatDatabase, get_keywords
 def extract_text_from_pdf(pdf_path):
     output_string = io.StringIO()
@@ -15,6 +17,19 @@ def extract_text_from_pdf(pdf_path):
                            output_type='text', codec='utf-8')
     return output_string.getvalue()
 def process_text_into_paragraphs(text):
     # Remove page numbers and headers/footers
     text = re.sub(r'\n\d+\n', '\n', text)
@@ -35,23 +50,30 @@ def process_text_into_paragraphs(text):
     return cleaned_paragraphs
-def process_pdfs(directory, db):  # Extract text from each .pdf in directory and store its paragraphs in db
     fixed_timestamp = "2024-10-22 12:00:00"  # every stored message gets this same timestamp
     sender = "Arcana"  # Set sender to "Arcana" for all messages
-    pdf_files = [f for f in os.listdir(directory) if f.endswith('.pdf')]  # case-sensitive match on the '.pdf' suffix
-    total_files = len(pdf_files)
-    with tqdm(total=total_files, desc="Processing PDFs", unit="file") as pbar:
-        for filename in pdf_files:
-            pdf_path = os.path.join(directory, filename)
-            tag = os.path.splitext(filename)[0]  # Use filename without .pdf as tag
-            text = extract_text_from_pdf(pdf_path)
             paragraphs = process_text_into_paragraphs(text)
             for paragraph in paragraphs:
-                #print(paragraph)
                 db.add_message(sender, fixed_timestamp, str(paragraph), tag)  # one message per paragraph, tagged with the file's base name
             pbar.update(1)  # advance the progress bar once per file
@@ -64,18 +86,17 @@ def main(foldername):
     if os.path.exists(db_filename):
         db_filename += '.txt'
         print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
-        #db = ChatDatabase(db_filename)
-        #else:
-        print(f"Creating new database '{db_filename}'...")
-        db = ChatDatabase(db_filename)
-        pdf_directory = foldername
-        start_time = time.time()
-        process_pdfs(pdf_directory, db)
-        end_time = time.time()
-        total_time = end_time - start_time
-        print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
 if __name__ == "__main__":
     main()

 from tqdm import tqdm
 import time
 from nylon import ChatDatabase, get_keywords
+from docx import Document
+from pptx import Presentation
 def extract_text_from_pdf(pdf_path):
     output_string = io.StringIO()
                            output_type='text', codec='utf-8')
     return output_string.getvalue()
+def extract_text_from_docx(docx_path):  # Return the text of the .docx file at docx_path as a single string
+    document = Document(docx_path)  # Document is the name imported from docx in this commit
+    return '\n'.join([para.text for para in document.paragraphs])  # paragraph texts joined with newlines; NOTE(review): only document.paragraphs is read - confirm whether table text must be covered too
+def extract_text_from_pptx(pptx_path):  # Return the text of every text-bearing shape in the .pptx file, newline-joined
+    presentation = Presentation(pptx_path)  # Presentation is the name imported from pptx in this commit
+    slides_text = []  # shape texts, collected in slide order and then shape order within each slide
+    for slide in presentation.slides:
+        for shape in slide.shapes:
+            if shape.has_text_frame:  # shapes without a text frame are skipped
+                slides_text.append(shape.text)
+    return '\n'.join(slides_text)  # an empty list yields an empty string
 def process_text_into_paragraphs(text):
     # Remove page numbers and headers/footers
     text = re.sub(r'\n\d+\n', '\n', text)
     return cleaned_paragraphs
+def process_files(directory, db):  # Extract text from each .pdf/.docx/.pptx in directory and store its paragraphs in db
     fixed_timestamp = "2024-10-22 12:00:00"  # every stored message gets this same timestamp
     sender = "Arcana"  # Set sender to "Arcana" for all messages
+    files = [f for f in os.listdir(directory) if f.endswith(('.pdf', '.docx', '.pptx'))]  # top-level entries only; the suffix match is case-sensitive, so e.g. '.PDF' is not picked up
+    total_files = len(files)
+    with tqdm(total=total_files, desc="Processing Files", unit="file") as pbar:
+        for filename in files:
+            file_path = os.path.join(directory, filename)
+            tag = os.path.splitext(filename)[0]  # Use filename without extension as tag
+            if filename.endswith('.pdf'):  # pick the extractor that matches the file extension
+                text = extract_text_from_pdf(file_path)
+            elif filename.endswith('.docx'):
+                text = extract_text_from_docx(file_path)
+            elif filename.endswith('.pptx'):
+                text = extract_text_from_pptx(file_path)
+            else:
+                continue  # Skip unsupported file types (not reached: files is already filtered to the three extensions above)
             paragraphs = process_text_into_paragraphs(text)
             for paragraph in paragraphs:
                 db.add_message(sender, fixed_timestamp, str(paragraph), tag)  # one message per paragraph, tagged with the file's base name
             pbar.update(1)  # advance the progress bar once per file
     if os.path.exists(db_filename):
         db_filename += '.txt'
         print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
+    print(f"Creating new database '{db_filename}'...")
+    db = ChatDatabase(db_filename)
+    file_directory = foldername
+    start_time = time.time()
+    process_files(file_directory, db)
+    end_time = time.time()
+    total_time = end_time - start_time
+    print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
 if __name__ == "__main__":
     main()