Ocillus committed on
Commit
3f56112
·
verified ·
1 Parent(s): 3857cd0

Now Arcana can render PDFs, Word documents, and PPTs into the DBMS

Browse files
Files changed (1) hide show
  1. Arcana.py +42 -21
Arcana.py CHANGED
@@ -6,6 +6,8 @@ import re
6
  from tqdm import tqdm
7
  import time
8
  from nylon import ChatDatabase, get_keywords
 
 
9
 
10
  def extract_text_from_pdf(pdf_path):
11
  output_string = io.StringIO()
@@ -15,6 +17,19 @@ def extract_text_from_pdf(pdf_path):
15
  output_type='text', codec='utf-8')
16
  return output_string.getvalue()
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def process_text_into_paragraphs(text):
19
  # Remove page numbers and headers/footers
20
  text = re.sub(r'\n\d+\n', '\n', text)
@@ -35,23 +50,30 @@ def process_text_into_paragraphs(text):
35
 
36
  return cleaned_paragraphs
37
 
38
- def process_pdfs(directory, db):
39
  fixed_timestamp = "2024-10-22 12:00:00"
40
  sender = "Arcana" # Set sender to "Arcana" for all messages
41
 
42
- pdf_files = [f for f in os.listdir(directory) if f.endswith('.pdf')]
43
- total_files = len(pdf_files)
44
 
45
- with tqdm(total=total_files, desc="Processing PDFs", unit="file") as pbar:
46
- for filename in pdf_files:
47
- pdf_path = os.path.join(directory, filename)
48
- tag = os.path.splitext(filename)[0] # Use filename without .pdf as tag
 
 
 
 
 
 
 
 
 
49
 
50
- text = extract_text_from_pdf(pdf_path)
51
  paragraphs = process_text_into_paragraphs(text)
52
 
53
  for paragraph in paragraphs:
54
- #print(paragraph)
55
  db.add_message(sender, fixed_timestamp, str(paragraph), tag)
56
 
57
  pbar.update(1)
@@ -64,18 +86,17 @@ def main(foldername):
64
  if os.path.exists(db_filename):
65
  db_filename += '.txt'
66
  print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
67
- #db = ChatDatabase(db_filename)
68
- #else:
69
- print(f"Creating new database '{db_filename}'...")
70
- db = ChatDatabase(db_filename)
71
- pdf_directory = foldername
72
-
73
- start_time = time.time()
74
- process_pdfs(pdf_directory, db)
75
- end_time = time.time()
76
-
77
- total_time = end_time - start_time
78
- print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
79
 
80
  if __name__ == "__main__":
81
  main()
 
6
  from tqdm import tqdm
7
  import time
8
  from nylon import ChatDatabase, get_keywords
9
+ from docx import Document
10
+ from pptx import Presentation
11
 
12
  def extract_text_from_pdf(pdf_path):
13
  output_string = io.StringIO()
 
17
  output_type='text', codec='utf-8')
18
  return output_string.getvalue()
19
 
20
+ def extract_text_from_docx(docx_path):
21
+ document = Document(docx_path)
22
+ return '\n'.join([para.text for para in document.paragraphs])
23
+
24
+ def extract_text_from_pptx(pptx_path):
25
+ presentation = Presentation(pptx_path)
26
+ slides_text = []
27
+ for slide in presentation.slides:
28
+ for shape in slide.shapes:
29
+ if shape.has_text_frame:
30
+ slides_text.append(shape.text)
31
+ return '\n'.join(slides_text)
32
+
33
  def process_text_into_paragraphs(text):
34
  # Remove page numbers and headers/footers
35
  text = re.sub(r'\n\d+\n', '\n', text)
 
50
 
51
  return cleaned_paragraphs
52
 
53
+ def process_files(directory, db):
54
  fixed_timestamp = "2024-10-22 12:00:00"
55
  sender = "Arcana" # Set sender to "Arcana" for all messages
56
 
57
+ files = [f for f in os.listdir(directory) if f.endswith(('.pdf', '.docx', '.pptx'))]
58
+ total_files = len(files)
59
 
60
+ with tqdm(total=total_files, desc="Processing Files", unit="file") as pbar:
61
+ for filename in files:
62
+ file_path = os.path.join(directory, filename)
63
+ tag = os.path.splitext(filename)[0] # Use filename without extension as tag
64
+
65
+ if filename.endswith('.pdf'):
66
+ text = extract_text_from_pdf(file_path)
67
+ elif filename.endswith('.docx'):
68
+ text = extract_text_from_docx(file_path)
69
+ elif filename.endswith('.pptx'):
70
+ text = extract_text_from_pptx(file_path)
71
+ else:
72
+ continue # Skip unsupported file types
73
 
 
74
  paragraphs = process_text_into_paragraphs(text)
75
 
76
  for paragraph in paragraphs:
 
77
  db.add_message(sender, fixed_timestamp, str(paragraph), tag)
78
 
79
  pbar.update(1)
 
86
  if os.path.exists(db_filename):
87
  db_filename += '.txt'
88
  print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
89
+
90
+ print(f"Creating new database '{db_filename}'...")
91
+ db = ChatDatabase(db_filename)
92
+ file_directory = foldername
93
+
94
+ start_time = time.time()
95
+ process_files(file_directory, db)
96
+ end_time = time.time()
97
+
98
+ total_time = end_time - start_time
99
+ print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
 
100
 
101
  if __name__ == "__main__":
102
  main()