Spaces:
Sleeping
Sleeping
Now Arcana can render PDFs, Word documents, and PPTs into the DBMS
Browse files
Arcana.py
CHANGED
@@ -6,6 +6,8 @@ import re
|
|
6 |
from tqdm import tqdm
|
7 |
import time
|
8 |
from nylon import ChatDatabase, get_keywords
|
|
|
|
|
9 |
|
10 |
def extract_text_from_pdf(pdf_path):
|
11 |
output_string = io.StringIO()
|
@@ -15,6 +17,19 @@ def extract_text_from_pdf(pdf_path):
|
|
15 |
output_type='text', codec='utf-8')
|
16 |
return output_string.getvalue()
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def process_text_into_paragraphs(text):
|
19 |
# Remove page numbers and headers/footers
|
20 |
text = re.sub(r'\n\d+\n', '\n', text)
|
@@ -35,23 +50,30 @@ def process_text_into_paragraphs(text):
|
|
35 |
|
36 |
return cleaned_paragraphs
|
37 |
|
38 |
-
def
|
39 |
fixed_timestamp = "2024-10-22 12:00:00"
|
40 |
sender = "Arcana" # Set sender to "Arcana" for all messages
|
41 |
|
42 |
-
|
43 |
-
total_files = len(
|
44 |
|
45 |
-
with tqdm(total=total_files, desc="Processing
|
46 |
-
for filename in
|
47 |
-
|
48 |
-
tag = os.path.splitext(filename)[0] # Use filename without
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
text = extract_text_from_pdf(pdf_path)
|
51 |
paragraphs = process_text_into_paragraphs(text)
|
52 |
|
53 |
for paragraph in paragraphs:
|
54 |
-
#print(paragraph)
|
55 |
db.add_message(sender, fixed_timestamp, str(paragraph), tag)
|
56 |
|
57 |
pbar.update(1)
|
@@ -64,18 +86,17 @@ def main(foldername):
|
|
64 |
if os.path.exists(db_filename):
|
65 |
db_filename += '.txt'
|
66 |
print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
|
79 |
|
80 |
if __name__ == "__main__":
|
81 |
main()
|
|
|
6 |
from tqdm import tqdm
|
7 |
import time
|
8 |
from nylon import ChatDatabase, get_keywords
|
9 |
+
from docx import Document
|
10 |
+
from pptx import Presentation
|
11 |
|
12 |
def extract_text_from_pdf(pdf_path):
|
13 |
output_string = io.StringIO()
|
|
|
17 |
output_type='text', codec='utf-8')
|
18 |
return output_string.getvalue()
|
19 |
|
20 |
+
def extract_text_from_docx(docx_path):
|
21 |
+
document = Document(docx_path)
|
22 |
+
return '\n'.join([para.text for para in document.paragraphs])
|
23 |
+
|
24 |
+
def extract_text_from_pptx(pptx_path):
|
25 |
+
presentation = Presentation(pptx_path)
|
26 |
+
slides_text = []
|
27 |
+
for slide in presentation.slides:
|
28 |
+
for shape in slide.shapes:
|
29 |
+
if shape.has_text_frame:
|
30 |
+
slides_text.append(shape.text)
|
31 |
+
return '\n'.join(slides_text)
|
32 |
+
|
33 |
def process_text_into_paragraphs(text):
|
34 |
# Remove page numbers and headers/footers
|
35 |
text = re.sub(r'\n\d+\n', '\n', text)
|
|
|
50 |
|
51 |
return cleaned_paragraphs
|
52 |
|
53 |
+
def process_files(directory, db):
|
54 |
fixed_timestamp = "2024-10-22 12:00:00"
|
55 |
sender = "Arcana" # Set sender to "Arcana" for all messages
|
56 |
|
57 |
+
files = [f for f in os.listdir(directory) if f.endswith(('.pdf', '.docx', '.pptx'))]
|
58 |
+
total_files = len(files)
|
59 |
|
60 |
+
with tqdm(total=total_files, desc="Processing Files", unit="file") as pbar:
|
61 |
+
for filename in files:
|
62 |
+
file_path = os.path.join(directory, filename)
|
63 |
+
tag = os.path.splitext(filename)[0] # Use filename without extension as tag
|
64 |
+
|
65 |
+
if filename.endswith('.pdf'):
|
66 |
+
text = extract_text_from_pdf(file_path)
|
67 |
+
elif filename.endswith('.docx'):
|
68 |
+
text = extract_text_from_docx(file_path)
|
69 |
+
elif filename.endswith('.pptx'):
|
70 |
+
text = extract_text_from_pptx(file_path)
|
71 |
+
else:
|
72 |
+
continue # Skip unsupported file types
|
73 |
|
|
|
74 |
paragraphs = process_text_into_paragraphs(text)
|
75 |
|
76 |
for paragraph in paragraphs:
|
|
|
77 |
db.add_message(sender, fixed_timestamp, str(paragraph), tag)
|
78 |
|
79 |
pbar.update(1)
|
|
|
86 |
if os.path.exists(db_filename):
|
87 |
db_filename += '.txt'
|
88 |
print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
|
89 |
+
|
90 |
+
print(f"Creating new database '{db_filename}'...")
|
91 |
+
db = ChatDatabase(db_filename)
|
92 |
+
file_directory = foldername
|
93 |
+
|
94 |
+
start_time = time.time()
|
95 |
+
process_files(file_directory, db)
|
96 |
+
end_time = time.time()
|
97 |
+
|
98 |
+
total_time = end_time - start_time
|
99 |
+
print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
|
|
|
100 |
|
101 |
if __name__ == "__main__":
|
102 |
main()
|