Spaces:
Runtime error
Runtime error
Delete auditqa/doc_process_0.py
Browse files- auditqa/doc_process_0.py +0 -76
auditqa/doc_process_0.py
DELETED
@@ -1,76 +0,0 @@
|
|
1 |
-
import glob
|
2 |
-
import os
|
3 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
|
4 |
-
from transformers import AutoTokenizer
|
5 |
-
from torch import cuda
|
6 |
-
from langchain_community.document_loaders import PyMuPDFLoader
|
7 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
|
8 |
-
from langchain_community.vectorstores import Qdrant
|
9 |
-
device = 'cuda' if cuda.is_available() else 'cpu'
|
10 |
-
#from dotenv import load_dotenv
|
11 |
-
#load_dotenv()
|
12 |
-
|
13 |
-
#HF_token = os.environ["HF_TOKEN"]
|
14 |
-
path_to_data = "./data/"
|
15 |
-
|
16 |
-
|
17 |
-
def process_pdf():
    """Load the report PDFs, split them into chunks and index them in Qdrant.

    Each PDF is read with ``PyMuPDFLoader``, split with a tokenizer-aware
    ``RecursiveCharacterTextSplitter`` (``chunk_size=256``), tagged with
    ``source`` and ``year`` metadata, and grouped by report category
    (``'Consolidated'`` or ``'MWTS'``). Every category that ends up with at
    least one chunk is embedded with ``HuggingFaceEmbeddings`` and stored in
    its own in-memory Qdrant collection.

    Returns:
        dict: category name -> Qdrant vector store. A category is omitted
        when none of its PDFs produced any chunk (e.g. every load failed).
    """
    # Same model for the splitter's tokenizer and for the embeddings, so the
    # chunk length is measured in the embedding model's own tokens.
    model_name = "BAAI/bge-small-en-v1.5"

    files = {
        'MWTS2021': './data/MWTS2021.pdf',
        'MWTS2022': './data/MWTS2022.pdf',
        'Consolidated2021': './data/Consolidated2021.pdf',
    }

    docs = {}
    for name, pdf_path in files.items():
        try:
            docs[name] = PyMuPDFLoader(pdf_path).load()
        except Exception as e:
            # Best effort: a PDF that cannot be read is reported and skipped
            # so the remaining files can still be indexed.
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(model_name),
        chunk_size=chunk_size,
        chunk_overlap=10,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {'Consolidated': [], 'MWTS': []}

    for name, pages in docs.items():
        chunks = text_splitter.split_documents(pages)
        for chunk in chunks:
            chunk.metadata["source"] = name
            # The file keys end with the four-digit report year.
            chunk.metadata["year"] = name[-4:]
        for key in all_documents:
            if key in name:
                print(key)
                # extend keeps each category a flat list of chunks, so no
                # separate flattening pass is needed afterwards.
                all_documents[key].extend(chunks)

    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        model_name=model_name,
    )

    qdrant_collections = {}

    for name, chunks in all_documents.items():
        if not chunks:
            # NOTE(review): Qdrant.from_documents is expected to fail on an
            # empty list (it probes the first text to learn the vector size;
            # confirm against the installed langchain_community version).
            # Skipping keeps a failed PDF load from aborting the other
            # categories.
            print("no documents for:", name)
            continue
        print("emebddings for:", name)
        qdrant_collections[name] = Qdrant.from_documents(
            chunks,
            embeddings,
            location=":memory:",
            collection_name=name,
        )
    print("done")
    return qdrant_collections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|