ppsingh committed on
Commit
f5e5ccb
·
verified ·
1 Parent(s): 1963b0a

Create process_chunks.py

Browse files
Files changed (1) hide show
  1. auditqa/process_chunks.py +85 -0
auditqa/process_chunks.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
4
+ from transformers import AutoTokenizer
5
+ from torch import cuda
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
7
+ from langchain_community.vectorstores import Qdrant
8
+ from qdrant_client import QdrantClient
9
+ from auditqa.reports import files, report_list
10
+ from langchain.docstore.document import Document
11
+ device = 'cuda' if cuda.is_available() else 'cpu'
12
+
13
+ path_to_data = "./reports/"
14
+
15
def open_file(filepath):
    """
    Read a JSON file and return its parsed content.

    Args:
        filepath: path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # Imported here because the module-level imports do not bring in ``json``;
    # without it every call failed with a NameError.
    import json

    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(filepath, encoding="utf-8") as file:
        simple_json = json.load(file)
    return simple_json
19
+
20
def load_chunks():
    """
    Create the in-memory vector database from the pre-chunked report files.

    The ``files`` mapping is walked as source/category -> subtype -> report
    names. For every report, ``<name>/<name>.chunks.json`` is read from
    ``path_to_data`` and each chunk is wrapped in a ``Document`` whose
    metadata holds the source, subtype, year (last four characters of the
    report name), filename and page. These metadata fields are meant for
    document selection and filtering later on.

    Only the combined "allreports" collection is embedded and stored.

    Returns:
        dict mapping "allreports" to the Qdrant vector store built from
        every chunk of every report.
    """
    all_documents = {}

    # outer level: 'source' (category)
    for category, subtype_map in files.items():
        print("documents splitting in source:",category)
        category_docs = []
        all_documents[category] = category_docs

        # inner level: 'subtype' of the source, e.g. for the source
        # 'District' the subtypes are the district names
        for subtype, report_names in subtype_map.items():
            print("document splitting for subtype:",subtype)
            for file in report_names:
                # read the chunks prepared for this report
                doc_processed = open_file(f"{path_to_data}{file}/{file}.chunks.json")
                print("chunks in subtype:",subtype, "are:",len(doc_processed))

                # attach the metadata to every chunk and collect them directly
                # in the flat per-category list
                category_docs.extend(
                    Document(
                        page_content=doc['content'],
                        metadata={
                            "source": category,
                            "subtype": subtype,
                            "year": file[-4:],
                            "filename": file,
                            "page": doc['metadata']['page'],
                        },
                    )
                    for doc in doc_processed
                )

    # report the chunk count per source once everything has been loaded
    for key, docs_processed in all_documents.items():
        print("length of chunks in source:",key, "are:",len(docs_processed))

    # one extra entry holding the chunks of every source together
    combined = []
    for docs_processed in all_documents.values():
        combined.extend(docs_processed)
    all_documents['allreports'] = combined

    # embedding model used to vectorise the chunks
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )

    # only the combined collection gets embedded
    collection = "allreports"
    print("emebddings for:",collection)
    qdrant_collections = {
        collection: Qdrant.from_documents(
            all_documents[collection],
            embeddings,
            location=":memory:",
            collection_name=collection,
        )
    }
    print(qdrant_collections)
    print("vector embeddings done")
    return qdrant_collections