Spaces:

GIZ
/

audit_assistant

Running on T4

App Files Files Community

ppsingh committed on Aug 6, 2024

Commit

376f540

verified ·

1 Parent(s): ccf8ca1

Delete auditqa/doc_process_0.py

Browse files

Files changed (1) hide show

auditqa/doc_process_0.py +0 -76

auditqa/doc_process_0.py DELETED Viewed

@@ -1,76 +0,0 @@
-import glob
-import os
-from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
-from transformers import AutoTokenizer
-from torch import cuda
-from langchain_community.document_loaders import PyMuPDFLoader
-from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
-from langchain_community.vectorstores import Qdrant
-device = 'cuda' if cuda.is_available() else 'cpu'  # use the GPU when torch reports CUDA, otherwise fall back to CPU
-#from dotenv import load_dotenv
-#load_dotenv()
-#HF_token = os.environ["HF_TOKEN"]
-path_to_data = "./data/"  # NOTE(review): not referenced in the visible code; process_pdf hard-codes its own paths
-def process_pdf():  # Load three hard-coded PDFs, chunk and embed them; returns {'Consolidated': store, 'MWTS': store}.
-    files = {'MWTS2021':'./data/MWTS2021.pdf',  # label -> PDF path; the label's last 4 characters are later used as the year
-            'MWTS2022':'./data/MWTS2022.pdf',
-            'Consolidated2021':'./data/Consolidated2021.pdf'}
-    docs = {}  # label -> whatever PyMuPDFLoader(...).load() returns for that file
-    for file,value in files.items():
-        try:
-            docs[file] = PyMuPDFLoader(value).load()
-        except Exception as e:  # best effort: a file that fails to load is printed and simply left out of docs
-            print("Exception: ", e)
-    # text splitter based on the tokenizer of a model of your choosing
-    # to make texts fit exactly a transformer's context window size
-    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
-    chunk_size = 256  # presumably measured in tokenizer tokens rather than characters - TODO confirm in langchain docs
-    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-            AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),  # same model name as the embedding model below
-            chunk_size=chunk_size,
-            chunk_overlap=10,
-            add_start_index=True,
-            strip_whitespace=True,
-            separators=["\n\n", "\n"],  # only blank lines and single newlines are offered as split points
-    )
-    all_documents = {'Consolidated':[], 'MWTS':[]}  # collection name -> chunks, grouped by substring match on the label
-    for file,value in docs.items():
-        doc_processed = text_splitter.split_documents(value)
-        for doc in doc_processed:
-            doc.metadata["source"] = file  # set source to the short label (e.g. 'MWTS2021')
-            doc.metadata["year"] = file[-4:]  # e.g. 'MWTS2021' -> '2021'; kept as a string
-        for key in all_documents:
-            if key in file:  # a label joins every collection whose name it contains
-                print(key)
-                all_documents[key].append(doc_processed)  # appends the whole list; flattened in the next loop
-    for key, docs_processed in all_documents.items():
-        docs_processed = [item for sublist in docs_processed for item in sublist]  # flatten list-of-lists into one list
-        all_documents[key] = docs_processed  # rebinds an existing key only, so the dict size is unchanged mid-iteration
-    embeddings = HuggingFaceEmbeddings(
-        model_kwargs = {'device': device},  # module-level device string, 'cuda' or 'cpu'
-        encode_kwargs = {'normalize_embeddings': True},
-        model_name="BAAI/bge-small-en-v1.5"
-    )
-    qdrant_collections = {}  # collection name -> object returned by Qdrant.from_documents
-    for file,value in all_documents.items():  # here `file` holds a collection name, not a file label
-        print("emebddings for:",file)
-        qdrant_collections[file] = Qdrant.from_documents(  # NOTE(review): value may be empty if loads failed - confirm handling
-            value,
-            embeddings,
-            location=":memory:",  # presumably an in-process, non-persistent store - confirm against Qdrant docs
-            collection_name=file,
-        )
-    print("done")
-    return qdrant_collections