yinong333 committed
Commit b40d2e1
1 Parent(s): 8d8d97a

Deploying RAG

Files changed (1)
app.py +53 -18
app.py CHANGED
@@ -11,6 +11,9 @@ from langchain_core.prompts import PromptTemplate
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from langchain.schema.runnable.config import RunnableConfig
+from tqdm.asyncio import tqdm_asyncio
+import asyncio
+from tqdm.asyncio import tqdm
 
 # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
 # ---- ENV VARIABLES ---- #
@@ -49,25 +52,57 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
     huggingfacehub_api_token=HF_TOKEN,
 )
 
-if os.path.exists("./data/vectorstore"):
-    vectorstore = FAISS.load_local(
-        "./data/vectorstore",
-        hf_embeddings,
-        allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
-    )
-    hf_retriever = vectorstore.as_retriever()
-    print("Loaded Vectorstore")
-else:
+async def add_documents_async(vectorstore, documents):
+    await vectorstore.aadd_documents(documents)
+
+async def process_batch(vectorstore, batch, is_first_batch, pbar):
+    if is_first_batch:
+        result = await FAISS.afrom_documents(batch, hf_embeddings)
+    else:
+        await add_documents_async(vectorstore, batch)
+        result = vectorstore
+    pbar.update(len(batch))
+    return result
+
+async def main():
     print("Indexing Files")
-    os.makedirs("./data/vectorstore", exist_ok=True)
-    for i in range(0, len(split_documents), 32):
-        if i == 0:
-            vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
-            continue
-        vectorstore.add_documents(split_documents[i:i+32])
-    vectorstore.save_local("./data/vectorstore")
-
-    hf_retriever = vectorstore.as_retriever()
+
+    vectorstore = None
+    batch_size = 32
+
+    batches = [split_documents[i:i+batch_size] for i in range(0, len(split_documents), batch_size)]
+
+    async def process_all_batches():
+        nonlocal vectorstore
+        tasks = []
+        pbars = []
+
+        for i, batch in enumerate(batches):
+            pbar = tqdm(total=len(batch), desc=f"Batch {i+1}/{len(batches)}", position=i)
+            pbars.append(pbar)
+
+            if i == 0:
+                vectorstore = await process_batch(None, batch, True, pbar)
+            else:
+                tasks.append(process_batch(vectorstore, batch, False, pbar))
+
+        if tasks:
+            await asyncio.gather(*tasks)
+
+        for pbar in pbars:
+            pbar.close()
+
+    await process_all_batches()
+
+    hf_retriever = vectorstore.as_retriever()
+    print("\nIndexing complete. Vectorstore is ready for use.")
+    return hf_retriever
+
+async def run():
+    retriever = await main()
+    return retriever
+
+hf_retriever = asyncio.run(run())
 
 # -- AUGMENTED -- #
 """