Sean-Case committed on
Commit
51de353
·
1 Parent(s): 49e32ea

Changed app space image, removed unnecessary files

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Light PDF web QA chatbot
3
- emoji: 📈
4
  colorFrom: yellow
5
  colorTo: yellow
6
  sdk: gradio
 
1
  ---
2
  title: Light PDF web QA chatbot
3
+ emoji: 🌍
4
  colorFrom: yellow
5
  colorTo: yellow
6
  sdk: gradio
chatfuncs/.ipynb_checkpoints/chatfuncs-checkpoint.py DELETED
@@ -1,553 +0,0 @@
1
- # ---
2
- # jupyter:
3
- # jupytext:
4
- # formats: ipynb,py:light
5
- # text_representation:
6
- # extension: .py
7
- # format_name: light
8
- # format_version: '1.5'
9
- # jupytext_version: 1.14.6
10
- # kernelspec:
11
- # display_name: Python 3 (ipykernel)
12
- # language: python
13
- # name: python3
14
- # ---
15
-
16
- # +
17
- import os
18
- import datetime
19
- from typing import Dict, List, Tuple
20
- from itertools import compress
21
- import pandas as pd
22
-
23
- from langchain import PromptTemplate
24
- from langchain.chains import LLMChain
25
- from langchain.chains.base import Chain
26
- from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
27
- from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
28
- from langchain.chains.qa_with_sources import load_qa_with_sources_chain
29
- from langchain.prompts import PromptTemplate
30
- from langchain.retrievers import TFIDFRetriever, SVMRetriever
31
- from langchain.vectorstores import FAISS
32
- from langchain.llms import HuggingFacePipeline
33
-
34
- from pydantic import BaseModel
35
-
36
- import nltk
37
- from nltk.corpus import stopwords
38
- from nltk.tokenize import word_tokenize
39
-
40
- import torch
41
- #from transformers import pipeline
42
- from optimum.pipelines import pipeline
43
- from transformers import AutoTokenizer, TextStreamer, AutoModelForSeq2SeqLM, TextIteratorStreamer
44
- from threading import Thread
45
-
46
- import gradio as gr
47
-
48
-
49
- # -
50
-
51
- # # Pre-load stopwords, vectorstore, models
52
-
53
- # +
54
def get_faiss_store(faiss_vstore_folder, embeddings):
    """
    Unpack the zipped FAISS index, load it, then delete the unpacked files.

    Parameters:
    faiss_vstore_folder (str): Folder that holds 'faiss_lambeth_census_embedding.zip'.
        The archive is extracted into this same folder.
    embeddings: Embedding object passed straight to FAISS.load_local.

    Returns:
    The vector store returned by FAISS.load_local.
    """
    import zipfile

    archive_path = faiss_vstore_folder + '/faiss_lambeth_census_embedding.zip'
    extracted_files = [
        faiss_vstore_folder + "/index.faiss",
        faiss_vstore_folder + "/index.pkl",
    ]

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(faiss_vstore_folder)

    try:
        faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
    finally:
        # Clean up the unpacked index even when loading fails, so a failed
        # start-up does not leave the large index files behind.
        for extracted in extracted_files:
            if os.path.exists(extracted):
                os.remove(extracted)

    return faiss_vstore
64
-
65
- #def set_hf_api_key(api_key, chain_agent):
66
- #if api_key:
67
- #os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
68
- #vectorstore = get_faiss_store(faiss_vstore_folder="faiss_lambeth_census_embedding.zip",embeddings=embeddings)
69
- #qa_chain = create_prompt_templates(vectorstore)
70
- #print(qa_chain)
71
- #os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""
72
- #return qa_chain
73
-
74
-
75
- # -
76
-
77
def create_hf_model(model_name = "declare-lab/flan-alpaca-large"):
    """
    Load a seq2seq model and its tokenizer on the available device.

    Parameters:
    model_name (str): Hugging Face model id to load.

    Returns:
    tuple: (model, tokenizer, torch_device) where torch_device is "cuda" or "cpu".
    """
    use_gpu = torch.cuda.is_available()
    torch_device = "cuda" if use_gpu else "cpu"
    print("Running on device:", torch_device)
    print("CPU threads:", torch.get_num_threads())

    # 8-bit loading with automatic device placement is requested only on GPU.
    load_kwargs = {"load_in_8bit": True, "device_map": "auto"} if use_gpu else {}
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, tokenizer, torch_device
94
-
95
- # +
96
- # Add some stopwords to nltk default
97
-
98
# Download the NLTK English stopword list and extend it with question words
# and reporting verbs that should not drive the search.
nltk.download('stopwords')
newStopWords = [
    'what', 'how', 'when', 'which', 'who', 'change', 'changed', 'do', 'did',
    'increase', 'decrease', 'increased', 'decreased', 'proportion',
    'percentage', 'report', 'reporting', 'say', 'said',
]
stopwords = nltk.corpus.stopwords.words('english')
stopwords += newStopWords
104
- # -
105
-
106
- # Embeddings
107
- #model_name = "sentence-transformers/all-MiniLM-L6-v2"
108
- #embeddings = HuggingFaceEmbeddings(model_name=model_name)
109
# Instructor embeddings, also used to load the pre-built FAISS index.
embed_model_name = "hkunlp/instructor-large"
embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model_name)
vectorstore = get_faiss_store(
    faiss_vstore_folder="faiss_lambeth_census_embedding",
    embeddings=embeddings,
)
112
-
113
- # +
114
- # Models
115
-
116
- #checkpoint = 'declare-lab/flan-alpaca-base' # Flan Alpaca Base incorrectly interprets text based on input (e.g. if you use words like increase or decrease in the question it will respond falsely often). Flan Alpaca Large is much more consistent
117
checkpoint = 'declare-lab/flan-alpaca-large'

model, tokenizer, torch_device = create_hf_model(model_name = checkpoint)

# Look at this for streaming text with huggingface and langchain (last example): https://github.com/hwchase17/langchain/issues/2918
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Main answer-generation pipeline.
pipe = pipeline(
    'text2text-generation',
    model = checkpoint,
    max_length=512,
    temperature=0.000001,
    accelerator="bettertransformer",
    streamer=streamer,
)

checkpoint_keywords = 'ml6team/keyphrase-generation-t5-small-inspec'

# Small keyphrase model, used to condense earlier questions.
keyword_model = pipeline(
    'text2text-generation',
    model = checkpoint_keywords,
    accelerator="bettertransformer",
)
144
-
145
-
146
- # -
147
-
148
- # # Chat history
149
-
150
def clear_chat(chat_history_state, sources, chat_message):
    """
    Return empty values for the chat history, sources box and message box.

    The incoming arguments are ignored; they only mirror the outputs.

    Returns:
    tuple: ([], '', '')
    """
    return [], '', ''
155
-
156
-
157
- def _get_chat_history(chat_history: List[Tuple[str, str]]): # Limit to last 3 interactions only
158
- max_chat_length = 3
159
-
160
- if len(chat_history) > max_chat_length:
161
- chat_history = chat_history[-max_chat_length:]
162
-
163
- print(chat_history)
164
-
165
- first_q = ""
166
- for human_s, ai_s in chat_history:
167
- first_q = human_s
168
- break
169
-
170
- conversation = ""
171
- for human_s, ai_s in chat_history:
172
- human = f"Human: " + human_s
173
- ai = f"Assistant: " + ai_s
174
- conversation += "\n" + "\n".join([human, ai])
175
-
176
- return conversation, first_q
177
-
178
-
179
def adapt_q_from_chat_history(keyword_model, new_question_keywords, question, chat_history):
    """
    Prefix the new question with text generated from the first question in recent history.

    Parameters:
    keyword_model: Hugging Face pipeline, wrapped in HuggingFacePipeline.
    new_question_keywords (str): The question with stopwords removed.
    question (str): The original question.
    chat_history (list): (human, assistant) pairs passed to _get_chat_history.

    Returns:
    tuple: (new_question, new_question_kworded). With no history these are the
        inputs unchanged; otherwise each is the model output, a space, then the input.
    """
    print("new_question_keywords:")
    print(new_question_keywords)

    chat_history_str, chat_history_first_q = _get_chat_history(chat_history)
    if not chat_history_str:
        # Nothing to carry over, so no model or chain needs to be built.
        return question, new_question_keywords

    # The prompt is just the first question itself; the model sees it verbatim.
    memory_prompt = PromptTemplate(
        template = "{chat_history_first_q}",
        input_variables=["chat_history_first_q"]
    )
    memory_llm = HuggingFacePipeline(pipeline=keyword_model)
    memory_extractor = LLMChain(llm=memory_llm, prompt=memory_prompt)

    extracted_memory = memory_extractor.run(
        chat_history_first_q=chat_history_first_q
    )

    new_question_kworded = extracted_memory + " " + new_question_keywords
    new_question = extracted_memory + " " + question

    return new_question, new_question_kworded
220
-
221
-
222
- # # Prompt creation
223
-
224
def remove_q_stopwords(question):
    """
    Lower-case and tokenize the question, then drop tokens found in the stopword list.

    Parameters:
    question (str): The user's question.

    Returns:
    tuple: (remaining tokens joined by single spaces, the original question).
    """
    kept_tokens = []
    for token in word_tokenize(question.lower()):
        if token not in stopwords:
            kept_tokens.append(token)

    return ' '.join(kept_tokens), question
231
-
232
-
233
def create_final_prompt(inputs: Dict[str, str], vectorstore, instruction_prompt, content_prompt):
    """
    Build the final instruction prompt from the question, chat history and retrieved passages.

    Parameters:
    inputs (dict): Must contain "question" and "chat_history".
    vectorstore: Vector store searched for relevant passages.
    instruction_prompt: Template exposing format(question=..., summaries=...).
    content_prompt: Accepted for interface compatibility; not used here.

    Returns:
    tuple: (instruction_prompt_out, sources_text). sources_text is the retrieved
        passages joined by blank lines, or an apology message when no passage
        passed the relevance cut-off.
    """
    question = inputs["question"]
    chat_history = inputs["chat_history"]

    new_question_keywords, question = remove_q_stopwords(question)

    new_question, new_question_kworded = adapt_q_from_chat_history(keyword_model, new_question_keywords, question, chat_history)

    print("The question passed to the vector search is:")
    print(new_question_kworded)

    # Pass the caller's vector store on; previously the argument was ignored
    # and the search always used the module-level default.
    docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(
        new_question_kworded, embeddings, k_val = 3, out_passages = 2,
        vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1,
        vectorstore=vectorstore)

    string_docs_content = '\n\n\n'.join(docs_content)

    instruction_prompt_out = instruction_prompt.format(question=new_question, summaries=string_docs_content)

    if not docs_keep_as_doc:
        # The original built this message in a bare expression that had no
        # effect. Return it as the sources text, keeping the two-item return
        # shape that callers unpack.
        return instruction_prompt_out, "I'm sorry, I couldn't find a relevant source for this question."

    return instruction_prompt_out, string_docs_content
266
-
267
-
268
- # +
269
def create_prompt_templates():
    """
    Build the prompt templates used to query the model.

    Returns:
    tuple: (INSTRUCTION_PROMPT, CONTENT_PROMPT). The instruction prompt takes
        'question' and 'summaries'; the content prompt takes 'page_content'.
    """
    content_template = PromptTemplate(
        template="{page_content}\n\n",
        input_variables=["page_content"]
    )

    # The main prompt: passages first, then the question, then the instruction.
    # NOTE(review): the template text is reproduced with no leading spaces on
    # its lines, as shown in the source view - confirm against the original file.
    instruction_text = "\n{summaries}\n\nQUESTION: {question}\n\nQuote relevant text above."

    instruction_template = PromptTemplate(
        template=instruction_text,
        input_variables=['question', 'summaries']
    )

    return instruction_template, content_template
304
-
305
-
306
- # -
307
-
308
def get_history_sources_final_input_prompt(user_input, history):
    """
    Log the request, build the final prompt and sources text, and append the input to history.

    Parameters:
    user_input (str): The user's message.
    history (list or None): Existing chat history; a falsy value is replaced by a new list.

    Returns:
    tuple: (history, sources_txt, instruction_prompt_out).
    """
    timestamp = str(datetime.datetime.now())
    print("\n==== date/time: " + timestamp + " ====")
    print("User input: " + user_input)

    history = history or []

    # Create instruction prompt
    instruction_prompt, content_prompt = create_prompt_templates()
    prompt_inputs = {"question": user_input, "chat_history": history}
    instruction_prompt_out, sources_txt = create_final_prompt(
        prompt_inputs, vectorstore, instruction_prompt, content_prompt
    )

    # NOTE(review): this appends the bare message, while other functions in
    # this file treat history entries as [user, bot] pairs - confirm the intended shape.
    history.append(user_input)

    print("Output history is:")
    print(history)

    print("The output prompt is:")
    print(instruction_prompt_out)

    return history, sources_txt, instruction_prompt_out
340
-
341
-
342
- # # Chat functions
343
-
344
def produce_streaming_answer_chatbot(history, full_prompt):
    """
    Generate an answer on a background thread and stream it into the last history entry.

    Parameters:
    history (list): Chat history; the last entry's second item receives the answer.
    full_prompt (str): The prompt given to the model.

    Yields:
    list: history, after each new piece of generated text is appended.
    """
    print("The question is: ")
    print(full_prompt)

    # Tokenize the prompt and move the tensors to the model's device.
    model_inputs = tokenizer(text=full_prompt, return_tensors="pt").to(torch_device)

    # Generation runs on a separate thread so the UI is not blocked; the text
    # is pulled from the streamer here. The timeout stops this loop from
    # waiting forever if the generation thread fails.
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(model_inputs)
    generate_kwargs.update(
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=float(0.00001),
    )

    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    # Pull the generated text from the streamer, and update the model output.
    history[-1][1] = ""
    for new_text in streamer:
        history[-1][1] += new_text
        yield history
373
-
374
-
375
def user(user_message, history):
    """
    Clear and lock the input box, and add the message to history as an unanswered turn.

    Returns:
    tuple: (gradio update for the input box, a new history list ending in [user_message, None]).
    """
    cleared_textbox = gr.update(value="", interactive=False)
    updated_history = history + [[user_message, None]]
    return cleared_textbox, updated_history
377
-
378
-
379
def add_inputs_answer_to_history(user_message, history):
    """
    Print the current history and return it unchanged.

    Parameters:
    user_message: Not used.
    history: The chat history to print and return.
    """
    print("History after appending is:", history, sep="\n")
    return history
387
-
388
-
389
- # # Vector / hybrid search
390
-
391
def _rank_weighted_scores(docs_keep, ranked_docs, weight):
    """Score each kept (doc, score) pair by the 1-based position of its text in ranked_docs."""
    docs_keep_length = len(docs_keep)

    first_position = {}
    for position, ranked in enumerate(ranked_docs, start=1):
        first_position.setdefault(ranked.page_content, position)

    scores = []
    for doc, _vec_distance in docs_keep:
        position = first_position.get(doc.page_content)
        # A passage the retriever did not return scores 0, so the list stays
        # aligned one-to-one with docs_keep.
        scores.append((docs_keep_length / position) * weight if position else 0)
    return scores


def find_relevant_passages(new_question_kworded, embeddings, k_val, out_passages, vec_score_cut_off, vec_weight, tfidf_weight, svm_weight, vectorstore=vectorstore):
    """
    Retrieve passages by vector search, re-rank them with TF-IDF and SVM, and keep the best.

    Parameters:
    new_question_kworded (str): Query text.
    embeddings: Embedding object given to SVMRetriever.
    k_val (int): Number of results requested from each retriever.
    out_passages (int): Maximum number of passages to return.
    vec_score_cut_off (float): Only vector results scoring below this are kept.
    vec_weight, tfidf_weight, svm_weight (float): Weights for the three rankings.
    vectorstore: Store exposing similarity_search_with_score.

    Returns:
    tuple: (docs_keep_as_doc, docs_content, docs_url). Three empty lists when
        nothing passes the cut-off; otherwise a list of documents, a pandas
        Series of passage text, and a pandas Series of "https://" + source.
    """
    docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)

    # Only keep sources that are sufficiently relevant (score below the threshold).
    docs_keep = [item for item in docs if item[1] < vec_score_cut_off]

    if not docs_keep:
        return [], [], []

    docs_keep_length = len(docs_keep)

    # Each ranking method scores rank r as (number kept / r) * weight.
    vec_score = [(docs_keep_length / rank) * vec_weight for rank in range(1, docs_keep_length + 1)]

    content_keep = [item[0].page_content for item in docs_keep]

    # 2nd level check on retrieved docs with TFIDF
    tfidf_retriever = TFIDFRetriever.from_texts(content_keep, k = k_val)
    tfidf_result = tfidf_retriever.get_relevant_documents(new_question_kworded)
    tfidf_score = _rank_weighted_scores(docs_keep, tfidf_result, tfidf_weight)

    # 3rd level check on retrieved docs with SVM retriever
    svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
    svm_result = svm_retriever.get_relevant_documents(new_question_kworded)
    svm_score = _rank_weighted_scores(docs_keep, svm_result, svm_weight)

    # Final score is the sum of the three ranking methods.
    final_score = [a + b + c for a, b, c in zip(vec_score, tfidf_score, svm_score)]

    # Take the highest-scoring positions directly. The earlier rank lookup used
    # list.index, which raises ValueError (not the IndexError that was caught)
    # when fewer passages were kept than requested or when scores tie.
    ranked_positions = sorted(range(docs_keep_length), key=final_score.__getitem__, reverse=True)
    best_rank_index_pos = ranked_positions[:max(out_passages, 0)]

    # Keep only 'best' options
    docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
    docs_keep_as_doc = [x[0] for x in docs_keep_out]

    # Extract content and metadata from 'winning' passages.
    doc_df = pd.DataFrame(
        [(doc.page_content, doc.metadata['source'], score) for doc, score in docs_keep_out],
        columns =['page_content', 'meta_url', 'score'])

    docs_content = doc_df['page_content'].astype(str)
    docs_url = "https://" + doc_df['meta_url']

    return docs_keep_as_doc, docs_content, docs_url
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
chatfuncs/.ipynb_checkpoints/ingest-checkpoint.py DELETED
@@ -1,509 +0,0 @@
1
- # ---
2
- # jupyter:
3
- # jupytext:
4
- # formats: ipynb,py:light
5
- # text_representation:
6
- # extension: .py
7
- # format_name: light
8
- # format_version: '1.5'
9
- # jupytext_version: 1.14.6
10
- # kernelspec:
11
- # display_name: Python 3 (ipykernel)
12
- # language: python
13
- # name: python3
14
- # ---
15
-
16
- # # Ingest website to FAISS
17
-
18
- # ## Install/ import stuff we need
19
-
20
- import os
21
- from pathlib import Path
22
- import re
23
- import requests
24
- import pandas as pd
25
- import dateutil.parser
26
- from typing import TypeVar, List
27
-
28
- from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
29
- from langchain.vectorstores.faiss import FAISS
30
- from langchain.text_splitter import RecursiveCharacterTextSplitter
31
- from langchain.docstore.document import Document
32
- from langchain.document_loaders import PyPDFLoader
33
-
34
- import magic
35
- from bs4 import BeautifulSoup
36
- from docx import Document as Doc
37
- from pypdf import PdfReader
38
- from docx import Document
39
-
40
- PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
41
- # -
42
-
43
- split_strat = [".", "!", "?", "\n\n", "\n", ",", " ", ""]
44
- chunk_size = 1000
45
- chunk_overlap = 200
46
-
47
- ## Overarching ingest function:
48
-
49
-
50
def determine_file_type(file_path):
    """
    Look up a file's MIME type with magic.from_file.

    Parameters:
    file_path (str): Path to the file.

    Returns:
    str: MIME type reported for the file.
    """
    mime_type = magic.from_file(file_path, mime=True)
    return mime_type
61
-
62
def parse_pdf(file) -> List[str]:
    """
    Extract cleaned text, page by page, from one or more PDF files.

    Parameters:
    file: A single path (str or os.PathLike), or a sequence whose items are
        either paths or objects that expose the path as `.name` (for example
        uploaded-file objects).

    Returns:
    List[str]: One cleaned text string per page, in input order then page order.
    """
    # A lone path is treated as a one-item list, so callers that hand over a
    # plain path (as parse_file does) work as well as callers passing a list.
    if isinstance(file, (str, os.PathLike)):
        file = [file]

    output = []
    for item in file:
        pdf_path = item if isinstance(item, (str, os.PathLike)) else item.name
        print(pdf_path)
        pdf = PdfReader(pdf_path)
        for page in pdf.pages:
            text = page.extract_text()
            # Merge hyphenated words
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            # Fix newlines in the middle of sentences
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            # Remove multiple newlines
            text = re.sub(r"\n\s*\n", "\n\n", text)
            output.append(text)
    return output
88
-
89
-
90
def parse_docx(file_path):
    """
    Read a .docx file and return its paragraphs as one string.

    Parameters:
    - file_path (str): Path to the .docx file.

    Returns:
    - str: The text of every paragraph, joined by newlines.
    """
    document = Doc(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
105
-
106
-
107
def parse_txt(file_path):
    """
    Return the full contents of a UTF-8 text file.

    Parameters:
    file_path (str): Path to the TXT or HTML file.

    Returns:
    str: Text content of the file.
    """
    with open(file_path, mode='r', encoding="utf-8") as handle:
        contents = handle.read()
    return contents
119
-
120
-
121
-
122
def parse_file(file_paths):
    """
    Parse each file with the parser registered for its MIME type.

    Parameters:
    file_paths (list): List of file paths.

    Returns:
    dict: Maps each file path to its parsed content, or to an
        "Unsupported file type: ..." message when no parser is registered.

    Raises:
    ValueError: If file_paths is not a list.
    """
    if not isinstance(file_paths, list):
        raise ValueError("Expected a list of file paths.")

    parsers_by_mime = {
        'application/pdf': parse_pdf,
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parse_docx,
        'text/plain': parse_txt,
        'text/html': parse_html
    }

    parsed_contents = {}

    for path in file_paths:
        detected_type = determine_file_type(path)
        parser = parsers_by_mime.get(detected_type)
        if parser is None:
            parsed_contents[path] = f"Unsupported file type: {detected_type}"
        else:
            parsed_contents[path] = parser(path)

    return parsed_contents
153
-
154
-
155
-
156
-
157
def parse_html(page_url, div_filter="p", timeout=30):
    """
    Load HTML from a web URL or a local HTML file and extract the text of every `div_filter` tag. Also tries to extract a date (WIP).

    Parameters:
    page_url (str): The web URL or local file path.
    div_filter (str, optional): HTML tag whose text is collected. Defaults to "p".
    timeout (float, optional): Seconds to wait for a web request. Defaults to 30.

    Returns:
    tuple: (texts, metadatas). texts is a one-item list holding the extracted
        text; metadatas is a one-item list holding {"source": page_url, "date": <date string>}.

    Raises:
    ValueError: If page_url is neither a web URL nor an existing local .html/.htm file.
    """

    def is_web_url(s):
        """
        Check if the input string is a web URL.
        """
        return s.startswith(("http://", "https://"))

    def is_local_html_file(s):
        """
        Check if the input string is a path to a local HTML file.
        """
        return s.endswith((".html", ".htm")) and os.path.isfile(s)

    def extract_text_from_source(source):
        """
        Return the raw HTML of a web URL or a local HTML file.

        Parameters:
        source (str): The web URL or local file path.

        Returns:
        str: Raw HTML content.
        """
        if is_web_url(source):
            # A timeout keeps an unresponsive server from hanging the ingest.
            response = requests.get(source, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            return response.text
        if is_local_html_file(source):
            with open(source, 'r', encoding='utf-8') as file:
                return file.read()
        raise ValueError("Input is neither a valid web URL nor a local HTML file path.")

    def clean_html_data(data, date_filter="", div_filt="p"):
        """
        Extracts and cleans data from HTML content.

        Parameters:
        data (str): HTML content to be parsed.
        date_filter (str, optional): Date string to filter results. If set, content dated before it is dropped.
        div_filt (str, optional): HTML tag to search for text content. Defaults to "p".

        Returns:
        tuple: Contains extracted text and date as strings. Returns empty strings if not found.
        """
        soup = BeautifulSoup(data, 'html.parser')

        text_elements = soup.find_all(div_filt)
        date_elements = soup.find_all(div_filt, {"class": "page-neutral-intro__meta"})

        # Extract date
        date_out = ""
        if date_elements:
            date_match = re.search(">(.*?)<", str(date_elements[0]))
            if date_match:
                date_out = date_match.group(1)

        # Dates are parsed only when a filter is requested and a date was
        # found, so a missing or unparsable date cannot break plain extraction.
        if date_filter and date_out:
            date_dt = dateutil.parser.parse(date_out)
            date_filter_dt = dateutil.parser.parse(date_filter)
            if date_dt < date_filter_dt:
                return '', date_out

        # Extract text
        text_out_final = ""
        if text_elements:
            text_out_final = '\n'.join(paragraph.text for paragraph in text_elements)
        else:
            print(f"No elements found with tag '{div_filt}'. No text returned.")

        return text_out_final, date_out

    html_text = extract_text_from_source(page_url)

    clean_text, date = clean_html_data(html_text, date_filter="", div_filt=div_filter)

    texts = [clean_text]
    metadatas = [{"source": page_url, "date":str(date)}]

    return texts, metadatas
257
-
258
-
259
- # +
260
- # Convert parsed text to docs
261
- # -
262
-
263
def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
    """
    Converts the output of parse_file (a dictionary of file paths to content)
    to a list of Documents with metadata.

    Parameters:
    text_dict (dict): Maps file paths to parsed content. The file extension
        decides how the content is chunked.
    chunk_size (int): Chunk size handed to the splitter functions.

    Returns:
    List[Document]: All chunks, each with metadata["file"] set to its file path.
        Files with unsupported extensions are skipped with a printed message.
    """

    doc_chunks = []

    for file_path, content in text_dict.items():
        ext = os.path.splitext(file_path)[1].lower()

        # Depending on the file extension, handle the content
        if ext == '.pdf':
            docs = pdf_text_to_docs(content, chunk_size)
        elif ext in ['.html', '.htm', '.txt', '.docx']:
            if isinstance(content, tuple):
                # parse_html returns (texts, metadatas)
                texts, metadatas = content
            else:
                # parse_txt / parse_docx return a single string
                texts, metadatas = [content], [{}]
            # html_text_to_docs takes texts and metadatas separately; passing
            # chunk_size in second position put it in the metadatas slot.
            docs = html_text_to_docs(texts, metadatas, chunk_size)
        else:
            print(f"Unsupported file type {ext} for {file_path}. Skipping.")
            continue

        # Add filename as metadata
        for doc in docs:
            doc.metadata["file"] = file_path

        doc_chunks.extend(docs)

    return doc_chunks
291
-
292
-
293
-
294
def pdf_text_to_docs(text: str, chunk_size: int = chunk_size) -> List[Document]:
    """
    Converts a string or list of strings to a list of chunked Documents with metadata.

    Parameters:
    text: One page of text as a string, or a list with one string per page.
    chunk_size (int): Maximum chunk size given to the text splitter.

    Returns:
    List[Document]: One Document per chunk. Metadata holds "page" (1-based),
        "chunk" (0-based within the page) and "page_chunk" ("<page>-<chunk>").
    """
    if isinstance(text, str):
        # Take a single string as one page
        text = [text]

    # NOTE(review): this file imports the name `Document` from both langchain
    # and docx; the langchain class is what is needed here - confirm which
    # import wins at runtime.

    # The splitter settings do not change between pages, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=split_strat,
        chunk_overlap=chunk_overlap,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        for chunk_number, chunk in enumerate(text_splitter.split_text(page_text)):
            chunk_doc = Document(
                page_content=chunk, metadata={"page": page_number, "chunk": chunk_number}
            )
            # Add sources as metadata
            chunk_doc.metadata["page_chunk"] = f"{page_number}-{chunk_number}"
            doc_chunks.append(chunk_doc)
    return doc_chunks
327
-
328
def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
    """
    Split texts into chunked documents and number the chunks.

    Parameters:
    texts: Texts passed to the splitter's create_documents.
    metadatas: Metadata passed alongside texts to create_documents.
    chunk_size (int): Maximum chunk size for the splitter.

    Returns:
    The documents from create_documents, each with metadata["chunk"] set to
        its 1-based position in the returned list.
    """
    splitter_settings = dict(
        separators=split_strat,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

    documents = text_splitter.create_documents(texts, metadatas=metadatas)

    for chunk_number, chunk in enumerate(documents, start=1):
        chunk.metadata["chunk"] = chunk_number

    return documents
346
-
347
-
348
-
349
-
350
-
351
-
352
- # # Functions for working with documents after loading them back in
353
-
354
def pull_out_data(series):
    """
    Evaluate each string in the series as a tuple and return the tuples' second items.

    Parameters:
    series (pd.Series): Strings that each evaluate to a tuple.

    Returns:
    pd.Series: The second item of every tuple, with a fresh default index.
    """

    def to_tuple(x):
        # NOTE(review): eval runs arbitrary code from the input strings. Only
        # use this on trusted data, or consider ast.literal_eval.
        return eval(x)

    parsed = series.apply(to_tuple)

    # Transpose the tuples and keep the column of second items.
    second_items = list(zip(*parsed))[1]

    return pd.Series(list(second_items))
367
-
368
-
369
def docs_from_csv(df):
    """
    Rebuild Document objects from a data frame read back from a saved CSV.

    Parameters:
    df (pd.DataFrame): Column "0" is parsed for page content and column "1"
        for metadata, both via pull_out_data.

    Returns:
    list: One Document per row of df.
    """
    page_content = pull_out_data(df["0"])
    metadatas = pull_out_data(df["1"])

    return [
        Document(page_content=page_content[row], metadata=metadatas[row])
        for row in range(len(df))
    ]
383
-
384
-
385
def docs_from_lists(docs, metadatas):
    """
    Pair each text with the metadata at the same position and wrap them as Documents.

    Parameters:
    docs: Iterable of page-content values.
    metadatas: Indexable collection of metadata, one entry per item in docs.

    Returns:
    list: One Document per item in docs.
    """
    return [
        Document(page_content=text, metadata=metadatas[position])
        for position, text in enumerate(docs)
    ]
394
-
395
-
396
def docs_elements_from_csv_save(docs_path="documents.csv"):
    """
    Load saved documents from CSV and return them with their content, metadata and sources.

    Parameters:
    docs_path (str): Path to the CSV file of saved documents.

    Returns:
    tuple: (out_df, docs_content, docs_meta, doc_sources), where doc_sources
        is the 'source' value of every metadata entry.
    """
    saved_rows = pd.read_csv(docs_path)
    out_df = pd.DataFrame(docs_from_csv(saved_rows))

    docs_content = pull_out_data(out_df[0].astype(str))
    docs_meta = pull_out_data(out_df[1].astype(str))

    doc_sources = [meta['source'] for meta in docs_meta]

    return out_df, docs_content, docs_meta, doc_sources
411
-
412
-
413
- # documents = html_text_to_docs(texts, metadatas)
414
- #
415
- # documents[0]
416
- #
417
- # pd.DataFrame(documents).to_csv("documents.csv", index=None)
418
-
419
- # ## Create embeddings and save faiss vector store to the path specified in `save_to`
420
-
421
def load_embeddings(model_name = "hkunlp/instructor-large"):
    """
    Create the embedding object for model_name and publish it as the module-level `embeddings`.

    Parameters:
    model_name (str): "hkunlp/instructor-large" gets instructor embeddings with
        retrieval instructions; any other name gets plain HuggingFaceEmbeddings.

    Returns:
    The embedding object. It is also stored in the global `embeddings`, which
        existing callers rely on.
    """
    global embeddings

    if model_name == "hkunlp/instructor-large":
        embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
            embed_instruction="Represent the paragraph for retrieval: ",
            query_instruction="Represent the question for retrieving supporting documents: "
        )
    else:
        embeddings_func = HuggingFaceEmbeddings(model_name=model_name)

    embeddings = embeddings_func

    # Returning the object as well lets callers avoid the global.
    return embeddings_func
437
-
438
-
439
def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "hkunlp/instructor-large"):
    """
    Embed documents into a FAISS store, then keep only a zip of the index inside `save_to`.

    Parameters:
    docs_out: Documents to embed.
    save_to (str): Folder for the index. It is created if missing.
    model_name (str): Embedding model name passed to load_embeddings.

    Returns:
    The FAISS vector store built from docs_out.
    """
    import shutil

    # Sets the module-level `embeddings` used below.
    load_embeddings(model_name=model_name)

    print(f"> Total split documents: {len(docs_out)}")

    vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)

    # Always save. Previously the index was saved only when the folder already
    # existed, yet the archive and clean-up steps below ran regardless.
    save_dir = Path(save_to)
    save_dir.mkdir(parents=True, exist_ok=True)
    vectorstore.save_local(folder_path=save_to)

    print("> DONE")
    print(f"> Saved to: {save_to}")

    ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface

    # Use the folder's own name for the zip so a nested save_to such as "a/b"
    # ends up at "a/b/b.zip".
    zip_in_folder = save_dir / (save_dir.name + '.zip')
    if zip_in_folder.exists():
        # Drop a zip from an earlier run so it is not packed into the new one.
        zip_in_folder.unlink()

    shutil.make_archive(save_to, 'zip', save_to)

    os.remove(save_dir / "index.faiss")
    os.remove(save_dir / "index.pkl")

    shutil.move(save_to + '.zip', str(zip_in_folder))

    return vectorstore
468
-
469
-
470
- # +
471
- # https://colab.research.google.com/drive/1RWqGXd2B6sPchlYVihKaBSsHy9zWRcYF#scrollTo=Q_eTIZwf4Dk2
472
-
473
def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
    """
    Embed documents into a Chroma store persisted at `save_to`.

    Parameters:
    embeddings: Embedding object passed to Chroma.from_documents.
    docs_out: Documents to embed.
    save_to (str): Directory where the store is persisted.

    Returns:
    The Chroma vector store.
    """
    # Chroma is not imported at the top of this file, so import it here.
    from langchain.vectorstores import Chroma

    print(f"> Total split documents: {len(docs_out)}")

    vectordb = Chroma.from_documents(documents=docs_out,
                                     embedding=embeddings,
                                     persist_directory=save_to)

    # persist the db to disk
    vectordb.persist()

    print("> DONE")
    print(f"> Saved to: {save_to}")

    return vectordb
487
-
488
-
489
- # + [markdown] jp-MarkdownHeadingCollapsed=true
490
- # ## Similarity search on saved vectorstore
491
- # -
492
-
493
def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
    """
    Run a similarity search against a FAISS store saved on disk and print the results.

    Parameters:
    query (str): Text to search for.
    k_val (int): Number of results to retrieve.
    save_to (str): Folder holding the saved FAISS index.

    Returns:
    list: The (document, score) pairs from similarity_search_with_score.
    """
    # Sets the module-level `embeddings` used below.
    load_embeddings()

    docsearch = FAISS.load_local(folder_path=save_to, embeddings=embeddings)

    # The original called display(Markdown(question)), but none of those names
    # is defined in this file; print the query instead.
    print(query)

    search = docsearch.similarity_search_with_score(query, k=k_val)

    for item in search:
        print(item[0].page_content)
        print(f"Page: {item[0].metadata['source']}")
        print(f"Date: {item[0].metadata['date']}")
        print(f"Score: {item[1]}")
        print("---")

    return search