JaiSurya committed on
Commit
9f493b6
1 Parent(s): b06b29e

Initial app setup

.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Chat With Pdf
+ emoji: 💬
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+ import core
+
+ def process_pdf_and_text(pdf_file_path, user_text):
+     print(f"[INFO] The pdf file is at {pdf_file_path}")
+     # Process the PDF only once per session; later queries reuse the embeddings.
+     if not hasattr(process_pdf_and_text, "_called"):
+         core.process_pdf(pdf_file_path)
+         process_pdf_and_text._called = True
+
+     result = core.process_query(user_text)
+     return result
+
+ def main():
+     # input components
+     pdf_input = gr.File(label="Upload PDF File")
+     text_input = gr.TextArea(label="Enter the query")
+     # output component
+     output_text = gr.TextArea()
+
+     # app interface
+     demo = gr.Interface(
+         fn=process_pdf_and_text,
+         inputs=[pdf_input, text_input],
+         outputs=output_text,
+         title="Chat With PDF",
+         description="RAG-based chat with a PDF",
+     )
+
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
.ipynb_checkpoints/core-checkpoint.py ADDED
@@ -0,0 +1,14 @@
+ from embeddings import Embeddings
+ from rag import RAG
+
+ # Module-level singleton: set by process_pdf, used by process_query.
+ rag_ = None
+
+ def process_pdf(file: str):
+     emb = Embeddings(file)
+     emb.save_the_embeddings()
+     global rag_
+     rag_ = RAG()
+
+ def process_query(user_text: str):
+     return rag_.query(user_text)
.ipynb_checkpoints/embeddings-checkpoint.py ADDED
@@ -0,0 +1,120 @@
+ # This file contains all the functionality from PDF extraction to embeddings
+ import re
+
+ from tqdm import tqdm
+ from spacy.lang.en import English
+ import fitz  # PyMuPDF
+ import pandas as pd
+
+ import torch
+ from sentence_transformers import SentenceTransformer
+
+ class Embeddings:
+
+     def __init__(self, pdf_file_path: str):
+         self.pdf_file_path = pdf_file_path
+         self.embedding_model_name = "all-mpnet-base-v2"
+         self.device = self.get_device()
+
+     def get_device(self) -> str:
+         """Returns the device to run on"""
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         return device
+
+     def text_formatter(self, text: str) -> str:
+         """Replace the newline characters ('\n') in the text with spaces"""
+         formatted_text = text.replace("\n", " ").strip()
+         return formatted_text
+
+     def count_and_split_sentence(self, text: str) -> tuple[int, list[str]]:
+         """Count and split the sentences in the given text"""
+         nlp = English()
+         nlp.add_pipe("sentencizer")
+
+         list_of_sentences = list(nlp(text).sents)
+         list_of_sentences = [str(sentence) for sentence in list_of_sentences]
+
+         return len(list_of_sentences), list_of_sentences
+
+     def open_pdf(self):
+         """Convert the PDF into a list of per-page dicts"""
+         doc = fitz.open(self.pdf_file_path)
+         data = []
+
+         print("[INFO] Converting the pdf into dict dtype")
+         for page_number, page in tqdm(enumerate(doc)):
+             text = page.get_text()
+             text = self.text_formatter(text=text)
+
+             sentence_count, sentences = self.count_and_split_sentence(text)
+
+             data.append(
+                 {
+                     "page_number": page_number,
+                     "char_count": len(text),
+                     "word_count": len(text.split(" ")),
+                     "sentence_count": sentence_count,
+                     "token_count": len(text) / 4,  # 1 token ~= 4 characters
+                     "sentence": sentences,
+                     "text": text,
+                 }
+             )
+
+         return data
+
+     def split_the_array(self, array_list: list,
+                         chunk_length: int) -> list[list[str]]:
+         """Split the array of sentences into groups of chunks"""
+         return [array_list[i:i + chunk_length] for i in range(0, len(array_list), chunk_length)]
+
+     def convert_to_chunk(self, chunk_size: int = 10) -> list[dict]:
+         """Convert the sentences into chunks"""
+         pages_and_texts = self.open_pdf()
+         pages_and_chunks = []
+
+         # split each page's sentences into chunks
+         print("[INFO] Splitting the sentences into chunks")
+         for item in tqdm(pages_and_texts):
+             item["sentence_chunks"] = self.split_the_array(item["sentence"], chunk_size)
+             item["chunk_count"] = len(item["sentence_chunks"])
+
+         # join each chunk's sentences into a single string
+         print("[INFO] Joining the chunks into strings")
+         for item in tqdm(pages_and_texts):
+             for chunks in item["sentence_chunks"]:
+                 d = {}
+
+                 joined_sentence = "".join(chunks).replace("  ", " ").strip()
+                 joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)  # ".A" -> ". A" (restore the space after a sentence ends)
+
+                 if len(joined_sentence) / 4 > 30:  # keep only chunks longer than ~30 tokens
+                     d["page_number"] = item["page_number"]
+                     d["sentence_chunk"] = joined_sentence
+                     # stats
+                     d["char_count"] = len(joined_sentence)
+                     d["word_count"] = len(joined_sentence.split(" "))
+                     d["token_count"] = len(joined_sentence) / 4  # 1 token ~= 4 characters
+
+                     pages_and_chunks.append(d)
+
+         return pages_and_chunks
+
+     def convert_to_embedds(self, chunk_size: int = 10) -> list[dict]:
+         data = self.convert_to_chunk(chunk_size)
+
+         embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
+         print("[INFO] Converting into embeddings")
+         for item in tqdm(data):
+             item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor=True)
+
+         return data
+
+     def save_the_embeddings(self, filename: str = "embeddings.csv", data: list[dict] = None):
+         embedd_file = filename
+         if data is None:
+             data = self.convert_to_embedds()
+         dataframe = pd.DataFrame(data)
+         dataframe.to_csv(embedd_file, index=False)
.ipynb_checkpoints/rag-checkpoint.py ADDED
@@ -0,0 +1,91 @@
+ # This file contains all steps from retrieval to generation
+ import torch
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ class RAG:
+
+     def __init__(self):
+         self.model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         self.embedding_model_name = "all-mpnet-base-v2"
+         self.embeddings_filename = "embeddings.csv"
+         self.data_pd = pd.read_csv(self.embeddings_filename)
+         self.data_dict = self.data_pd.to_dict(orient="records")
+         self.data_embeddings = self.get_embeddings()
+
+         self.embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
+         # Tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+
+         # LLM
+         self.llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=self.model_id,
+                                                               torch_dtype=torch.float16).to(self.device)
+
+     def get_embeddings(self) -> torch.Tensor:
+         """Returns the embeddings parsed from the csv file"""
+         data_embeddings = []
+
+         # The embeddings were saved to csv as strings like "tensor([0.1, ...])",
+         # so parse the numbers back out of each cell.
+         for tensor_str in self.data_pd["embeddings"]:
+             values_str = tensor_str.split("[")[1].split("]")[0]
+             values_list = [float(val) for val in values_str.split(",")]
+             tensor_result = torch.tensor(values_list)
+             data_embeddings.append(tensor_result)
+
+         data_embeddings = torch.stack(data_embeddings).to(self.device)
+         return data_embeddings
+
+     def retrieve_relevant_resource(self, user_query: str, k: int = 5):
+         """Retrieve the top-k most relevant chunks for a query"""
+         query_embedding = self.embedding_model.encode(user_query, convert_to_tensor=True).to(self.device)
+         dot_score = util.dot_score(a=query_embedding, b=self.data_embeddings)[0]
+         score, idx = torch.topk(dot_score, k=k)
+         return score, idx
+
+     def prompt_formatter(self, query: str, context_items: list[dict]) -> str:
+         """
+         Augments the query with text-based context from context_items.
+         """
+         # Join context items into one bulleted paragraph
+         context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
+
+         base_prompt = """Based on the following context items, please answer the query.
+ \nNow use the following context items to answer the user query:
+ {context}
+ \nRelevant passages: <extract relevant passages from the context here>
+ User query: {query}
+ Answer:"""
+
+         # Update base prompt with context items and query
+         base_prompt = base_prompt.format(context=context, query=query)
+
+         # Create prompt template for instruction-tuned model
+         dialogue_template = [
+             {"role": "user",
+              "content": base_prompt}
+         ]
+
+         # Apply the chat template
+         prompt = self.tokenizer.apply_chat_template(conversation=dialogue_template,
+                                                     tokenize=False,
+                                                     add_generation_prompt=True)
+         return prompt
+
+     def query(self, user_text: str):
+         scores, indices = self.retrieve_relevant_resource(user_text)
+         context_items = [self.data_dict[i] for i in indices]
+         prompt = self.prompt_formatter(query=user_text, context_items=context_items)
+         input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+         outputs = self.llm_model.generate(**input_ids, max_new_tokens=256)
+         output_text = self.tokenizer.decode(outputs[0])
+         # Keep only the assistant's reply from the decoded transcript
+         output_text = output_text.split("<|assistant|>")[1].split("</s>")[0]
+
+         return output_text
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,9 @@
+ numpy
+ pandas
+ spacy
+ tqdm
+ PyMuPDF
+ torch
+ sentence_transformers
+ transformers
+ gradio
app.py CHANGED
@@ -1,63 +1,32 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
+ import core
+
+ def process_pdf_and_text(pdf_file_path, user_text):
+     print(f"[INFO] The pdf file is at {pdf_file_path}")
+     # Process the PDF only once per session; later queries reuse the embeddings.
+     if not hasattr(process_pdf_and_text, "_called"):
+         core.process_pdf(pdf_file_path)
+         process_pdf_and_text._called = True
+
+     result = core.process_query(user_text)
+     return result
+
+ def main():
+     # input components
+     pdf_input = gr.File(label="Upload PDF File")
+     text_input = gr.TextArea(label="Enter the query")
+     # output component
+     output_text = gr.TextArea()
+
+     # app interface
+     demo = gr.Interface(
+         fn=process_pdf_and_text,
+         inputs=[pdf_input, text_input],
+         outputs=output_text,
+         title="Chat With PDF",
+         description="RAG-based chat with a PDF",
+     )
+
+     demo.launch()
+
  if __name__ == "__main__":
-     demo.launch()
+     main()
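
Note: the `_called` attribute guard above indexes only the first uploaded PDF for the lifetime of the process; any later upload silently reuses the old embeddings. A minimal sketch of a per-upload alternative, tracking the last processed path (the `_last_path` cache variable is hypothetical, not part of this commit):

```python
import core

_last_path = None  # hypothetical cache of the most recently indexed PDF path

def process_pdf_and_text(pdf_file_path, user_text):
    global _last_path
    # Rebuild the embeddings whenever a different PDF is uploaded.
    if pdf_file_path != _last_path:
        core.process_pdf(pdf_file_path)
        _last_path = pdf_file_path
    return core.process_query(user_text)
```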
core.py ADDED
@@ -0,0 +1,14 @@
+ from embeddings import Embeddings
+ from rag import RAG
+
+ # Module-level singleton: set by process_pdf, used by process_query.
+ rag_ = None
+
+ def process_pdf(file: str):
+     emb = Embeddings(file)
+     emb.save_the_embeddings()
+     global rag_
+     rag_ = RAG()
+
+ def process_query(user_text: str):
+     return rag_.query(user_text)
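
core.py keeps a single module-level `RAG` instance, so `process_query` only works after `process_pdf` has run once and written `embeddings.csv`. A hedged usage sketch (the file name `example.pdf` is a placeholder):

```python
import core

core.process_pdf("example.pdf")  # builds embeddings.csv, then loads the RAG pipeline
print(core.process_query("What is this document about?"))
```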
embeddings.py ADDED
@@ -0,0 +1,120 @@
+ # This file contains all the functionality from PDF extraction to embeddings
+ import re
+
+ from tqdm import tqdm
+ from spacy.lang.en import English
+ import fitz  # PyMuPDF
+ import pandas as pd
+
+ import torch
+ from sentence_transformers import SentenceTransformer
+
+ class Embeddings:
+
+     def __init__(self, pdf_file_path: str):
+         self.pdf_file_path = pdf_file_path
+         self.embedding_model_name = "all-mpnet-base-v2"
+         self.device = self.get_device()
+
+     def get_device(self) -> str:
+         """Returns the device to run on"""
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         return device
+
+     def text_formatter(self, text: str) -> str:
+         """Replace the newline characters ('\n') in the text with spaces"""
+         formatted_text = text.replace("\n", " ").strip()
+         return formatted_text
+
+     def count_and_split_sentence(self, text: str) -> tuple[int, list[str]]:
+         """Count and split the sentences in the given text"""
+         nlp = English()
+         nlp.add_pipe("sentencizer")
+
+         list_of_sentences = list(nlp(text).sents)
+         list_of_sentences = [str(sentence) for sentence in list_of_sentences]
+
+         return len(list_of_sentences), list_of_sentences
+
+     def open_pdf(self):
+         """Convert the PDF into a list of per-page dicts"""
+         doc = fitz.open(self.pdf_file_path)
+         data = []
+
+         print("[INFO] Converting the pdf into dict dtype")
+         for page_number, page in tqdm(enumerate(doc)):
+             text = page.get_text()
+             text = self.text_formatter(text=text)
+
+             sentence_count, sentences = self.count_and_split_sentence(text)
+
+             data.append(
+                 {
+                     "page_number": page_number,
+                     "char_count": len(text),
+                     "word_count": len(text.split(" ")),
+                     "sentence_count": sentence_count,
+                     "token_count": len(text) / 4,  # 1 token ~= 4 characters
+                     "sentence": sentences,
+                     "text": text,
+                 }
+             )
+
+         return data
+
+     def split_the_array(self, array_list: list,
+                         chunk_length: int) -> list[list[str]]:
+         """Split the array of sentences into groups of chunks"""
+         return [array_list[i:i + chunk_length] for i in range(0, len(array_list), chunk_length)]
+
+     def convert_to_chunk(self, chunk_size: int = 10) -> list[dict]:
+         """Convert the sentences into chunks"""
+         pages_and_texts = self.open_pdf()
+         pages_and_chunks = []
+
+         # split each page's sentences into chunks
+         print("[INFO] Splitting the sentences into chunks")
+         for item in tqdm(pages_and_texts):
+             item["sentence_chunks"] = self.split_the_array(item["sentence"], chunk_size)
+             item["chunk_count"] = len(item["sentence_chunks"])
+
+         # join each chunk's sentences into a single string
+         print("[INFO] Joining the chunks into strings")
+         for item in tqdm(pages_and_texts):
+             for chunks in item["sentence_chunks"]:
+                 d = {}
+
+                 joined_sentence = "".join(chunks).replace("  ", " ").strip()
+                 joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)  # ".A" -> ". A" (restore the space after a sentence ends)
+
+                 if len(joined_sentence) / 4 > 30:  # keep only chunks longer than ~30 tokens
+                     d["page_number"] = item["page_number"]
+                     d["sentence_chunk"] = joined_sentence
+                     # stats
+                     d["char_count"] = len(joined_sentence)
+                     d["word_count"] = len(joined_sentence.split(" "))
+                     d["token_count"] = len(joined_sentence) / 4  # 1 token ~= 4 characters
+
+                     pages_and_chunks.append(d)
+
+         return pages_and_chunks
+
+     def convert_to_embedds(self, chunk_size: int = 10) -> list[dict]:
+         data = self.convert_to_chunk(chunk_size)
+
+         embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
+         print("[INFO] Converting into embeddings")
+         for item in tqdm(data):
+             item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor=True)
+
+         return data
+
+     def save_the_embeddings(self, filename: str = "embeddings.csv", data: list[dict] = None):
+         embedd_file = filename
+         if data is None:
+             data = self.convert_to_embedds()
+         dataframe = pd.DataFrame(data)
+         dataframe.to_csv(embedd_file, index=False)
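
Persisting tensors through `DataFrame.to_csv` stringifies them (cells end up as strings like `"tensor([0.1, ...])"`), which is why `rag.py` has to parse the numbers back out of each cell. A sketch of a lossless alternative using `torch.save`, assuming `data` is the list returned by `convert_to_embedds()` (the `embeddings.pt` file name is an assumption, not what this commit does):

```python
import torch

# Save: one (num_chunks, dim) tensor plus the chunk texts; no string parsing on reload.
payload = {
    "embeddings": torch.stack([item["embeddings"] for item in data]).cpu(),
    "chunks": [item["sentence_chunk"] for item in data],
}
torch.save(payload, "embeddings.pt")

# Load:
saved = torch.load("embeddings.pt")
data_embeddings = saved["embeddings"]
```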
rag.py ADDED
@@ -0,0 +1,91 @@
+ # This file contains all steps from retrieval to generation
+ import torch
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ class RAG:
+
+     def __init__(self):
+         self.model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         self.embedding_model_name = "all-mpnet-base-v2"
+         self.embeddings_filename = "embeddings.csv"
+         self.data_pd = pd.read_csv(self.embeddings_filename)
+         self.data_dict = self.data_pd.to_dict(orient="records")
+         self.data_embeddings = self.get_embeddings()
+
+         self.embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
+         # Tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+
+         # LLM
+         self.llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=self.model_id,
+                                                               torch_dtype=torch.float16).to(self.device)
+
+     def get_embeddings(self) -> torch.Tensor:
+         """Returns the embeddings parsed from the csv file"""
+         data_embeddings = []
+
+         # The embeddings were saved to csv as strings like "tensor([0.1, ...])",
+         # so parse the numbers back out of each cell.
+         for tensor_str in self.data_pd["embeddings"]:
+             values_str = tensor_str.split("[")[1].split("]")[0]
+             values_list = [float(val) for val in values_str.split(",")]
+             tensor_result = torch.tensor(values_list)
+             data_embeddings.append(tensor_result)
+
+         data_embeddings = torch.stack(data_embeddings).to(self.device)
+         return data_embeddings
+
+     def retrieve_relevant_resource(self, user_query: str, k: int = 5):
+         """Retrieve the top-k most relevant chunks for a query"""
+         query_embedding = self.embedding_model.encode(user_query, convert_to_tensor=True).to(self.device)
+         dot_score = util.dot_score(a=query_embedding, b=self.data_embeddings)[0]
+         score, idx = torch.topk(dot_score, k=k)
+         return score, idx
+
+     def prompt_formatter(self, query: str, context_items: list[dict]) -> str:
+         """
+         Augments the query with text-based context from context_items.
+         """
+         # Join context items into one bulleted paragraph
+         context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
+
+         base_prompt = """Based on the following context items, please answer the query.
+ \nNow use the following context items to answer the user query:
+ {context}
+ \nRelevant passages: <extract relevant passages from the context here>
+ User query: {query}
+ Answer:"""
+
+         # Update base prompt with context items and query
+         base_prompt = base_prompt.format(context=context, query=query)
+
+         # Create prompt template for instruction-tuned model
+         dialogue_template = [
+             {"role": "user",
+              "content": base_prompt}
+         ]
+
+         # Apply the chat template
+         prompt = self.tokenizer.apply_chat_template(conversation=dialogue_template,
+                                                     tokenize=False,
+                                                     add_generation_prompt=True)
+         return prompt
+
+     def query(self, user_text: str):
+         scores, indices = self.retrieve_relevant_resource(user_text)
+         context_items = [self.data_dict[i] for i in indices]
+         prompt = self.prompt_formatter(query=user_text, context_items=context_items)
+         input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+         outputs = self.llm_model.generate(**input_ids, max_new_tokens=256)
+         output_text = self.tokenizer.decode(outputs[0])
+         # Keep only the assistant's reply from the decoded transcript
+         output_text = output_text.split("<|assistant|>")[1].split("</s>")[0]
+
+         return output_text
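
Retrieval above is a single dot product between the query embedding and every stored chunk embedding; since `all-mpnet-base-v2` outputs unit-length vectors, the dot score equals cosine similarity. A toy sketch of the `util.dot_score` + `torch.topk` step on random stand-in data:

```python
import torch
from sentence_transformers import util

query_emb = torch.nn.functional.normalize(torch.randn(768), dim=0)       # stand-in query vector
chunk_embs = torch.nn.functional.normalize(torch.randn(10, 768), dim=1)  # stand-in corpus

scores = util.dot_score(a=query_emb, b=chunk_embs)[0]  # shape: (10,)
top_scores, top_idx = torch.topk(scores, k=5)
print(top_idx.tolist())  # indices of the 5 most similar chunks
```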
requirements.txt CHANGED
@@ -1 +1,9 @@
- huggingface_hub==0.22.2
+ numpy
+ pandas
+ spacy
+ tqdm
+ PyMuPDF
+ torch
+ sentence_transformers
+ transformers
+ gradio