vidhiparikh committed on
Commit
f8ac855
·
verified ·
1 Parent(s): 074e0cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -41
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import PyPDF2
2
  import gradio as gr
3
- import os
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.llms import LlamaCpp
6
 
@@ -14,31 +13,26 @@ from sentence_transformers import SentenceTransformer, util
14
  from langchain.callbacks.manager import CallbackManager
15
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
16
 
17
- from ctransformers import AutoModelForCausalLM
18
-
19
- # Customized file paths
20
  pdf_files = ["CV_Vidhi_Parikh.pdf"]
21
 
22
- # Function to extract documents from PDF files
23
- def extract_documents_from_pdf(pdf_files):
24
  documents = []
25
  metadata = []
26
  content = []
27
  for pdf in pdf_files:
28
  pdf_reader = PyPDF2.PdfReader(pdf)
29
- for index, page in enumerate(pdf_reader.pages):
30
- document_page = {'title': pdf + " page " + str(index + 1),'content': page.extract_text()}
31
  documents.append(document_page)
32
  for doc in documents:
33
  content.append(doc["content"])
34
  metadata.append({
35
  "title": doc["title"]
36
  })
37
- print("Documents extracted from PDF files.")
38
  return content, metadata
39
 
40
- # Function to split documents into text chunks
41
- def split_documents_into_chunks(content, metadata):
42
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
43
  chunk_size=512,
44
  chunk_overlap=256,
@@ -47,34 +41,30 @@ def split_documents_into_chunks(content, metadata):
47
  print(f"Documents split into {len(split_documents)} passages.")
48
  return split_documents
49
 
50
- # Function to ingest split documents into the vector database
51
- def ingest_into_vector_database(split_documents):
52
  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
53
  database = FAISS.from_documents(split_documents, embeddings)
54
- DB_PATH = 'vectorstore/vector_database'
55
  database.save_local(DB_PATH)
56
  return database
57
 
58
- # Customized conversation template
59
  template = """[INST]
60
  As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
61
  - Answer the question based on the provided documents.
62
- - Be concise and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no, etc.
63
  - Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
64
  - If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
65
  - Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
66
  - Do not fabricate information or include questions in your responses.
67
  - Do not prompt to select answers. Do not ask additional questions.
68
- - Cite the source of where exactly the information in the document is found and mention it in your responses.
69
  {question}
70
  [/INST]
71
  """
72
 
73
- # Callback manager for handling callbacks
74
  callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
75
 
76
- # Function to create a conversational chain
77
- def create_conversational_chain(vectordb):
78
  llama_llm = LlamaCpp(
79
  model_path="llama-2-7b-chat.Q8_0.gguf",
80
  temperature=0.75,
@@ -83,7 +73,7 @@ def create_conversational_chain(vectordb):
83
  callback_manager=callback_manager,
84
  n_ctx=3000)
85
 
86
- retriever = vectordb.as_retriever()
87
  CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)
88
 
89
  memory = ConversationBufferMemory(
@@ -95,7 +85,7 @@ def create_conversational_chain(vectordb):
95
  #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
96
  memory=memory,
97
  return_source_documents=True))
98
- print("Conversational Chain created for the LLM using the vector store")
99
  return conversation_chain
100
 
101
  def validate_answer(response_answer, source_documents):
@@ -113,30 +103,21 @@ def validate_answer(response_answer, source_documents):
113
 
114
  return False
115
 
116
- # Extract documents from PDF files
117
- content, metadata = extract_documents_from_pdf(pdf_files)
118
-
119
- # Split documents into text chunks
120
- split_documents = split_documents_into_chunks(content, metadata)
121
-
122
- # Ingest split documents into the vector database
123
- vector_database = ingest_into_vector_database(split_documents)
124
  print("Vector database created.")
 
125
 
126
- # Create the conversation chain
127
- conversation_chain = create_conversational_chain(vector_database)
128
-
129
- # Function for the chatbot
130
- def chat_with_bot(input_text):
131
  user_query = input_text
132
  response = conversation_chain({"question": user_query})
133
- print("Response:", response)
134
- print("Answer:", response['answer'])
135
  return response['answer']
136
 
137
- # Create Gradio interface
138
  iface = gr.Interface(
139
- fn=chat_with_bot,
140
  inputs=gr.inputs.Textbox(lines=2, label="User Input"),
141
  outputs="text",
142
  layout="vertical",
@@ -144,6 +125,4 @@ iface = gr.Interface(
144
  description="Enter your message and the chatbot will respond."
145
  )
146
 
147
- # Launch the interface
148
  iface.launch()
149
- #
 
1
  import PyPDF2
2
  import gradio as gr
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.llms import LlamaCpp
5
 
 
13
  from langchain.callbacks.manager import CallbackManager
14
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
15
 
 
 
 
16
  pdf_files = ["CV_Vidhi_Parikh.pdf"]
17
 
18
def extract_documents(pdf_files):
    """Extract per-page text and page titles from a list of PDF files.

    Args:
        pdf_files: Iterable of PDF paths (anything ``PyPDF2.PdfReader``
            accepts).

    Returns:
        A ``(content, metadata)`` tuple of parallel lists. ``content[i]`` is
        the text extracted from one page and ``metadata[i]`` is
        ``{"title": "<pdf> page <n>"}``, where ``<n>`` is the 1-based page
        number within that PDF.
    """
    content = []
    metadata = []
    for pdf in pdf_files:
        pdf_reader = PyPDF2.PdfReader(pdf)
        # Use the page object produced by the iteration directly instead of
        # re-indexing pdf_reader.pages on every pass.
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Fall back to "" so a page without extractable text can never
            # put a non-string value into the content list.
            content.append(page.extract_text() or "")
            metadata.append({"title": f"{pdf} page {page_number}"})
    print("Content and metadata extracted from the documents.")
    return content, metadata
34
 
35
+ def split_text_chunks(content, metadata):
 
36
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
37
  chunk_size=512,
38
  chunk_overlap=256,
 
41
  print(f"Documents split into {len(split_documents)} passages.")
42
  return split_documents
43
 
44
def ingest_into_database(split_documents, db_path='vectorstore/db_faiss'):
    """Embed the split documents, index them with FAISS and save the index.

    Args:
        split_documents: Documents handed to ``FAISS.from_documents``.
        db_path: Directory the index is saved to. Defaults to
            ``'vectorstore/db_faiss'``, the location that was previously
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The FAISS vector store built from ``split_documents``.
    """
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    database = FAISS.from_documents(split_documents, embeddings)
    # Persist the index so it can be reloaded without re-embedding.
    database.save_local(db_path)
    return database
50
 
 
51
  template = """[INST]
52
  As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
53
  - Answer the question based on the provided documents.
54
+ - Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
55
  - Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
56
  - If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
57
  - Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
58
  - Do not fabricate information or include questions in your responses.
59
  - Do not prompt to select answers. Do not ask additional questions.
60
+ - Cite the source of where exactly is the information in the document and mention it in your responses.
61
  {question}
62
  [/INST]
63
  """
64
 
 
65
  callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
66
 
67
+ def create_conversation_chain(database):
 
68
  llama_llm = LlamaCpp(
69
  model_path="llama-2-7b-chat.Q8_0.gguf",
70
  temperature=0.75,
 
73
  callback_manager=callback_manager,
74
  n_ctx=3000)
75
 
76
+ retriever = database.as_retriever()
77
  CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)
78
 
79
  memory = ConversationBufferMemory(
 
85
  #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
86
  memory=memory,
87
  return_source_documents=True))
88
+ print("Conversational Chain created for the LLM using the vector store.")
89
  return conversation_chain
90
 
91
  def validate_answer(response_answer, source_documents):
 
103
 
104
  return False
105
 
106
+ content, metadata = extract_documents(pdf_files)
107
+ split_documents = split_text_chunks(content, metadata)
108
+ database = ingest_into_database(split_documents)
 
 
 
 
 
109
  print("Vector database created.")
110
+ conversation_chain = create_conversation_chain(database)
111
 
112
def chat(input_text):
    """Forward the user's message to the conversation chain.

    Args:
        input_text: The question typed by the user.

    Returns:
        The ``'answer'`` entry of the chain's response.
    """
    response = conversation_chain({"question": input_text})
    # Debug output: first the full response, then just the answer.
    print("Answer: ", response)
    print(" Only answer:", response['answer'])
    return response['answer']
118
 
 
119
  iface = gr.Interface(
120
+ fn=chat,
121
  inputs=gr.inputs.Textbox(lines=2, label="User Input"),
122
  outputs="text",
123
  layout="vertical",
 
125
  description="Enter your message and the chatbot will respond."
126
  )
127
 
 
128
  iface.launch()