Spaces:
Sleeping
Sleeping
vidhiparikh
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import PyPDF2
|
2 |
import gradio as gr
|
3 |
-
import os
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
from langchain_community.llms import LlamaCpp
|
6 |
|
@@ -14,31 +13,26 @@ from sentence_transformers import SentenceTransformer, util
|
|
14 |
from langchain.callbacks.manager import CallbackManager
|
15 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
16 |
|
17 |
-
from ctransformers import AutoModelForCausalLM
|
18 |
-
|
19 |
-
# Customized file paths
|
20 |
pdf_files = ["CV_Vidhi_Parikh.pdf"]
|
21 |
|
22 |
-
|
23 |
-
def extract_documents_from_pdf(pdf_files):
|
24 |
documents = []
|
25 |
metadata = []
|
26 |
content = []
|
27 |
for pdf in pdf_files:
|
28 |
pdf_reader = PyPDF2.PdfReader(pdf)
|
29 |
-
for index,
|
30 |
-
document_page = {'title': pdf + " page " + str(index + 1),'content':
|
31 |
documents.append(document_page)
|
32 |
for doc in documents:
|
33 |
content.append(doc["content"])
|
34 |
metadata.append({
|
35 |
"title": doc["title"]
|
36 |
})
|
37 |
-
print("
|
38 |
return content, metadata
|
39 |
|
40 |
-
|
41 |
-
def split_documents_into_chunks(content, metadata):
|
42 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
43 |
chunk_size=512,
|
44 |
chunk_overlap=256,
|
@@ -47,34 +41,30 @@ def split_documents_into_chunks(content, metadata):
|
|
47 |
print(f"Documents split into {len(split_documents)} passages.")
|
48 |
return split_documents
|
49 |
|
50 |
-
|
51 |
-
def ingest_into_vector_database(split_documents):
|
52 |
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
|
53 |
database = FAISS.from_documents(split_documents, embeddings)
|
54 |
-
DB_PATH = 'vectorstore/
|
55 |
database.save_local(DB_PATH)
|
56 |
return database
|
57 |
|
58 |
-
# Customized conversation template
|
59 |
template = """[INST]
|
60 |
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
|
61 |
- Answer the question based on the provided documents.
|
62 |
-
- Be
|
63 |
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
|
64 |
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
|
65 |
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
|
66 |
- Do not fabricate information or include questions in your responses.
|
67 |
- Do not prompt to select answers. Do not ask additional questions.
|
68 |
-
- Cite the source of where exactly the information in the document
|
69 |
{question}
|
70 |
[/INST]
|
71 |
"""
|
72 |
|
73 |
-
# Callback manager for handling callbacks
|
74 |
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
|
75 |
|
76 |
-
|
77 |
-
def create_conversational_chain(vectordb):
|
78 |
llama_llm = LlamaCpp(
|
79 |
model_path="llama-2-7b-chat.Q8_0.gguf",
|
80 |
temperature=0.75,
|
@@ -83,7 +73,7 @@ def create_conversational_chain(vectordb):
|
|
83 |
callback_manager=callback_manager,
|
84 |
n_ctx=3000)
|
85 |
|
86 |
-
retriever =
|
87 |
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)
|
88 |
|
89 |
memory = ConversationBufferMemory(
|
@@ -95,7 +85,7 @@ def create_conversational_chain(vectordb):
|
|
95 |
#condense_question_prompt=CONDENSE_QUESTION_PROMPT,
|
96 |
memory=memory,
|
97 |
return_source_documents=True))
|
98 |
-
print("Conversational Chain created for the LLM using the vector store")
|
99 |
return conversation_chain
|
100 |
|
101 |
def validate_answer(response_answer, source_documents):
|
@@ -113,30 +103,21 @@ def validate_answer(response_answer, source_documents):
|
|
113 |
|
114 |
return False
|
115 |
|
116 |
-
|
117 |
-
content, metadata
|
118 |
-
|
119 |
-
# Split documents into text chunks
|
120 |
-
split_documents = split_documents_into_chunks(content, metadata)
|
121 |
-
|
122 |
-
# Ingest split documents into the vector database
|
123 |
-
vector_database = ingest_into_vector_database(split_documents)
|
124 |
print("Vector database created.")
|
|
|
125 |
|
126 |
-
|
127 |
-
conversation_chain = create_conversational_chain(vector_database)
|
128 |
-
|
129 |
-
# Function for the chatbot
|
130 |
-
def chat_with_bot(input_text):
|
131 |
user_query = input_text
|
132 |
response = conversation_chain({"question": user_query})
|
133 |
-
print("
|
134 |
-
print("
|
135 |
return response['answer']
|
136 |
|
137 |
-
# Create Gradio interface
|
138 |
iface = gr.Interface(
|
139 |
-
fn=
|
140 |
inputs=gr.inputs.Textbox(lines=2, label="User Input"),
|
141 |
outputs="text",
|
142 |
layout="vertical",
|
@@ -144,6 +125,4 @@ iface = gr.Interface(
|
|
144 |
description="Enter your message and the chatbot will respond."
|
145 |
)
|
146 |
|
147 |
-
# Launch the interface
|
148 |
iface.launch()
|
149 |
-
#
|
|
|
1 |
import PyPDF2
|
2 |
import gradio as gr
|
|
|
3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
from langchain_community.llms import LlamaCpp
|
5 |
|
|
|
13 |
from langchain.callbacks.manager import CallbackManager
|
14 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
15 |
|
|
|
|
|
|
|
16 |
pdf_files = ["CV_Vidhi_Parikh.pdf"]
|
17 |
|
18 |
+
def extract_documents(pdf_files):
    """Extract per-page text and page titles from the given PDF files.

    Args:
        pdf_files: Iterable of PDF paths, each handed to ``PyPDF2.PdfReader``.

    Returns:
        A ``(content, metadata)`` tuple of parallel lists. ``content[i]`` is
        the text extracted from one page and ``metadata[i]`` is
        ``{"title": "<pdf> page <n>"}``, where ``n`` is the 1-based page
        number within that PDF.
    """
    content = []
    metadata = []
    for pdf in pdf_files:
        pdf_reader = PyPDF2.PdfReader(pdf)
        # Iterate the page objects directly instead of re-indexing the reader.
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Normalise a missing/empty extraction result to "" so every
            # entry in `content` is a string.
            # NOTE(review): presumably pages without a text layer yield
            # nothing here; confirm against the installed PyPDF2 version.
            content.append(page.extract_text() or "")
            metadata.append({"title": f"{pdf} page {page_number}"})
    print("Content and metadata extracted from the documents.")
    return content, metadata
|
34 |
|
35 |
+
def split_text_chunks(content, metadata):
|
|
|
36 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
37 |
chunk_size=512,
|
38 |
chunk_overlap=256,
|
|
|
41 |
print(f"Documents split into {len(split_documents)} passages.")
|
42 |
return split_documents
|
43 |
|
44 |
+
def ingest_into_database(split_documents, db_path='vectorstore/db_faiss'):
    """Embed the document chunks, build a FAISS index, and save it locally.

    Args:
        split_documents: Chunked documents passed to ``FAISS.from_documents``.
        db_path: Location passed to ``save_local``. Defaults to the path that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The vector store built from ``split_documents``.

    Raises:
        ValueError: If ``split_documents`` is empty.
    """
    # Fail early with a clear message rather than loading the embedding
    # model and then failing inside the index construction.
    if not split_documents:
        raise ValueError("No document chunks were provided to ingest.")

    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    database = FAISS.from_documents(split_documents, embeddings)
    database.save_local(db_path)
    return database
|
50 |
|
|
|
51 |
template = """[INST]
|
52 |
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
|
53 |
- Answer the question based on the provided documents.
|
54 |
+
- Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
|
55 |
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
|
56 |
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
|
57 |
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
|
58 |
- Do not fabricate information or include questions in your responses.
|
59 |
- Do not prompt to select answers. Do not ask additional questions.
|
60 |
+
- Cite the source of where exactly is the information in the document and mention it in your responses.
|
61 |
{question}
|
62 |
[/INST]
|
63 |
"""
|
64 |
|
|
|
65 |
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
|
66 |
|
67 |
+
def create_conversation_chain(database):
|
|
|
68 |
llama_llm = LlamaCpp(
|
69 |
model_path="llama-2-7b-chat.Q8_0.gguf",
|
70 |
temperature=0.75,
|
|
|
73 |
callback_manager=callback_manager,
|
74 |
n_ctx=3000)
|
75 |
|
76 |
+
retriever = database.as_retriever()
|
77 |
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)
|
78 |
|
79 |
memory = ConversationBufferMemory(
|
|
|
85 |
#condense_question_prompt=CONDENSE_QUESTION_PROMPT,
|
86 |
memory=memory,
|
87 |
return_source_documents=True))
|
88 |
+
print("Conversational Chain created for the LLM using the vector store.")
|
89 |
return conversation_chain
|
90 |
|
91 |
def validate_answer(response_answer, source_documents):
|
|
|
103 |
|
104 |
return False
|
105 |
|
106 |
+
content, metadata = extract_documents(pdf_files)
|
107 |
+
split_documents = split_text_chunks(content, metadata)
|
108 |
+
database = ingest_into_database(split_documents)
|
|
|
|
|
|
|
|
|
|
|
109 |
print("Vector database created.")
|
110 |
+
conversation_chain = create_conversation_chain(database)
|
111 |
|
112 |
+
def chat(input_text):
    """Answer one user message using the module-level conversation chain.

    The text is sent to ``conversation_chain`` under the ``"question"`` key.
    The full response and its ``'answer'`` entry are printed to stdout, and
    the ``'answer'`` entry is returned.
    """
    result = conversation_chain({"question": input_text})
    # Print the full response before reading 'answer', as the original did.
    print("Answer: ", result)
    answer = result['answer']
    print(" Only answer:", answer)
    return answer
|
118 |
|
|
|
119 |
iface = gr.Interface(
|
120 |
+
fn=chat,
|
121 |
inputs=gr.inputs.Textbox(lines=2, label="User Input"),
|
122 |
outputs="text",
|
123 |
layout="vertical",
|
|
|
125 |
description="Enter your message and the chatbot will respond."
|
126 |
)
|
127 |
|
|
|
128 |
iface.launch()
|
|