nnpy committed (verified)
Commit: d988646 · Parent(s): 4bc3cb4

Upload 3 files

Files changed (4):
  1. .gitattributes +1 -0
  2. app.py +187 -0
  3. getting_real_basecamp.pdf +3 -0
  4. requirements.txt +8 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ getting_real_basecamp.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,187 @@
+ import os
+ from io import BytesIO
+ from PIL import Image
+ import google.generativeai as genai
+ import google.ai.generativelanguage as glm
+ from langchain.vectorstores import Chroma
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ import streamlit as st
+
+ st.title("DocsGPT")
+
+ genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
+
+ st.markdown(  # CSS to hide Streamlit's viewer badge elements
+     """
+     <style>
+     .css-1jc7ptx, .e1ewe7hr3, .viewerBadge_container__1QSob,
+     .styles_viewerBadge__1yB5_, .viewerBadge_link__1S137,
+     .viewerBadge_text__1JaDK {
+         display: none;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ rag = glm.Tool(
+     function_declarations=[
+         glm.FunctionDeclaration(
+             name='vector_search',
+             description="Returns the content of the documents the user attached. Do not pass the query as a question; use **keywords** instead. Use this function to search the contents of the documents the user uploaded or attached. Avoid pasting the user's question verbatim as the query; use keywords.",
+             parameters=glm.Schema(
+                 type=glm.Type.OBJECT,
+                 properties={
+                     'query': glm.Schema(type=glm.Type.STRING),
+                 },
+                 required=['query']
+             )
+         )
+     ]
+ )
+
+ gemini = genai.GenerativeModel('gemini-pro', tools=[rag])
+ gemini_vision = genai.GenerativeModel('gemini-pro-vision')
+
+ class rawkn:  # fallback "retriever" that simply returns the raw text when no vector store is available
+     def __init__(self, text):
+         self.text = text
+     def get_relevant_documents(self, query):
+         return self.text
+
+ def loader_data(files, include_getting_real):
+     total_content = ''
+     num_pages = 0
+     if include_getting_real:
+         files.append("./getting_real_basecamp.pdf")
+     for file in files:
+         file_type = "application/pdf" if isinstance(file, str) else file.type  # bundled path is a PDF; uploads carry a MIME type
+         if file_type == "application/pdf":
+             pdf_reader = PdfReader(file)
+             content = ''
+             for page in pdf_reader.pages:
+                 num_pages += 1
+                 content += page.extract_text()
+                 for img in page.images:
+                     try:
+                         image_stream = BytesIO(img.data)
+                         img = Image.open(image_stream)
+                         img_desc = gemini_vision.generate_content(["Generate a detailed description of the image. If it is a flow chart, reproduce the flowchart exactly as it is. If it is a table, reproduce the table exactly as in the image. Write out all the text in the image if it contains any text. Explain the image clearly and in detail.\nAlso give the image content a descriptive heading.", img]).candidates[0].content.parts[0].text
+                         print("***************************")
+                         print(img_desc)
+                         print("***************************")
+                         content += "Image content:\n" + img_desc
+                     except Exception:
+                         print("cannot extract image")
+
+         if file_type == "text/plain":
+             content = file.read()
+             content = content.decode("utf-8")
+         total_content += content
+
+     if num_pages <= 2:  # scale chunk size to the length of the document
+         chunk_size = 500
+     elif num_pages <= 3:
+         chunk_size = 1000
+     elif num_pages <= 5:
+         chunk_size = 2000
+     elif num_pages <= 10:
+         chunk_size = 3000
+     else:
+         chunk_size = 4000
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
+     texts = text_splitter.split_text(total_content)
+     try:  # build a Chroma retriever; fall back to the raw text if embedding fails
+         embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+         vector_store = Chroma.from_texts(texts, embeddings).as_retriever()
+         st.session_state.knowledge = vector_store
+         st.session_state.chat.history.append(glm.Content(
+             parts=[glm.Part(
+                 text=f"Now I've uploaded some files.\nHere is the list of documents you have access to:\n{[i if isinstance(i, str) else i.name for i in files]}"
+             )],
+             role="user"
+             )
+         )
+         st.session_state.chat.history.append(glm.Content(
+             parts=[glm.Part(
+                 text="Sure! Ask me anything about the documents you have uploaded. I can help you with that."
+             )],
+             role="model"
+             )
+         )
+     except Exception:
+         st.session_state.knowledge = rawkn(total_content)
+
+ if "history" not in st.session_state:
+     st.session_state.history = []
+
+ if "knowledge" not in st.session_state:
+     st.session_state.knowledge = None
+
+ if "chat" not in st.session_state:
+     st.session_state.chat = gemini.start_chat(history=[glm.Content(
+         parts=[glm.Part(
+             text="Your name is DocsGPT. You are very helpful and can assist with documents uploaded by the user. Use the vector_search tool/function to search the contents of the documents the user has attached or uploaded.\nYou have access to all documents uploaded by the user."
+         )],
+         role="user"
+     ),
+     glm.Content(
+         parts=[glm.Part(
+             text="Sure, I can do that for you."
+         )],
+         role="model"
+     )])
+
138
+ for history in st.session_state.history:
139
+ with st.chat_message(history["role"]):
140
+ st.markdown(history["text"])
141
+
142
+ with st.sidebar:
143
+ st.title("Knowledge")
144
+ st.markdown("""### Tips to use DocsGPT:
145
+ - Upload your documents [pdf, txt] to DocsGPT and make sure to click on the process button.
146
+ - wait for a second and then start chatting with DocsGPT.
147
+ - While asking questions to DocsGPT about your uploaded files, please refer your uploaded files as *Document*, *Docs*, *attached or uploaded docs*, so the model can easily understands what you are referring to.""")
148
+ files = st.file_uploader("Upload a file", accept_multiple_files=True, type=["pdf", "txt"])
149
+ include_getting_real = st.checkbox("Include getting-real?")
150
+ process = st.button("Process")
151
+ if process and files:
152
+ with st.spinner('loading your file. This may take a while...'):
153
+ loader_data(files, include_getting_real)
154
+ elif process and include_getting_real:
155
+ with st.spinner('loading your file. This may take a while...'):
156
+ loader_data([], include_getting_real)
157
+
+ if prompt := st.chat_input("Enter your message..."):
+     st.session_state.history.append({"role": "user", "text": prompt})
+     with st.chat_message("user"):
+         st.markdown(prompt)
+     with st.chat_message("assistant"):
+         message_placeholder = st.empty()
+         response = st.session_state.chat.send_message(prompt)
+         if response.candidates[0].content.parts[0].text == '':  # an empty text part means the model issued a vector_search function call
+             args = response.candidates[0].content.parts[0].function_call.args['query']
+             if st.session_state.knowledge is not None:
+                 print("searching for ", args)
+                 related_docs = str(st.session_state.knowledge.get_relevant_documents(args))
+                 print(related_docs)
+             else:
+                 related_docs = 'No knowledge documents loaded'
+             response = st.session_state.chat.send_message(  # return the retrieved passages to the model as the function response
+                 glm.Content(
+                     parts=[glm.Part(
+                         function_response=glm.FunctionResponse(
+                             name='vector_search',
+                             response={'rag': related_docs},
+                         )
+                     )]
+                 )
+             ).candidates[0].content.parts[0].text
+         else:
+             response = response.candidates[0].content.parts[0].text
+         print(st.session_state.chat.history)
+         message_placeholder.markdown(response)
+         st.session_state.history.append({"role": "assistant", "text": response})
getting_real_basecamp.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a369da3ab9d824af8eddc9bfbaa6f8d9ae4a6cc3981f0bb92c2b19e46a563af
+ size 5118368
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ langchain
+ pypdf
+ PyPDF2
+ chromadb
+ langchain-google-genai
+ langchain-community
+ streamlit
+ google-generativeai