peinan committed on
Commit
4869103
·
1 Parent(s): 6c7af2d

add RAG part

Browse files
.gitignore CHANGED
@@ -11,3 +11,8 @@ wheels/
11
 
12
  .DS_Store
13
  gradio_cached_examples
 
 
 
 
 
 
11
 
12
  .DS_Store
13
  gradio_cached_examples
14
+
15
+ .env
16
+ local_qdrant
17
+
18
+ .vscode
pyproject.toml CHANGED
@@ -7,9 +7,14 @@ authors = [
7
  ]
8
  dependencies = [
9
  "gradio>=4.19.2",
10
- "langchain>=0.1.9",
11
  "gradio-pdf>=0.0.5",
12
  "loguru>=0.7.2",
 
 
 
 
 
 
13
  ]
14
  readme = "README.md"
15
  requires-python = ">= 3.8"
 
7
  ]
8
  dependencies = [
9
  "gradio>=4.19.2",
 
10
  "gradio-pdf>=0.0.5",
11
  "loguru>=0.7.2",
12
+ "pypdf>=4.0.2",
13
+ "langchain>=0.1.9",
14
+ "openai>=1.12.0",
15
+ "tiktoken>=0.6.0",
16
+ "qdrant-client>=1.7.3",
17
+ "requests>=2.31.0",
18
  ]
19
  readme = "README.md"
20
  requires-python = ">= 3.8"
requirements-dev.lock CHANGED
@@ -22,6 +22,7 @@ annotated-types==0.6.0
22
  anyio==4.3.0
23
  # via httpx
24
  # via langchain-core
 
25
  # via starlette
26
  asttokens==2.4.1
27
  # via icecream
@@ -51,6 +52,8 @@ dataclasses-json==0.6.4
51
  # via langchain-community
52
  decorator==5.1.1
53
  # via ipython
 
 
54
  executing==2.0.1
55
  # via icecream
56
  # via stack-data
@@ -75,17 +78,30 @@ gradio-client==0.10.1
75
  # via gradio
76
  gradio-pdf==0.0.5
77
  # via pdfchat
 
 
 
 
 
78
  h11==0.14.0
79
  # via httpcore
80
  # via uvicorn
 
 
 
 
81
  httpcore==1.0.4
82
  # via httpx
83
  httpx==0.27.0
84
  # via gradio
85
  # via gradio-client
 
 
86
  huggingface-hub==0.20.3
87
  # via gradio
88
  # via gradio-client
 
 
89
  icecream==2.1.3
90
  idna==3.6
91
  # via anyio
@@ -150,6 +166,9 @@ numpy==1.26.4
150
  # via langchain-community
151
  # via matplotlib
152
  # via pandas
 
 
 
153
  orjson==3.9.15
154
  # via gradio
155
  # via langsmith
@@ -171,8 +190,12 @@ pexpect==4.9.0
171
  pillow==10.2.0
172
  # via gradio
173
  # via matplotlib
 
 
174
  prompt-toolkit==3.0.43
175
  # via ipython
 
 
176
  ptyprocess==0.7.0
177
  # via pexpect
178
  pure-eval==0.2.2
@@ -183,6 +206,8 @@ pydantic==2.6.2
183
  # via langchain
184
  # via langchain-core
185
  # via langsmith
 
 
186
  pydantic-core==2.16.3
187
  # via pydantic
188
  pydub==0.25.1
@@ -193,6 +218,8 @@ pygments==2.17.2
193
  # via rich
194
  pyparsing==3.1.1
195
  # via matplotlib
 
 
196
  python-dateutil==2.8.2
197
  # via matplotlib
198
  # via pandas
@@ -206,15 +233,21 @@ pyyaml==6.0.1
206
  # via langchain
207
  # via langchain-community
208
  # via langchain-core
 
 
209
  referencing==0.33.0
210
  # via jsonschema
211
  # via jsonschema-specifications
 
 
212
  requests==2.31.0
213
  # via huggingface-hub
214
  # via langchain
215
  # via langchain-community
216
  # via langchain-core
217
  # via langsmith
 
 
218
  rich==13.7.0
219
  # via typer
220
  rpds-py==0.18.0
@@ -224,6 +257,8 @@ ruff==0.2.2
224
  # via gradio
225
  semantic-version==2.10.0
226
  # via gradio
 
 
227
  shellingham==1.5.4
228
  # via typer
229
  six==1.16.0
@@ -232,6 +267,7 @@ six==1.16.0
232
  sniffio==1.3.0
233
  # via anyio
234
  # via httpx
 
235
  sqlalchemy==2.0.27
236
  # via langchain
237
  # via langchain-community
@@ -243,12 +279,15 @@ tenacity==8.2.3
243
  # via langchain
244
  # via langchain-community
245
  # via langchain-core
 
 
246
  tomlkit==0.12.0
247
  # via gradio
248
  toolz==0.12.1
249
  # via altair
250
  tqdm==4.66.2
251
  # via huggingface-hub
 
252
  traitlets==5.14.1
253
  # via ipython
254
  # via matplotlib-inline
@@ -259,6 +298,7 @@ typing-extensions==4.9.0
259
  # via gradio
260
  # via gradio-client
261
  # via huggingface-hub
 
262
  # via pydantic
263
  # via pydantic-core
264
  # via sqlalchemy
@@ -269,6 +309,7 @@ typing-inspect==0.9.0
269
  tzdata==2024.1
270
  # via pandas
271
  urllib3==2.2.1
 
272
  # via requests
273
  uvicorn==0.27.1
274
  # via gradio
 
22
  anyio==4.3.0
23
  # via httpx
24
  # via langchain-core
25
+ # via openai
26
  # via starlette
27
  asttokens==2.4.1
28
  # via icecream
 
52
  # via langchain-community
53
  decorator==5.1.1
54
  # via ipython
55
+ distro==1.9.0
56
+ # via openai
57
  executing==2.0.1
58
  # via icecream
59
  # via stack-data
 
78
  # via gradio
79
  gradio-pdf==0.0.5
80
  # via pdfchat
81
+ grpcio==1.62.0
82
+ # via grpcio-tools
83
+ # via qdrant-client
84
+ grpcio-tools==1.62.0
85
+ # via qdrant-client
86
  h11==0.14.0
87
  # via httpcore
88
  # via uvicorn
89
+ h2==4.1.0
90
+ # via httpx
91
+ hpack==4.0.0
92
+ # via h2
93
  httpcore==1.0.4
94
  # via httpx
95
  httpx==0.27.0
96
  # via gradio
97
  # via gradio-client
98
+ # via openai
99
+ # via qdrant-client
100
  huggingface-hub==0.20.3
101
  # via gradio
102
  # via gradio-client
103
+ hyperframe==6.0.1
104
+ # via h2
105
  icecream==2.1.3
106
  idna==3.6
107
  # via anyio
 
166
  # via langchain-community
167
  # via matplotlib
168
  # via pandas
169
+ # via qdrant-client
170
+ openai==1.12.0
171
+ # via pdfchat
172
  orjson==3.9.15
173
  # via gradio
174
  # via langsmith
 
190
  pillow==10.2.0
191
  # via gradio
192
  # via matplotlib
193
+ portalocker==2.8.2
194
+ # via qdrant-client
195
  prompt-toolkit==3.0.43
196
  # via ipython
197
+ protobuf==4.25.3
198
+ # via grpcio-tools
199
  ptyprocess==0.7.0
200
  # via pexpect
201
  pure-eval==0.2.2
 
206
  # via langchain
207
  # via langchain-core
208
  # via langsmith
209
+ # via openai
210
+ # via qdrant-client
211
  pydantic-core==2.16.3
212
  # via pydantic
213
  pydub==0.25.1
 
218
  # via rich
219
  pyparsing==3.1.1
220
  # via matplotlib
221
+ pypdf==4.0.2
222
+ # via pdfchat
223
  python-dateutil==2.8.2
224
  # via matplotlib
225
  # via pandas
 
233
  # via langchain
234
  # via langchain-community
235
  # via langchain-core
236
+ qdrant-client==1.7.3
237
+ # via pdfchat
238
  referencing==0.33.0
239
  # via jsonschema
240
  # via jsonschema-specifications
241
+ regex==2023.12.25
242
+ # via tiktoken
243
  requests==2.31.0
244
  # via huggingface-hub
245
  # via langchain
246
  # via langchain-community
247
  # via langchain-core
248
  # via langsmith
249
+ # via pdfchat
250
+ # via tiktoken
251
  rich==13.7.0
252
  # via typer
253
  rpds-py==0.18.0
 
257
  # via gradio
258
  semantic-version==2.10.0
259
  # via gradio
260
+ setuptools==69.1.1
261
+ # via grpcio-tools
262
  shellingham==1.5.4
263
  # via typer
264
  six==1.16.0
 
267
  sniffio==1.3.0
268
  # via anyio
269
  # via httpx
270
+ # via openai
271
  sqlalchemy==2.0.27
272
  # via langchain
273
  # via langchain-community
 
279
  # via langchain
280
  # via langchain-community
281
  # via langchain-core
282
+ tiktoken==0.6.0
283
+ # via pdfchat
284
  tomlkit==0.12.0
285
  # via gradio
286
  toolz==0.12.1
287
  # via altair
288
  tqdm==4.66.2
289
  # via huggingface-hub
290
+ # via openai
291
  traitlets==5.14.1
292
  # via ipython
293
  # via matplotlib-inline
 
298
  # via gradio
299
  # via gradio-client
300
  # via huggingface-hub
301
+ # via openai
302
  # via pydantic
303
  # via pydantic-core
304
  # via sqlalchemy
 
309
  tzdata==2024.1
310
  # via pandas
311
  urllib3==2.2.1
312
+ # via qdrant-client
313
  # via requests
314
  uvicorn==0.27.1
315
  # via gradio
requirements.lock CHANGED
@@ -22,6 +22,7 @@ annotated-types==0.6.0
22
  anyio==4.3.0
23
  # via httpx
24
  # via langchain-core
 
25
  # via starlette
26
  attrs==23.2.0
27
  # via aiohttp
@@ -45,6 +46,8 @@ cycler==0.12.1
45
  dataclasses-json==0.6.4
46
  # via langchain
47
  # via langchain-community
 
 
48
  fastapi==0.110.0
49
  # via gradio
50
  ffmpy==0.3.2
@@ -66,17 +69,30 @@ gradio-client==0.10.1
66
  # via gradio
67
  gradio-pdf==0.0.5
68
  # via pdfchat
 
 
 
 
 
69
  h11==0.14.0
70
  # via httpcore
71
  # via uvicorn
 
 
 
 
72
  httpcore==1.0.4
73
  # via httpx
74
  httpx==0.27.0
75
  # via gradio
76
  # via gradio-client
 
 
77
  huggingface-hub==0.20.3
78
  # via gradio
79
  # via gradio-client
 
 
80
  idna==3.6
81
  # via anyio
82
  # via httpx
@@ -135,6 +151,9 @@ numpy==1.26.4
135
  # via langchain-community
136
  # via matplotlib
137
  # via pandas
 
 
 
138
  orjson==3.9.15
139
  # via gradio
140
  # via langsmith
@@ -152,12 +171,18 @@ pandas==2.2.1
152
  pillow==10.2.0
153
  # via gradio
154
  # via matplotlib
 
 
 
 
155
  pydantic==2.6.2
156
  # via fastapi
157
  # via gradio
158
  # via langchain
159
  # via langchain-core
160
  # via langsmith
 
 
161
  pydantic-core==2.16.3
162
  # via pydantic
163
  pydub==0.25.1
@@ -166,6 +191,8 @@ pygments==2.17.2
166
  # via rich
167
  pyparsing==3.1.1
168
  # via matplotlib
 
 
169
  python-dateutil==2.8.2
170
  # via matplotlib
171
  # via pandas
@@ -179,15 +206,21 @@ pyyaml==6.0.1
179
  # via langchain
180
  # via langchain-community
181
  # via langchain-core
 
 
182
  referencing==0.33.0
183
  # via jsonschema
184
  # via jsonschema-specifications
 
 
185
  requests==2.31.0
186
  # via huggingface-hub
187
  # via langchain
188
  # via langchain-community
189
  # via langchain-core
190
  # via langsmith
 
 
191
  rich==13.7.0
192
  # via typer
193
  rpds-py==0.18.0
@@ -197,6 +230,8 @@ ruff==0.2.2
197
  # via gradio
198
  semantic-version==2.10.0
199
  # via gradio
 
 
200
  shellingham==1.5.4
201
  # via typer
202
  six==1.16.0
@@ -204,6 +239,7 @@ six==1.16.0
204
  sniffio==1.3.0
205
  # via anyio
206
  # via httpx
 
207
  sqlalchemy==2.0.27
208
  # via langchain
209
  # via langchain-community
@@ -213,12 +249,15 @@ tenacity==8.2.3
213
  # via langchain
214
  # via langchain-community
215
  # via langchain-core
 
 
216
  tomlkit==0.12.0
217
  # via gradio
218
  toolz==0.12.1
219
  # via altair
220
  tqdm==4.66.2
221
  # via huggingface-hub
 
222
  typer==0.9.0
223
  # via gradio
224
  typing-extensions==4.9.0
@@ -226,6 +265,7 @@ typing-extensions==4.9.0
226
  # via gradio
227
  # via gradio-client
228
  # via huggingface-hub
 
229
  # via pydantic
230
  # via pydantic-core
231
  # via sqlalchemy
@@ -236,6 +276,7 @@ typing-inspect==0.9.0
236
  tzdata==2024.1
237
  # via pandas
238
  urllib3==2.2.1
 
239
  # via requests
240
  uvicorn==0.27.1
241
  # via gradio
 
22
  anyio==4.3.0
23
  # via httpx
24
  # via langchain-core
25
+ # via openai
26
  # via starlette
27
  attrs==23.2.0
28
  # via aiohttp
 
46
  dataclasses-json==0.6.4
47
  # via langchain
48
  # via langchain-community
49
+ distro==1.9.0
50
+ # via openai
51
  fastapi==0.110.0
52
  # via gradio
53
  ffmpy==0.3.2
 
69
  # via gradio
70
  gradio-pdf==0.0.5
71
  # via pdfchat
72
+ grpcio==1.62.0
73
+ # via grpcio-tools
74
+ # via qdrant-client
75
+ grpcio-tools==1.62.0
76
+ # via qdrant-client
77
  h11==0.14.0
78
  # via httpcore
79
  # via uvicorn
80
+ h2==4.1.0
81
+ # via httpx
82
+ hpack==4.0.0
83
+ # via h2
84
  httpcore==1.0.4
85
  # via httpx
86
  httpx==0.27.0
87
  # via gradio
88
  # via gradio-client
89
+ # via openai
90
+ # via qdrant-client
91
  huggingface-hub==0.20.3
92
  # via gradio
93
  # via gradio-client
94
+ hyperframe==6.0.1
95
+ # via h2
96
  idna==3.6
97
  # via anyio
98
  # via httpx
 
151
  # via langchain-community
152
  # via matplotlib
153
  # via pandas
154
+ # via qdrant-client
155
+ openai==1.12.0
156
+ # via pdfchat
157
  orjson==3.9.15
158
  # via gradio
159
  # via langsmith
 
171
  pillow==10.2.0
172
  # via gradio
173
  # via matplotlib
174
+ portalocker==2.8.2
175
+ # via qdrant-client
176
+ protobuf==4.25.3
177
+ # via grpcio-tools
178
  pydantic==2.6.2
179
  # via fastapi
180
  # via gradio
181
  # via langchain
182
  # via langchain-core
183
  # via langsmith
184
+ # via openai
185
+ # via qdrant-client
186
  pydantic-core==2.16.3
187
  # via pydantic
188
  pydub==0.25.1
 
191
  # via rich
192
  pyparsing==3.1.1
193
  # via matplotlib
194
+ pypdf==4.0.2
195
+ # via pdfchat
196
  python-dateutil==2.8.2
197
  # via matplotlib
198
  # via pandas
 
206
  # via langchain
207
  # via langchain-community
208
  # via langchain-core
209
+ qdrant-client==1.7.3
210
+ # via pdfchat
211
  referencing==0.33.0
212
  # via jsonschema
213
  # via jsonschema-specifications
214
+ regex==2023.12.25
215
+ # via tiktoken
216
  requests==2.31.0
217
  # via huggingface-hub
218
  # via langchain
219
  # via langchain-community
220
  # via langchain-core
221
  # via langsmith
222
+ # via pdfchat
223
+ # via tiktoken
224
  rich==13.7.0
225
  # via typer
226
  rpds-py==0.18.0
 
230
  # via gradio
231
  semantic-version==2.10.0
232
  # via gradio
233
+ setuptools==69.1.1
234
+ # via grpcio-tools
235
  shellingham==1.5.4
236
  # via typer
237
  six==1.16.0
 
239
  sniffio==1.3.0
240
  # via anyio
241
  # via httpx
242
+ # via openai
243
  sqlalchemy==2.0.27
244
  # via langchain
245
  # via langchain-community
 
249
  # via langchain
250
  # via langchain-community
251
  # via langchain-core
252
+ tiktoken==0.6.0
253
+ # via pdfchat
254
  tomlkit==0.12.0
255
  # via gradio
256
  toolz==0.12.1
257
  # via altair
258
  tqdm==4.66.2
259
  # via huggingface-hub
260
+ # via openai
261
  typer==0.9.0
262
  # via gradio
263
  typing-extensions==4.9.0
 
265
  # via gradio
266
  # via gradio-client
267
  # via huggingface-hub
268
+ # via openai
269
  # via pydantic
270
  # via pydantic-core
271
  # via sqlalchemy
 
276
  tzdata==2024.1
277
  # via pandas
278
  urllib3==2.2.1
279
+ # via qdrant-client
280
  # via requests
281
  uvicorn==0.27.1
282
  # via gradio
requirements.txt CHANGED
@@ -21,6 +21,7 @@ annotated-types==0.6.0
21
  anyio==4.3.0
22
  # via httpx
23
  # via langchain-core
 
24
  # via starlette
25
  attrs==23.2.0
26
  # via aiohttp
@@ -44,6 +45,8 @@ cycler==0.12.1
44
  dataclasses-json==0.6.4
45
  # via langchain
46
  # via langchain-community
 
 
47
  fastapi==0.110.0
48
  # via gradio
49
  ffmpy==0.3.2
@@ -65,17 +68,30 @@ gradio-client==0.10.1
65
  # via gradio
66
  gradio-pdf==0.0.5
67
  # via pdfchat
 
 
 
 
 
68
  h11==0.14.0
69
  # via httpcore
70
  # via uvicorn
 
 
 
 
71
  httpcore==1.0.4
72
  # via httpx
73
  httpx==0.27.0
74
  # via gradio
75
  # via gradio-client
 
 
76
  huggingface-hub==0.20.3
77
  # via gradio
78
  # via gradio-client
 
 
79
  idna==3.6
80
  # via anyio
81
  # via httpx
@@ -134,6 +150,9 @@ numpy==1.26.4
134
  # via langchain-community
135
  # via matplotlib
136
  # via pandas
 
 
 
137
  orjson==3.9.15
138
  # via gradio
139
  # via langsmith
@@ -151,12 +170,18 @@ pandas==2.2.1
151
  pillow==10.2.0
152
  # via gradio
153
  # via matplotlib
 
 
 
 
154
  pydantic==2.6.2
155
  # via fastapi
156
  # via gradio
157
  # via langchain
158
  # via langchain-core
159
  # via langsmith
 
 
160
  pydantic-core==2.16.3
161
  # via pydantic
162
  pydub==0.25.1
@@ -165,6 +190,8 @@ pygments==2.17.2
165
  # via rich
166
  pyparsing==3.1.1
167
  # via matplotlib
 
 
168
  python-dateutil==2.8.2
169
  # via matplotlib
170
  # via pandas
@@ -178,15 +205,21 @@ pyyaml==6.0.1
178
  # via langchain
179
  # via langchain-community
180
  # via langchain-core
 
 
181
  referencing==0.33.0
182
  # via jsonschema
183
  # via jsonschema-specifications
 
 
184
  requests==2.31.0
185
  # via huggingface-hub
186
  # via langchain
187
  # via langchain-community
188
  # via langchain-core
189
  # via langsmith
 
 
190
  rich==13.7.0
191
  # via typer
192
  rpds-py==0.18.0
@@ -196,6 +229,8 @@ ruff==0.2.2
196
  # via gradio
197
  semantic-version==2.10.0
198
  # via gradio
 
 
199
  shellingham==1.5.4
200
  # via typer
201
  six==1.16.0
@@ -203,6 +238,7 @@ six==1.16.0
203
  sniffio==1.3.0
204
  # via anyio
205
  # via httpx
 
206
  sqlalchemy==2.0.27
207
  # via langchain
208
  # via langchain-community
@@ -212,12 +248,15 @@ tenacity==8.2.3
212
  # via langchain
213
  # via langchain-community
214
  # via langchain-core
 
 
215
  tomlkit==0.12.0
216
  # via gradio
217
  toolz==0.12.1
218
  # via altair
219
  tqdm==4.66.2
220
  # via huggingface-hub
 
221
  typer==0.9.0
222
  # via gradio
223
  typing-extensions==4.9.0
@@ -225,6 +264,7 @@ typing-extensions==4.9.0
225
  # via gradio
226
  # via gradio-client
227
  # via huggingface-hub
 
228
  # via pydantic
229
  # via pydantic-core
230
  # via sqlalchemy
@@ -235,6 +275,7 @@ typing-inspect==0.9.0
235
  tzdata==2024.1
236
  # via pandas
237
  urllib3==2.2.1
 
238
  # via requests
239
  uvicorn==0.27.1
240
  # via gradio
 
21
  anyio==4.3.0
22
  # via httpx
23
  # via langchain-core
24
+ # via openai
25
  # via starlette
26
  attrs==23.2.0
27
  # via aiohttp
 
45
  dataclasses-json==0.6.4
46
  # via langchain
47
  # via langchain-community
48
+ distro==1.9.0
49
+ # via openai
50
  fastapi==0.110.0
51
  # via gradio
52
  ffmpy==0.3.2
 
68
  # via gradio
69
  gradio-pdf==0.0.5
70
  # via pdfchat
71
+ grpcio==1.62.0
72
+ # via grpcio-tools
73
+ # via qdrant-client
74
+ grpcio-tools==1.62.0
75
+ # via qdrant-client
76
  h11==0.14.0
77
  # via httpcore
78
  # via uvicorn
79
+ h2==4.1.0
80
+ # via httpx
81
+ hpack==4.0.0
82
+ # via h2
83
  httpcore==1.0.4
84
  # via httpx
85
  httpx==0.27.0
86
  # via gradio
87
  # via gradio-client
88
+ # via openai
89
+ # via qdrant-client
90
  huggingface-hub==0.20.3
91
  # via gradio
92
  # via gradio-client
93
+ hyperframe==6.0.1
94
+ # via h2
95
  idna==3.6
96
  # via anyio
97
  # via httpx
 
150
  # via langchain-community
151
  # via matplotlib
152
  # via pandas
153
+ # via qdrant-client
154
+ openai==1.12.0
155
+ # via pdfchat
156
  orjson==3.9.15
157
  # via gradio
158
  # via langsmith
 
170
  pillow==10.2.0
171
  # via gradio
172
  # via matplotlib
173
+ portalocker==2.8.2
174
+ # via qdrant-client
175
+ protobuf==4.25.3
176
+ # via grpcio-tools
177
  pydantic==2.6.2
178
  # via fastapi
179
  # via gradio
180
  # via langchain
181
  # via langchain-core
182
  # via langsmith
183
+ # via openai
184
+ # via qdrant-client
185
  pydantic-core==2.16.3
186
  # via pydantic
187
  pydub==0.25.1
 
190
  # via rich
191
  pyparsing==3.1.1
192
  # via matplotlib
193
+ pypdf==4.0.2
194
+ # via pdfchat
195
  python-dateutil==2.8.2
196
  # via matplotlib
197
  # via pandas
 
205
  # via langchain
206
  # via langchain-community
207
  # via langchain-core
208
+ qdrant-client==1.7.3
209
+ # via pdfchat
210
  referencing==0.33.0
211
  # via jsonschema
212
  # via jsonschema-specifications
213
+ regex==2023.12.25
214
+ # via tiktoken
215
  requests==2.31.0
216
  # via huggingface-hub
217
  # via langchain
218
  # via langchain-community
219
  # via langchain-core
220
  # via langsmith
221
+ # via pdfchat
222
+ # via tiktoken
223
  rich==13.7.0
224
  # via typer
225
  rpds-py==0.18.0
 
229
  # via gradio
230
  semantic-version==2.10.0
231
  # via gradio
232
+ setuptools==69.1.1
233
+ # via grpcio-tools
234
  shellingham==1.5.4
235
  # via typer
236
  six==1.16.0
 
238
  sniffio==1.3.0
239
  # via anyio
240
  # via httpx
241
+ # via openai
242
  sqlalchemy==2.0.27
243
  # via langchain
244
  # via langchain-community
 
248
  # via langchain
249
  # via langchain-community
250
  # via langchain-core
251
+ tiktoken==0.6.0
252
+ # via pdfchat
253
  tomlkit==0.12.0
254
  # via gradio
255
  toolz==0.12.1
256
  # via altair
257
  tqdm==4.66.2
258
  # via huggingface-hub
259
+ # via openai
260
  typer==0.9.0
261
  # via gradio
262
  typing-extensions==4.9.0
 
264
  # via gradio
265
  # via gradio-client
266
  # via huggingface-hub
267
+ # via openai
268
  # via pydantic
269
  # via pydantic-core
270
  # via sqlalchemy
 
275
  tzdata==2024.1
276
  # via pandas
277
  urllib3==2.2.1
278
+ # via qdrant-client
279
  # via requests
280
  uvicorn==0.27.1
281
  # via gradio
src/pdfchat/app.py CHANGED
@@ -1,12 +1,52 @@
 
 
1
  import time
2
  from dataclasses import dataclass
3
  from pathlib import Path
4
 
5
  import gradio as gr
 
6
  from gradio_pdf import PDF
 
 
 
 
7
  from loguru import logger
8
-
9
- MODEL_CALM2 = "cyberagent/calm2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  @dataclass
@@ -46,34 +86,85 @@ def open_file(file_path: str) -> str:
46
  if file_path.suffix == ".txt":
47
  text = file_path.read_text()
48
  elif file_path.suffix == ".pdf":
49
- text = "WARNING: PDF file is not supported yet."
50
  else:
51
  text = "WARNING: Unsupported file format."
52
 
53
  return text
54
 
55
 
56
- def get_response(query: str, document: str | None) -> str:
57
- response = ""
58
- if not document:
59
- response = "No document is uploaded. Please upload a document."
60
- else:
61
- response = f"Your document: {document}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  return response
64
 
65
 
66
- def bot(history: ChatHistory, query: str, file_path: str) -> ChatHistory:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  history = ChatHistory(history)
68
  document = open_file(file_path) if file_path else None
69
- response = get_response(query, document)
70
- history.add_chat(Chat(query=query, response=response))
 
 
71
  logger.info(history)
72
 
73
  history.clear_last_response()
74
- for char in response:
75
  history[-1].response += char
76
- time.sleep(0.02)
77
  yield history
78
 
79
 
@@ -121,7 +212,7 @@ with gr.Blocks() as app:
121
  )
122
  submit_button = gr.Button("Submit", variant="primary", size="sm")
123
  submit = submit_button.click(
124
- fn=bot,
125
  inputs=[chatbot, text_box, file_box],
126
  outputs=chatbot,
127
  )
 
1
+ import os
2
+ import re
3
  import time
4
  from dataclasses import dataclass
5
  from pathlib import Path
6
 
7
  import gradio as gr
8
+ import requests
9
  from gradio_pdf import PDF
10
+ from langchain.embeddings.openai import OpenAIEmbeddings
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from langchain.vectorstores.qdrant import Qdrant
14
  from loguru import logger
15
+ from pypdf import PdfReader
16
+
17
# Identifier of the chat model; not referenced by the code visible in this hunk.
MODEL_CALM2 = "cyberagent/calm2-7b-chat"

# Shared splitter: cuts on blank lines, chunk_size=1000, no overlap between chunks.
text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=0)
23
# Selects how the Qdrant client is configured: "local" uses an on-disk path,
# "cloud" uses a URL and API key taken from the environment.
QDRANT_MODE = "local"
if QDRANT_MODE == "local":
    QDRANT_CLIENT_CONFIG = {
        "path": "./local_qdrant",
    }
elif QDRANT_MODE == "cloud":
    QDRANT_CLIENT_CONFIG = {
        "url": os.environ.get("QDRANT_URL"),
        "api_key": os.environ.get("QDRANT_API_KEY"),
    }
    if not QDRANT_CLIENT_CONFIG["url"] or not QDRANT_CLIENT_CONFIG["api_key"]:
        raise ValueError(
            "Please set the QDRANT_URL and QDRANT_API_KEY environment variables."
        )
else:
    # Fail at import time with a clear message instead of a NameError on the
    # first use of QDRANT_CLIENT_CONFIG.
    raise ValueError(f"Unsupported QDRANT_MODE: {QDRANT_MODE!r}")
COLLECTION_NAME = "pdfchat"
# Japanese prompt. In English: "Use the context below to answer the final
# question. If you do not know the answer, say that you do not know."
# followed by the sections [Context], [Question] and [Answer].
PROMPT_TEMPLATE = """以下の文脈を利用して、最後の質問に答えなさい。
答えがわからない場合は、わからないと答えてください。

【文脈】
{context}

【質問】
{question}

【答え】
"""
# Endpoint of the LLM service; None when PDFCHAT_LLM_URL is not set.
LLM_URL = os.environ.get("PDFCHAT_LLM_URL")
50
 
51
 
52
  @dataclass
 
86
  if file_path.suffix == ".txt":
87
  text = file_path.read_text()
88
  elif file_path.suffix == ".pdf":
89
+ text = parse_pdf(file_path)
90
  else:
91
  text = "WARNING: Unsupported file format."
92
 
93
  return text
94
 
95
 
96
def parse_pdf(file_path: Path, backend="pypdf") -> str:
    """Extract the text of a PDF and undo line breaks introduced by page layout.

    Args:
        file_path: Path of the PDF file to read.
        backend: Not used by this implementation (pypdf is always used); kept
            so existing callers that pass it keep working.

    Returns:
        The text of all pages concatenated, with whitespace around newlines
        stripped, mid-sentence line breaks joined, and runs of three or more
        newlines collapsed to two.
    """
    reader = PdfReader(file_path)
    contents = "".join(page.extract_text() or "" for page in reader.pages)

    # Strip ASCII and ideographic (U+3000) spaces that surround a newline.
    contents = re.sub(r"[ \u3000]+\n[ \u3000]+", "\n", contents)
    contents = re.sub(r"[ \u3000]+\n", "\n", contents)
    contents = re.sub(r"\n[ \u3000]+", "\n", contents)
    # Join lines wrapped mid-sentence: drop a newline whose preceding character
    # is neither the ideographic full stop (U+3002) nor another newline. The
    # neighbouring character is captured and re-inserted; replacing the whole
    # match with "" would silently delete it.
    contents = re.sub(r"([^\u3002\n])\n", r"\1", contents)
    # Same idea for a newline between word characters, the ideographic comma
    # (U+3001) or full-width parentheses (U+FF08 / U+FF09).
    # NOTE(review): the first class member was garbled in the copy this was
    # restored from; U+3001 is the best reading - confirm against the repo.
    contents = re.sub(
        r"([\w\u3001\uff08\uff09])\n([\w\u3001\uff08\uff09])", r"\1\2", contents
    )
    contents = re.sub(r"\n{3,}", "\n\n", contents)

    return contents
108
+
109
+
110
def get_response(prompt: str, timeout: float | None = None) -> dict:
    """Send *prompt* to the LLM endpoint and return the decoded JSON reply.

    Args:
        prompt: Full prompt text to send.
        timeout: Optional timeout in seconds passed to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The JSON body of the response as a dict. The caller in this file reads
        its ``"message"`` entry.

    Raises:
        ValueError: If the PDFCHAT_LLM_URL environment variable is not set.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    if not LLM_URL:
        raise ValueError("Please set the PDFCHAT_LLM_URL environment variable.")

    response = requests.post(
        LLM_URL,
        json={
            "prompt": prompt,
            "max_new_tokens": 2048,
        },
        timeout=timeout,
    )
    # Surface HTTP failures here instead of as a JSON/KeyError further down.
    response.raise_for_status()

    return response.json()
120
 
121
 
122
+ def retrieve_relevant_documents(query: str, document: str | None) -> list[str]:
123
+ if not document:
124
+ return "No document is uploaded. Please upload a document."
125
+
126
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
127
+ if not OPENAI_API_KEY:
128
+ raise ValueError("Please set the OPENAI_API_KEY environment variable.")
129
+
130
+ documents = text_splitter.split_text(document)
131
+ embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
132
+
133
+ db = Qdrant.from_texts(
134
+ texts=documents,
135
+ embedding=embeddings,
136
+ **QDRANT_CLIENT_CONFIG,
137
+ )
138
+ retriever = db.as_retriever()
139
+ relevant_documents = [
140
+ doc.page_content for doc in retriever.get_relevant_documents(query)
141
+ ]
142
+
143
+ return relevant_documents
144
+
145
+
146
def build_prompt(query: str, context: str) -> str:
    """Fill PROMPT_TEMPLATE with the given context and the user's question.

    Args:
        query: Substituted for the template's ``question`` variable.
        context: Substituted for the template's ``context`` variable.

    Returns:
        The formatted prompt string.
    """
    template = PromptTemplate(
        input_variables=["context", "question"],
        template=PROMPT_TEMPLATE,
    )
    return template.format(question=query, context=context)
153
+
154
+
155
def main(history: ChatHistory, query: str, file_path: str | None) -> ChatHistory:
    """Answer *query* about the uploaded file and stream the reply into the chat.

    This is a generator: after the reply is obtained it is appended to the
    last chat entry one character at a time, and the history is yielded after
    each character.

    Args:
        history: Chat history so far; it is wrapped in ``ChatHistory``.
        query: The user's question.
        file_path: Path of the uploaded file, or ``None`` when nothing is uploaded.

    Yields:
        The chat history with the partially streamed response.
    """
    history = ChatHistory(history)
    document = open_file(file_path) if file_path else None
    if not document:
        # Nothing to retrieve from: answer directly instead of embedding and
        # prompting the LLM with a placeholder context.
        response_message = "No document is uploaded. Please upload a document."
    else:
        relevant_documents = retrieve_relevant_documents(query=query, document=document)
        prompt = build_prompt(query=query, context="\n\n".join(relevant_documents))
        response_message = get_response(prompt)["message"]
    history.add_chat(Chat(query=query, response=response_message))
    logger.info(history)

    # Reset the stored response, then rebuild it one character at a time.
    history.clear_last_response()
    for char in response_message:
        history[-1].response += char
        time.sleep(0.01)
        yield history
169
 
170
 
 
212
  )
213
  submit_button = gr.Button("Submit", variant="primary", size="sm")
214
  submit = submit_button.click(
215
+ fn=main,
216
  inputs=[chatbot, text_box, file_box],
217
  outputs=chatbot,
218
  )