herMaster committed on
Commit
1e493e3
•
1 Parent(s): a7c5078

giving user ability to upload PDF

Browse files
Files changed (1) hide show
  1. app.py +112 -44
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from qdrant_client import models, QdrantClient
3
  from sentence_transformers import SentenceTransformer
4
  from PyPDF2 import PdfReader
@@ -45,7 +46,75 @@ llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
45
  print("LLM loaded........................................")
46
  print("################################################################")
47
 
48
- def get_chunks(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  text_splitter = RecursiveCharacterTextSplitter(
50
  # seperator = "\n",
51
  chunk_size = 250,
@@ -57,62 +126,61 @@ def get_chunks(text):
57
  return chunks
58
 
59
 
60
- pdf_path = './100 Weird Facts About the Human Body.pdf'
61
 
62
 
63
- reader = PdfReader(pdf_path)
64
- text = ""
65
- num_of_pages = len(reader.pages)
66
 
67
- for page in range(num_of_pages):
68
- current_page = reader.pages[page]
69
- text += current_page.extract_text()
70
 
71
 
72
- chunks = get_chunks(text)
73
- print(chunks)
74
- print("Chunks are ready.....................................")
75
- print("######################################################")
76
 
77
- client = QdrantClient(path = "./db")
78
- print("db created................................................")
79
- print("#####################################################################")
80
 
81
- client.recreate_collection(
82
- collection_name="my_facts",
83
- vectors_config=models.VectorParams(
84
- size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
85
- distance=models.Distance.COSINE,
86
- ),
87
- )
88
 
89
- print("Collection created........................................")
90
- print("#########################################################")
91
 
92
 
93
 
94
- li = []
95
- for i in range(len(chunks)):
96
- li.append(i)
97
 
98
- dic = zip(li, chunks)
99
- dic= dict(dic)
100
-
101
- client.upload_records(
102
- collection_name="my_facts",
103
- records=[
104
- models.Record(
105
- id=idx,
106
- vector=encoder.encode(dic[idx]).tolist(),
107
- payload= {dic[idx][:5] : dic[idx]}
108
- ) for idx in dic.keys()
109
- ],
110
- )
111
 
112
- print("Records uploaded........................................")
113
- print("###########################################################")
114
 
115
- def chat(question):
116
 
117
  hits = client.search(
118
  collection_name="my_facts",
@@ -148,7 +216,7 @@ def chat(question):
148
 
149
  screen = gr.Interface(
150
  fn = chat,
151
- inputs = gr.Textbox(lines = 10, placeholder = "Enter your question here πŸ‘‰"),
152
  outputs = gr.Textbox(lines = 10, placeholder = "Your answer will be here soon πŸš€"),
153
  title="Q&A with PDF πŸ‘©πŸ»β€πŸ’»πŸ““βœπŸ»πŸ’‘",
154
  description="This app facilitates a conversation with PDFs available on https://www.delo.si/assets/media/other/20110728/100%20Weird%20Facts%20About%20the%20Human%20Body.pdfπŸ’‘",
 
1
  import gradio as gr
2
+ from gradio_pdf import PDF
3
  from qdrant_client import models, QdrantClient
4
  from sentence_transformers import SentenceTransformer
5
  from PyPDF2 import PdfReader
 
46
  print("LLM loaded........................................")
47
  print("################################################################")
48
 
49
+ # def get_chunks(text):
50
+ # text_splitter = RecursiveCharacterTextSplitter(
51
+ # # seperator = "\n",
52
+ # chunk_size = 250,
53
+ # chunk_overlap = 50,
54
+ # length_function = len,
55
+ # )
56
+
57
+ # chunks = text_splitter.split_text(text)
58
+ # return chunks
59
+
60
+
61
+ # pdf_path = './100 Weird Facts About the Human Body.pdf'
62
+
63
+
64
+ # reader = PdfReader(pdf_path)
65
+ # text = ""
66
+ # num_of_pages = len(reader.pages)
67
+
68
+ # for page in range(num_of_pages):
69
+ # current_page = reader.pages[page]
70
+ # text += current_page.extract_text()
71
+
72
+
73
+ # chunks = get_chunks(text)
74
+ # print(chunks)
75
+ # print("Chunks are ready.....................................")
76
+ # print("######################################################")
77
+
78
+ # client = QdrantClient(path = "./db")
79
+ # print("db created................................................")
80
+ # print("#####################################################################")
81
+
82
+ # client.recreate_collection(
83
+ # collection_name="my_facts",
84
+ # vectors_config=models.VectorParams(
85
+ # size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
86
+ # distance=models.Distance.COSINE,
87
+ # ),
88
+ # )
89
+
90
+ # print("Collection created........................................")
91
+ # print("#########################################################")
92
+
93
+
94
+
95
+ # li = []
96
+ # for i in range(len(chunks)):
97
+ # li.append(i)
98
+
99
+ # dic = zip(li, chunks)
100
+ # dic= dict(dic)
101
+
102
+ # client.upload_records(
103
+ # collection_name="my_facts",
104
+ # records=[
105
+ # models.Record(
106
+ # id=idx,
107
+ # vector=encoder.encode(dic[idx]).tolist(),
108
+ # payload= {dic[idx][:5] : dic[idx]}
109
+ # ) for idx in dic.keys()
110
+ # ],
111
+ # )
112
+
113
+ # print("Records uploaded........................................")
114
+ # print("###########################################################")
115
+
116
+ def chat(file, question):
117
+ def get_chunks(text):
118
  text_splitter = RecursiveCharacterTextSplitter(
119
  # seperator = "\n",
120
  chunk_size = 250,
 
126
  return chunks
127
 
128
 
129
+ pdf_path = './100 Weird Facts About the Human Body.pdf'
130
 
131
 
132
+ reader = PdfReader(pdf_path)
133
+ text = ""
134
+ num_of_pages = len(reader.pages)
135
 
136
+ for page in range(num_of_pages):
137
+ current_page = reader.pages[page]
138
+ text += current_page.extract_text()
139
 
140
 
141
+ chunks = get_chunks(text)
142
+ # print(chunks)
143
+ # print("Chunks are ready.....................................")
144
+ # print("######################################################")
145
 
146
+ client = QdrantClient(path = "./db")
147
+ # print("db created................................................")
148
+ # print("#####################################################################")
149
 
150
+ client.recreate_collection(
151
+ collection_name="my_facts",
152
+ vectors_config=models.VectorParams(
153
+ size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
154
+ distance=models.Distance.COSINE,
155
+ ),
156
+ )
157
 
158
+ # print("Collection created........................................")
159
+ # print("#########################################################")
160
 
161
 
162
 
163
+ li = []
164
+ for i in range(len(chunks)):
165
+ li.append(i)
166
 
167
+ dic = zip(li, chunks)
168
+ dic= dict(dic)
169
+
170
+ client.upload_records(
171
+ collection_name="my_facts",
172
+ records=[
173
+ models.Record(
174
+ id=idx,
175
+ vector=encoder.encode(dic[idx]).tolist(),
176
+ payload= {dic[idx][:5] : dic[idx]}
177
+ ) for idx in dic.keys()
178
+ ],
179
+ )
180
 
181
+ # print("Records uploaded........................................")
182
+ # print("###########################################################")
183
 
 
184
 
185
  hits = client.search(
186
  collection_name="my_facts",
 
216
 
217
  screen = gr.Interface(
218
  fn = chat,
219
+ inputs = [PDF(label="Upload a PDF", interactive=True), gr.Textbox(lines = 10, placeholder = "Enter your question here πŸ‘‰")],
220
  outputs = gr.Textbox(lines = 10, placeholder = "Your answer will be here soon πŸš€"),
221
  title="Q&A with PDF πŸ‘©πŸ»β€πŸ’»πŸ““βœπŸ»πŸ’‘",
222
  description="This app facilitates a conversation with PDFs available on https://www.delo.si/assets/media/other/20110728/100%20Weird%20Facts%20About%20the%20Human%20Body.pdfπŸ’‘",