robertselvam committed on
Commit
dce8568
·
1 Parent(s): 46d1b2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +243 -109
app.py CHANGED
@@ -18,23 +18,7 @@ import tempfile
18
  import pandas as pd
19
  import re
20
 
21
-
22
- # Create and Declare Global Varibale "result"
23
-
24
-
25
- class ChemicalIdentifier:
26
-
27
- def __init__(self):
28
-
29
- openai.api_key = os.getenv("OPENAI_API_KEY")
30
- self.logger = logging.getLogger("ChemicalIdentifier")
31
- self.logger.setLevel(logging.DEBUG)
32
- formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
33
- console_handler = logging.StreamHandler()
34
- console_handler.setLevel(logging.DEBUG)
35
- console_handler.setFormatter(formatter)
36
- self.logger.addHandler(console_handler)
37
-
38
  def get_empty_state(self):
39
 
40
  """ Create empty Knowledge base"""
@@ -52,63 +36,54 @@ class ChemicalIdentifier:
52
  ValueError: If the URL is not valid or the file cannot be fetched.
53
  """
54
 
55
- try:
56
- if validators.url(url):
57
- headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
58
- r = requests.get(url,headers=headers)
59
- if r.status_code != 200:
60
- raise ValueError(
61
- "Check the url of your file; returned status code %s" % r.status_code
62
- )
63
-
64
- content_type = r.headers.get("content-type")
65
- file_extension = mimetypes.guess_extension(content_type)
66
- temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
67
- temp_file.write(r.content)
68
- file_path = temp_file.name
69
- loader = UnstructuredFileLoader(file_path, strategy="fast")
70
- docs = loader.load()
71
- return docs
72
- else:
73
- raise ValueError("Please enter a valid URL")
74
- except Exception as e:
75
- self.logger.error("Error occurred while uploading the file: %s", str(e))
76
- raise ValueError("Error occurred while uploading the file") from e
77
 
 
78
 
79
- def extract_chemical_names(self,text:str)->str:
80
- """
81
- Extracts chemical names from the given text.
82
  Args:
83
- text (str): The text to extract chemical names from.
84
  Returns:
85
- str: The extracted chemical names in bullet form.
86
- Raises:
87
- ValueError: If an error occurs during the extraction process.
88
  """
89
 
90
- try:
91
- prompt = f"Identify the Chemical Names Only give text in bullet form {text}. Don't Generate any extra chemicals apart from given text"
92
- response = openai.Completion.create(
93
- model="text-davinci-003",
94
- prompt=prompt,
95
- temperature=0,
96
- max_tokens=500,
97
- top_p=1,
98
- frequency_penalty=0,
99
- presence_penalty=0,
100
- )
101
-
102
- message = response.choices[0].text.strip()
103
- if ":" in message:
104
- message = re.sub(r'^.*:', '', message)
105
- return message.strip()
106
- except Exception as e:
107
- self.logger.error("Error occurred while finding chemicals: %s", str(e))
108
- raise ValueError("Error occurred while finding chemicals") from e
109
-
110
-
111
- def get_chemicals_for_url(self,urls:str)->str:
112
  """
113
  Retrieves chemicals from the provided URLs.
114
 
@@ -122,18 +97,144 @@ class ChemicalIdentifier:
122
  ValueError: If an error occurs during the process.
123
  """
124
 
125
- try:
126
- total_chemical=[]
127
- for url in urls.split(','):
128
- webpage_text = self.get_content_from_url(url)
129
- chemicals = self.extract_chemical_names(webpage_text)
130
- total_chemical.append(str(url)+"\n"+chemicals+"\n\n")
131
- list_of_chemicals = "".join(total_chemical)
132
- return list_of_chemicals
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- except Exception as e:
135
- self.logger.error("Error occurred while getting chemicals from URLs: %s", str(e))
136
- raise ValueError("Error occurred while getting chemicals from URLs") from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  def create_knowledge_base(self,docs):
139
 
@@ -163,35 +264,58 @@ class ChemicalIdentifier:
163
  # Return the resulting knowledge base
164
  return knowledge_base
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def file_path_show(self,file_paths):
167
- file_paths = [single_file_path.name for single_file_path in file_paths]
168
- return file_paths
169
 
170
  def get_chemicals_for_file(self,state):
171
-
172
  knowledge_base = state["knowledge_base"]
173
-
174
  # Set the question for which we want to find the answer
175
- question = "Identify the Chemical Capabilities Only"
176
 
177
  # Perform a similarity search on the knowledge base to retrieve relevant documents
178
  docs = knowledge_base.similarity_search(question)
179
 
180
  # Initialize an OpenAI language model for question answering
181
- template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
182
- Identify the Chemical Capabilities Only.
183
- {context}
184
- Question :{question}.
185
- The result should be in bullet points format.
186
- """
187
 
188
- prompt = PromptTemplate(template=template,input_variables=["context","question"])
189
 
190
  llm = OpenAI(temperature=0.4)
191
- llm_chain = LLMChain(prompt=prompt, llm=llm)
192
 
193
  # Load a question-answering chain using the language model
194
- chain = load_qa_chain(llm, chain_type="stuff",prompt=prompt)
195
 
196
  # Run the question-answering chain on the input documents and question
197
  response = chain.run(input_documents=docs, question=question)
@@ -209,7 +333,7 @@ class ChemicalIdentifier:
209
 
210
 
211
  file_paths = [single_file_path.name for single_file_path in file_paths]
212
- collection_of_results = []
213
  for file_obj in file_paths:
214
 
215
  loader = UnstructuredFileLoader(file_obj, strategy="fast")
@@ -221,24 +345,24 @@ class ChemicalIdentifier:
221
  knowledge_base = self.create_knowledge_base(docs)
222
  state = {"knowledge_base": knowledge_base}
223
  pdf_name = os.path.basename(file_obj)
 
224
  final_ans = self.get_chemicals_for_file(state)
225
- response = pdf_name+"\n"+final_ans+"\n\n"
226
- collection_of_results.append(response)
227
  # Return the combined file-name and answer text for every processed file
228
- results = "".join(collection_of_results)
229
  return results
230
 
231
- def get_final_result(self,urls,file_paths,state,progress=gr.Progress()):
232
-
233
  if urls:
234
  if file_paths:
235
- urls_chemicals = self.get_chemicals_for_url(urls)
236
  file_chemicals = self.identify_chemicals_in_files(file_paths,state)
237
  chemicals = urls_chemicals + file_chemicals
238
 
239
  return chemicals
240
  else:
241
- urls_chemicals = self.get_chemicals_for_url(urls)
242
  return urls_chemicals
243
  elif file_paths:
244
  file_chemicals = self.identify_chemicals_in_files(file_paths,state)
@@ -253,10 +377,10 @@ class ChemicalIdentifier:
253
  """
254
 
255
  with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
256
- gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'><center><h1 class ="center">
257
- <img src="https://hudsonandhayes.co.uk/wp-content/uploads/2023/01/Group-479.svg" height="110px" width="280px"></h1></center>
258
- <br><h1 style="color:#fff">Chemical Capability Identifier</h1></center>""")
259
  state = gr.State(self.get_empty_state())
 
 
260
  with gr.Column(elem_id="col-container"):
261
  with gr.Row(elem_id="row-flex"):
262
  url = gr.Textbox(label="URL")
@@ -271,20 +395,30 @@ class ChemicalIdentifier:
271
  file_count = "multiple",variant="primary")
272
  with gr.Row():
273
  with gr.Column(scale=1, min_width=0):
274
- compare_btn = gr.Button(value="Generate Analysis",variant="primary")
 
 
 
 
275
  with gr.Row():
276
  with gr.Column(scale=1, min_width=0):
277
- compared_result = gr.Textbox(value="",label='Chemical Capabilities :',show_label=True, placeholder="",lines=10)
 
 
 
 
 
 
278
 
279
  upload_button.upload(self.file_path_show, upload_button, [file_output])
280
 
281
  compare_btn.click(self.get_final_result,[url,upload_button,state],compared_result)
282
-
283
- demo.launch()
 
284
 
285
 
286
  if __name__ == "__main__":
287
 
288
- logging.basicConfig(level=logging.DEBUG)
289
  chemical_identifier = ChemicalIdentifier()
290
  chemical_identifier.gradio_interface()
 
18
  import pandas as pd
19
  import re
20
 
21
+ class DocumentQA:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def get_empty_state(self):
23
 
24
  """ Create empty Knowledge base"""
 
36
  ValueError: If the URL is not valid or the file cannot be fetched.
37
  """
38
 
39
+ if validators.url(url):
40
+ headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
41
+ r = requests.get(url,headers=headers)
42
+ if r.status_code != 200:
43
+ raise ValueError(
44
+ "Check the url of your file; returned status code %s" % r.status_code
45
+ )
46
+
47
+ content_type = r.headers.get("content-type")
48
+ file_extension = mimetypes.guess_extension(content_type)
49
+ temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
50
+ temp_file.write(r.content)
51
+ file_path = temp_file.name
52
+ loader = UnstructuredFileLoader(file_path, strategy="fast")
53
+ docs = loader.load()
54
+ return docs
55
+ else:
56
+ raise ValueError("Please enter a valid URL")
 
 
 
 
57
 
58
+ def create_knowledge_base(self,docs):
59
 
60
+ """Create a knowledge base from the given documents.
 
 
61
  Args:
62
+ docs (List[str]): List of documents.
63
  Returns:
64
+ FAISS: Knowledge base built from the documents.
 
 
65
  """
66
 
67
+ # Initialize a CharacterTextSplitter to split the documents into chunks
68
+ # Each chunk has a maximum length of 500 characters
69
+ # Consecutive chunks overlap by up to 100 characters (chunk_overlap=100)
70
+ text_splitter = CharacterTextSplitter(
71
+ separator="\n", chunk_size=500, chunk_overlap=100, length_function=len
72
+ )
73
+
74
+ # Split the documents into chunks using the text_splitter
75
+ chunks = text_splitter.split_documents(docs)
76
+
77
+ # Initialize an OpenAIEmbeddings model to compute embeddings of the chunks
78
+ embeddings = OpenAIEmbeddings()
79
+
80
+ # Build a knowledge base using FAISS from the chunks and their embeddings
81
+ knowledge_base = FAISS.from_documents(chunks, embeddings)
82
+
83
+ # Return the resulting knowledge base
84
+ return knowledge_base
85
+
86
+ def get_chemicals_for_url(self,urls:str,state,input_qus)->str:
 
 
87
  """
88
  Retrieves chemicals from the provided URLs.
89
 
 
97
  ValueError: If an error occurs during the process.
98
  """
99
 
100
+ total_chemical=[]
101
+
102
+ for url in urls.split(','):
103
+ webpage_text = self.get_content_from_url(url)
104
+ knowledge_base = self.create_knowledge_base(webpage_text)
105
+ state = {"knowledge_base": knowledge_base}
106
+ chemicals = self.get_chemicals_for_file(state,input_qus)
107
+ total_chemical.append(str(url)+"\n"+chemicals+"\n\n")
108
+ list_of_chemicals = "".join(total_chemical)
109
+
110
+ return list_of_chemicals
111
+
112
+
113
+
114
+ def file_path_show(self,file_paths):
115
+ file_paths = [single_file_path.name for single_file_path in file_paths]
116
+ return file_paths
117
+
118
+ def get_chemicals_for_file(self,state,question):
119
+ knowledge_base = state["knowledge_base"]
120
+ # Set the question for which we want to find the answer
121
+ # question = "Identify the Chemical Capabilities Only"
122
+
123
+ # Perform a similarity search on the knowledge base to retrieve relevant documents
124
+ docs = knowledge_base.similarity_search(question)
125
+
126
+ # Initialize an OpenAI language model for question answering
127
+ template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
128
+ Identify the Chemical Capabilities Only.
129
+ {context}
130
+ Question :{question}.
131
+ The result should be in bullet points format.
132
+ """
133
+
134
+ prompt = PromptTemplate(template=template,input_variables=["context","question"])
135
+
136
+ llm = OpenAI(temperature=0.4)
137
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
138
+
139
+ # Load a question-answering chain using the language model
140
+ chain = load_qa_chain(llm, chain_type="stuff",prompt=prompt)
141
+
142
+ # Run the question-answering chain on the input documents and question
143
+ response = chain.run(input_documents=docs, question=question)
144
+
145
+ # Return the response as the answer to the question
146
+ return response
147
+
148
+ def identify_chemicals_in_files(self,file_paths,state,question):
149
+ """Load each uploaded file, build a knowledge base from it, and collect the answer to the question.
150
+ Args:
151
+ file_paths : The uploaded files to process.
152
+ Returns:
153
+ str: Each file name followed by its answer, concatenated into one string.
154
+ """
155
+
156
+
157
+ file_paths = [single_file_path.name for single_file_path in file_paths]
158
+ results =''
159
+ for file_obj in file_paths:
160
+
161
+ loader = UnstructuredFileLoader(file_obj, strategy="fast")
162
+
163
+ # Load the contents of the file using the loader
164
+ docs =loader.load()
165
+
166
+ # Create a knowledge base from the loaded documents using the create_knowledge_base() method
167
+ knowledge_base = self.create_knowledge_base(docs)
168
+ state = {"knowledge_base": knowledge_base}
169
+ pdf_name = os.path.basename(file_obj)
170
+
171
+ final_ans = self.get_chemicals_for_file(state,question)
172
+ results += pdf_name+"\n"+final_ans+"\n\n"
173
+
174
+ # Return the accumulated file-name and answer text for all files
175
+ return results
176
+
177
+ def get_final_result(self,urls,file_paths,state,input_qus):
178
+
179
+ if urls:
180
+ if file_paths:
181
+ urls_chemicals = self.get_chemicals_for_url(urls,state,input_qus)
182
+ file_chemicals = self.identify_chemicals_in_files(file_paths,state,input_qus)
183
+ chemicals = urls_chemicals + file_chemicals
184
+
185
+ return chemicals
186
+ else:
187
+ urls_chemicals = self.get_chemicals_for_url(urls,state,input_qus)
188
+ return urls_chemicals
189
+ elif file_paths:
190
+ file_chemicals = self.identify_chemicals_in_files(file_paths,state,input_qus)
191
+ return file_chemicals
192
+ else:
193
+ return "No Files Uploaded"
194
+
195
+ document_qa = DocumentQA()
196
+ class ChemicalIdentifier:
197
+
198
+ def __init__(self):
199
+ openai.api_key = os.getenv("OPENAI_API_KEY")
200
+ # os.environ['OPENAI_API_KEY'] = openai_api_key
201
+
202
+ def get_empty_state(self):
203
+
204
+ """ Create empty Knowledge base"""
205
+
206
+ return {"knowledge_base": None}
207
+
208
+ def get_content_from_url(self,url:str)->List:
209
+ """
210
+ Downloads the file at the given URL and returns the documents loaded from it.
211
+ Args:
212
+ url (str): The URL of the file to be downloaded.
213
+ Returns:
214
+ List: The documents returned by the loader for the downloaded file.
215
+ Raises:
216
+ ValueError: If the URL is not valid or the file cannot be fetched.
217
+ """
218
+
219
 
220
+ if validators.url(url):
221
+ headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
222
+ r = requests.get(url,headers=headers)
223
+ if r.status_code != 200:
224
+ raise ValueError(
225
+ "Check the url of your file; returned status code %s" % r.status_code
226
+ )
227
+
228
+ content_type = r.headers.get("content-type")
229
+ file_extension = mimetypes.guess_extension(content_type)
230
+ temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
231
+ temp_file.write(r.content)
232
+ file_path = temp_file.name
233
+ loader = UnstructuredFileLoader(file_path, strategy="fast")
234
+ docs = loader.load()
235
+ return docs
236
+ else:
237
+ raise ValueError("Please enter a valid URL")
238
 
239
  def create_knowledge_base(self,docs):
240
 
 
264
  # Return the resulting knowledge base
265
  return knowledge_base
266
 
267
+
268
+ def get_chemicals_for_url(self,urls:str,state)->str:
269
+ """
270
+ Retrieves chemicals from the provided URLs.
271
+
272
+ Args:
273
+ urls (str): Comma-separated URLs of the files to be processed.
274
+
275
+ Returns:
276
+ str: The extracted chemical names.
277
+
278
+ Raises:
279
+ ValueError: If an error occurs during the process.
280
+ """
281
+
282
+ total_chemical=[]
283
+ for url in urls.split(','):
284
+ webpage_text = self.get_content_from_url(url)
285
+ knowledge_base = self.create_knowledge_base(webpage_text)
286
+ state = {"knowledge_base": knowledge_base}
287
+ chemicals = self.get_chemicals_for_file(state)
288
+ total_chemical.append(str(url)+"\n"+chemicals+"\n\n")
289
+ list_of_chemicals = "".join(total_chemical)
290
+ return list_of_chemicals
291
+
292
+
293
  def file_path_show(self,file_paths):
294
+ file_paths = [single_file_path.name for single_file_path in file_paths]
295
+ return file_paths
296
 
297
  def get_chemicals_for_file(self,state):
 
298
  knowledge_base = state["knowledge_base"]
 
299
  # Set the question for which we want to find the answer
300
+ question = "list out chemicals.Result should be in bullet form"
301
 
302
  # Perform a similarity search on the knowledge base to retrieve relevant documents
303
  docs = knowledge_base.similarity_search(question)
304
 
305
  # Initialize an OpenAI language model for question answering
306
+ # template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
307
+ # list out all the chemical names.
308
+ # {context}
309
+ # Question :{question}.
310
+ # The result should be in bullet points format.
311
+ # """
312
 
313
+ # prompt = PromptTemplate(template=template,input_variables=["context","question"])
314
 
315
  llm = OpenAI(temperature=0.4)
 
316
 
317
  # Load a question-answering chain using the language model
318
+ chain = load_qa_chain(llm, chain_type="stuff")
319
 
320
  # Run the question-answering chain on the input documents and question
321
  response = chain.run(input_documents=docs, question=question)
 
333
 
334
 
335
  file_paths = [single_file_path.name for single_file_path in file_paths]
336
+ results =''
337
  for file_obj in file_paths:
338
 
339
  loader = UnstructuredFileLoader(file_obj, strategy="fast")
 
345
  knowledge_base = self.create_knowledge_base(docs)
346
  state = {"knowledge_base": knowledge_base}
347
  pdf_name = os.path.basename(file_obj)
348
+
349
  final_ans = self.get_chemicals_for_file(state)
350
+ results += pdf_name+"\n"+final_ans+"\n\n"
351
+
352
  # Return the accumulated file-name and answer text for all files
 
353
  return results
354
 
355
+ def get_final_result(self,urls,file_paths,state):
356
+
357
  if urls:
358
  if file_paths:
359
+ urls_chemicals = self.get_chemicals_for_url(urls,state)
360
  file_chemicals = self.identify_chemicals_in_files(file_paths,state)
361
  chemicals = urls_chemicals + file_chemicals
362
 
363
  return chemicals
364
  else:
365
+ urls_chemicals = self.get_chemicals_for_url(urls,state)
366
  return urls_chemicals
367
  elif file_paths:
368
  file_chemicals = self.identify_chemicals_in_files(file_paths,state)
 
377
  """
378
 
379
  with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
380
+ gr.HTML("""<center><img src="https://hudsonandhayes.co.uk/wp-content/uploads/2023/01/Group-479.svg" height="110px" width="280px"></center>""")
 
 
381
  state = gr.State(self.get_empty_state())
382
+ gr.HTML("""<center><h1 style="color:#fff">Chemical Identifier</h1></center>""")
383
+
384
  with gr.Column(elem_id="col-container"):
385
  with gr.Row(elem_id="row-flex"):
386
  url = gr.Textbox(label="URL")
 
395
  file_count = "multiple",variant="primary")
396
  with gr.Row():
397
  with gr.Column(scale=1, min_width=0):
398
+ compare_btn = gr.Button(value="Chemicals",variant="primary")
399
+ with gr.Row():
400
+ with gr.Column(scale=1, min_width=0):
401
+ compared_result = gr.Textbox(value="",label='Chemicals :',show_label=True, placeholder="",lines=10)
402
+
403
  with gr.Row():
404
  with gr.Column(scale=1, min_width=0):
405
+ input_qus = gr.Textbox(value="",label='question :',show_label=True, placeholder="")
406
+ with gr.Row():
407
+ with gr.Column(scale=1, min_width=0):
408
+ find_answer = gr.Button(value="Find",label='Find',show_label=True, placeholder="")
409
+ with gr.Row():
410
+ with gr.Column(scale=1, min_width=0):
411
+ output = gr.Textbox(value="",label='output :',show_label=True, placeholder="")
412
 
413
  upload_button.upload(self.file_path_show, upload_button, [file_output])
414
 
415
  compare_btn.click(self.get_final_result,[url,upload_button,state],compared_result)
416
+
417
+ find_answer.click(document_qa.get_final_result,[url,upload_button,state,input_qus],output)
418
+ demo.launch(debug=True)
419
 
420
 
421
  if __name__ == "__main__":
422
 
 
423
  chemical_identifier = ChemicalIdentifier()
424
  chemical_identifier.gradio_interface()