That1BrainCell committed · verified
Commit fff4b44 · Parent: c980209

New Changes
Files changed (3)
  1. embedding.py +262 -262
  2. infridgement_score.py +174 -0
  3. preprocess.py +184 -168
embedding.py CHANGED
@@ -1,263 +1,263 @@
  from PyPDF2 import PdfReader
  import requests
  import json
  import os
  import concurrent.futures
  import random
  from langchain_google_genai import ChatGoogleGenerativeAI
  from langchain_community.document_loaders import WebBaseLoader
  from langchain_community.document_loaders import PyPDFLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  import google.generativeai as genai
  from io import BytesIO
  
  
  
  gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
  gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
  gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
  gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
  
  genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
  
  
  def pdf_extractor(link):
      text = ''
  
      try:
          # Fetch the PDF file from the URL
          response = requests.get(link)
          response.raise_for_status()  # Raise an error for bad status codes
  
          # Use BytesIO to handle the PDF content in memory
          pdf_file = BytesIO(response.content)
  
          # Load the PDF file
          reader = PdfReader(pdf_file)
          for page in reader.pages:
              text += page.extract_text()  # Extract text from each page
  
      except requests.exceptions.HTTPError as e:
          print(f'HTTP error occurred: {e}')
      except Exception as e:
          print(f'An error occurred: {e}')
  
      return [text]
  
  def web_extractor(link):
      text = ''
  
      try:
          loader = WebBaseLoader(link)
          pages = loader.load_and_split()
  
          for page in pages:
              text += page.page_content
      except:
          pass
  
      return [text]
  
  
  def feature_extraction(tag, history, context):
  
      prompt = f'''
      You are an intelligent assistant tasked with updating product information. You have two data sources:
      1. Tag_History: Previously gathered information about the product.
      2. Tag_Context: New data that might contain additional details.
      Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
      Guidelines:
      - Only add new details that are relevant to the {tag} FIELD.
      - Do not add or modify any other fields in the Tag_History.
      - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
      Here is the data:
      Tag_Context: {str(context)}
      Tag_History: {history}
      Respond with the updated Tag_History.
      '''
  
      # model = random.choice([gemini,gemini1,gemini2,gemini3])
      result = gemini1.invoke(prompt)
  
      return result.content
  
  def detailed_feature_extraction(find, context):
  
      prompt = f'''
      You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
      1. Context: The gathered information about the product.
      2. Format: Details which need to be filled based on Context.
      Your job is to read the Context and update the relevant field in Format using Context.
      Guidelines:
      - Only add details that are relevant to the individual FIELD.
      - Do not add or modify any other fields in the Format.
      - If nothing found return None.
      Here is the data:
      The Context is {str(context)}
      The Format is {str(find)}
      '''
  
      model = random.choice([gemini,gemini1,gemini2,gemini3])
      result = model.invoke(prompt)
  
      return result.content
  
  def detailed_history(history):
  
      details = {
          "Introduction": {
              "Product Name": None,
              "Overview of the product": None,
              "Purpose of the manual": None,
              "Audience": None,
              "Additional Details": None
          },
          "Specifications": {
              "Technical specifications": None,
              "Performance metrics": None,
              "Additional Details": None
          },
          "Product Overview": {
              "Product features": None,
              "Key components and parts": None,
              "Additional Details": None
          },
          "Safety Information": {
              "Safety warnings and precautions": None,
              "Compliance and certification information": None,
              "Additional Details": None
          },
          "Installation Instructions": {
              "Unboxing and inventory checklist": None,
              "Step-by-step installation guide": None,
              "Required tools and materials": None,
              "Additional Details": None
          },
          "Setup and Configuration": {
              "Initial setup procedures": None,
              "Configuration settings": None,
              "Troubleshooting setup issues": None,
              "Additional Details": None
          },
          "Operation Instructions": {
              "How to use the product": None,
              "Detailed instructions for different functionalities": None,
              "User interface guide": None,
              "Additional Details": None
          },
          "Maintenance and Care": {
              "Cleaning instructions": None,
              "Maintenance schedule": None,
              "Replacement parts and accessories": None,
              "Additional Details": None
          },
          "Troubleshooting": {
              "Common issues and solutions": None,
              "Error messages and their meanings": None,
              "Support Information": None,
              "Additional Details": None
          },
          "Warranty Information": {
              "Terms and Conditions": None,
              "Service and repair information": None,
              "Additional Details": None
          },
          "Legal Information": {
              "Copyright information": None,
              "Trademarks and patents": None,
              "Disclaimers": None,
              "Additional Details": None
          }
      }
  
      for key, val in history.items():
  
          find = details[key]
  
          details[key] = str(detailed_feature_extraction(find, val))
  
      return details
  
  
  def get_embeddings(link, tag_option):
  
      print(f"\nCreating Embeddings ----- {link}")
  
-     if tag_option=='Single':
+     if tag_option=='Complete Document Similarity':
          history = { "Details": "" }
  
      else:
          history = {
              "Introduction": "",
              "Specifications": "",
              "Product Overview": "",
              "Safety Information": "",
              "Installation Instructions": "",
              "Setup and Configuration": "",
              "Operation Instructions": "",
              "Maintenance and Care": "",
              "Troubleshooting": "",
              "Warranty Information": "",
              "Legal Information": ""
          }
  
      # Extract Text -----------------------------
      print("Extracting Text")
      if link[-3:] == '.md' or link[8:11] == 'en.':
          text = web_extractor(link)
      else:
          text = pdf_extractor(link)
  
      # Create Chunks ----------------------------
      print("Writing Tag Data")
  
-     if tag_option=="Single":
+     if tag_option=="Complete Document Similarity":
          history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
  
      else:
          chunks = text_splitter.create_documents(text)
  
          for chunk in chunks:
  
              with concurrent.futures.ThreadPoolExecutor() as executor:
                  future_to_key = {
                      executor.submit(
                          feature_extraction, f"Product {key}", history[key], chunk.page_content
                      ): key for key in history
                  }
                  for future in concurrent.futures.as_completed(future_to_key):
                      key = future_to_key[future]
                      try:
                          response = future.result()
                          history[key] = response
                      except Exception as e:
                          print(f"Error processing {key}: {e}")
  
      print("Creating Vectors")
      genai_embeddings = []
  
      for tag in history:
          result = genai.embed_content(
              model="models/embedding-001",
              content=history[tag],
              task_type="retrieval_document")
          genai_embeddings.append(result['embedding'])
  
  
      return history, genai_embeddings
  
  global text_splitter
  global data
  global history
  
  
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 10000,
      chunk_overlap = 100,
      separators = ["",''," "]
  )
  
  if __name__ == '__main__':
      # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Single"))
      pass
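
For reference, a minimal sketch of driving this module's entry point on its own. This is hypothetical usage, not part of the commit: it assumes the API keys above are live, reuses the manual URL from the commented-out example, and relies on models/embedding-001 returning 768-dimensional vectors.

    # sketch: standalone driver for embedding.py (assumes valid API keys)
    from embedding import get_embeddings

    url = 'https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'

    # 'Complete Document Similarity' yields one "Details" summary and one vector;
    # any other tag_option yields one summary and one vector per manual section.
    history, embeddings = get_embeddings(url, 'Complete Document Similarity')

    print(list(history))       # -> ['Details']
    print(len(embeddings))     # -> 1 (one embedding per tag in history)
    print(len(embeddings[0]))  # -> 768, the models/embedding-001 vector size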
infridgement_score.py ADDED
@@ -0,0 +1,174 @@
+ import streamlit as st
+ import concurrent.futures
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from functools import partial
+ import numpy as np
+ from io import StringIO
+ import sys
+ import time
+ 
+ # File Imports
+ from embedding import get_embeddings  # Ensure this file/module is available
+ from preprocess import filtering      # Ensure this file/module is available
+ from search import *
+ 
+ # Cosine Similarity Function
+ def cosine_similarity(vec1, vec2):
+     vec1 = np.array(vec1)
+     vec2 = np.array(vec2)
+ 
+     dot_product = np.dot(vec1, vec2)
+     magnitude_vec1 = np.linalg.norm(vec1)
+     magnitude_vec2 = np.linalg.norm(vec2)
+ 
+     if magnitude_vec1 == 0 or magnitude_vec2 == 0:
+         return 0.0
+ 
+     cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
+     return cosine_sim
+ 
+ # Logger class to capture output
+ class StreamCapture:
+     def __init__(self):
+         self.output = StringIO()
+         self._stdout = sys.stdout
+ 
+     def __enter__(self):
+         sys.stdout = self.output
+         return self.output
+ 
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         sys.stdout = self._stdout
+ 
+ # Main Function
+ def score(main_product, main_url, product_count, link_count, search, logger, log_area):
+     data = {}
+     similar_products = extract_similar_products(main_product)[:product_count]
+ 
+     if search == 'All':
+ 
+         def process_product(product, search_function, main_product):
+             search_result = search_function(product)
+             return filtering(search_result, main_product, product, link_count)
+ 
+         search_functions = {
+             'google': search_google,
+             'duckduckgo': search_duckduckgo,
+             # 'archive': search_archive,
+             'github': search_github,
+             'wikipedia': search_wikipedia
+         }
+ 
+         with ThreadPoolExecutor() as executor:
+             future_to_product_search = {
+                 executor.submit(process_product, product, search_function, main_product): (product, search_name)
+                 for product in similar_products
+                 for search_name, search_function in search_functions.items()
+             }
+ 
+             for future in as_completed(future_to_product_search):
+                 product, search_name = future_to_product_search[future]
+                 try:
+                     if product not in data:
+                         data[product] = {}
+                     data[product] = future.result()
+                 except Exception as e:
+                     print(f"Error processing product {product} with {search_name}: {e}")
+ 
+     else:
+ 
+         for product in similar_products:
+ 
+             if search == 'google':
+                 data[product] = filtering(search_google(product), main_product, product, link_count)
+             elif search == 'duckduckgo':
+                 data[product] = filtering(search_duckduckgo(product), main_product, product, link_count)
+             elif search == 'archive':
+                 data[product] = filtering(search_archive(product), main_product, product, link_count)
+             elif search == 'github':
+                 data[product] = filtering(search_github(product), main_product, product, link_count)
+             elif search == 'wikipedia':
+                 data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
+ 
+     logger.write("\n\nFiltered Links ------------------>\n")
+     logger.write(str(data) + "\n")
+     log_area.text(logger.getvalue())
+ 
+     logger.write("\n\nCreating Main product Embeddings ---------->\n")
+     main_result, main_embedding = get_embeddings(main_url, tag_option)
+     log_area.text(logger.getvalue())
+ 
+     print("main", main_embedding)
+ 
+     cosine_sim_scores = []
+ 
+     logger.write("\n\nCreating Similar product Embeddings ---------->\n")
+     log_area.text(logger.getvalue())
+ 
+     for product in data:
+ 
+         if len(data[product]) == 0:
+             logger.write("\n\nNo Product links Found Increase No of Links or Change Search Source\n")
+             log_area.text(logger.getvalue())
+ 
+             cosine_sim_scores.append((product, 'No Product links Found Increase Number of Links or Change Search Source', None, None))
+ 
+         else:
+             for link in data[product][:link_count]:
+ 
+                 similar_result, similar_embedding = get_embeddings(link, tag_option)
+                 log_area.text(logger.getvalue())
+ 
+                 print(similar_embedding)
+                 for i in range(len(main_embedding)):
+                     score = cosine_similarity(main_embedding[i], similar_embedding[i])
+                     cosine_sim_scores.append((product, link, i, score))
+                 log_area.text(logger.getvalue())
+ 
+     logger.write("--------------- DONE -----------------\n")
+     log_area.text(logger.getvalue())
+     return cosine_sim_scores, main_result
+ 
+ # Streamlit Interface
+ st.title("Check Infringement")
+ 
+ 
+ # Inputs
+ main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
+ main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
+ search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
+ 
+ col1, col2 = st.columns(2)
+ with col1:
+     product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
+ with col2:
+     link_count = st.number_input("Number of Links per product", min_value=1, step=1, format="%i")
+ 
+ 
+ tag_option = st.selectbox('Choose Similarity Method', ["Complete Document Similarity", "Field Wise Document Similarity"])
+ 
+ 
+ if st.button('Check for Infringement'):
+     log_output = st.empty()  # Placeholder for log output
+ 
+     with st.spinner('Processing...'):
+         with StreamCapture() as logger:
+             cosine_sim_scores, main_result = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
+ 
+     st.success('Processing complete!')
+ 
+     st.subheader("Cosine Similarity Scores")
+ 
+     # = score(main_product, main_url, search, logger, log_output)
+     if tag_option == 'Complete Document Similarity':
+         tags = ['Details']
+     else:
+         tags = ['Introduction', 'Specifications', 'Product Overview', 'Safety Information', 'Installation Instructions', 'Setup and Configuration', 'Operation Instructions', 'Maintenance and Care', 'Troubleshooting', 'Warranty Information', 'Legal Information']
+ 
+     for product, link, index, value in cosine_sim_scores:
+         if not index:  # index is 0 or None: start of a new product/link block
+             st.write(f"Product: {product}, Link: {link}")
+         if value is not None:
+             st.write(f"{tags[index]:<20} - Similarity: {value:.2f}")
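
score() returns cosine_sim_scores as (product, link, tag_index, similarity) tuples, which the display loop above unpacks. A small offline sketch of consuming that shape, with fabricated example tuples (not real output), shows how the `if not index` guard opens a new block per product/link:

    # Offline sketch of the result format (example tuples are made up).
    cosine_sim_scores = [
        ('Acme LED 7W', 'https://example.com/acme-manual.pdf', 0, 0.91),
        ('NoLinks Product', 'No Product links Found Increase Number of Links or Change Search Source', None, None),
    ]
    tags = ['Details']  # Complete Document Similarity mode

    for product, link, index, value in cosine_sim_scores:
        if not index:  # index 0 or None marks the first row for a product/link
            print(f"Product: {product}, Link: {link}")
        if value is not None:
            print(f"{tags[index]:<20} - Similarity: {value:.2f}")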
preprocess.py CHANGED
@@ -1,168 +1,184 @@
  import requests
  import json
  import random
  import concurrent.futures
  from concurrent.futures import ThreadPoolExecutor
  from langchain_community.document_loaders import PyPDFLoader
  from langdetect import detect_langs
  import requests
  from PyPDF2 import PdfReader
  from io import BytesIO
  from langchain_community.document_loaders import WebBaseLoader
  from langchain_google_genai import ChatGoogleGenerativeAI
  import logging
  
  data = False
  seen = set()
  
  # API Urls -----
  
  # main_url = "http://127.0.0.1:5000/search/all"
  main_url = "http://127.0.0.1:8000/search/all"
  # main_product = "Samsung Galaxy s23 ultra"
  
  # Relevance Checking Models -----
  gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
  gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
  gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
  gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
  
  
- # API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
- # headers = {"Authorization": "Bearer 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"}
+ API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
+ headers = {"Authorization": "Bearer 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"}
  
  # Error Debug
  logging.basicConfig(level=logging.INFO)
  
  
  def get_links(main_product, api_key):
      params = {
          "API_KEY": f"{api_key}",
          "product": f"{main_product}",
      }
  
      # Flask
      response = requests.get(main_url, params=params)
  
      # FastAPI
      # response = requests.post(main_url, json=params)
  
  
      if response.status_code == 200:
          results = response.json()
          with open('data.json', 'w') as f:
              json.dump(results, f)
      else:
          print(f"Failed to fetch results: {response.status_code}")
  
  
  
  def language_preprocess(text):
      try:
          if detect_langs(text)[0].lang == 'en':
              return True
          return False
      except:
          return False
  
  
  def relevant(product, similar_product, content):
  
      try:
          payload = { "inputs": f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}'''}
  
          # response = requests.post(API_URL, headers=headers, json=payload)
          # output = response.json()
          # return bool(output[0]['generated_text'])
  
          model = random.choice([gemini,gemini1,gemini2,gemini3])
          result = model.invoke(f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}''')
          return bool(result)
  
      except:
          return False
  
  
  
  def download_pdf(url, timeout=10):
      try:
          response = requests.get(url, timeout=timeout)
          response.raise_for_status()
          return BytesIO(response.content)
  
      except requests.RequestException as e:
          logging.error(f"PDF download error: {e}")
          return None
  
  def extract_text_from_pdf(pdf_file, pages):
      reader = PdfReader(pdf_file)
      extracted_text = ""
  
      l = len(reader.pages)
  
      try:
          for page_num in pages:
              if page_num < l:
                  page = reader.pages[page_num]
                  extracted_text += page.extract_text() + "\n"
              else:
                  print(f"Page {page_num} does not exist in the document.")
  
          return extracted_text
  
      except:
          return 'हे चालत नाही'  # Marathi: "this doesn't work"
  
  def extract_text_online(link):
  
      loader = WebBaseLoader(link)
      pages = loader.load_and_split()
  
      text = ''
  
      for page in pages[:3]:
          text += page.page_content
  
      return text
  
  
  def process_link(link, main_product, similar_product):
      if link in seen:
          return None
      seen.add(link)
      try:
          if link[-3:]=='.md' or link[8:11] == 'en.':
              text = extract_text_online(link)
          else:
              pdf_file = download_pdf(link)
              text = extract_text_from_pdf(pdf_file, [0, 2, 4])
  
          if language_preprocess(text):
              if relevant(main_product, similar_product, text):
                  print("Accepted", link)
                  return link
      except:
          pass
      print("NOT Accepted", link)
      return None
  
- def filtering(urls, main_product, similar_product):
+ def filtering(urls, main_product, similar_product, link_count):
      res = []
  
-     print(f"Filtering Links of ---- {similar_product}")
-     # Main Preprocess ------------------------------
-     with ThreadPoolExecutor() as executor:
-         futures = {executor.submit(process_link, link, main_product, similar_product): link for link in urls}
-         for future in concurrent.futures.as_completed(futures):
-             result = future.result()
-             if result is not None:
-                 res.append(result)
- 
-     return res
+     # print(f"Filtering Links of ---- {similar_product}")
+     # Main Preprocess ------------------------------
+     # with ThreadPoolExecutor() as executor:
+     #     futures = {executor.submit(process_link, link, main_product, similar_product): link for link in urls}
+     #     for future in concurrent.futures.as_completed(futures):
+     #         result = future.result()
+     #         if result is not None:
+     #             res.append(result)
+ 
+     # return res
+ 
+     count = 0
+ 
+     print(f"Filtering Links of ---- {similar_product}")
+ 
+     for link in urls:
+         result = process_link(link, main_product, similar_product)
+ 
+         if result is not None:
+             res.append(result)
+             count += 1
+ 
+         if count == link_count:
+             break
+ 
+     return res
  
  
  # Main Functions -------------------------------------------------->
  
  # get_links()
  # preprocess()
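
With the new signature, filtering processes URLs sequentially and stops after link_count accepted links. A quick sketch of calling it directly (placeholder URLs and product names; it assumes preprocess.py imports cleanly, since the Gemini clients at the top are constructed at import time):

    # Sketch of the changed filtering() signature (placeholder inputs).
    from preprocess import filtering

    urls = [
        'https://example.com/manuals/led-7w.pdf',   # placeholder links
        'https://en.example.com/products/led-7w',
    ]

    # Returns at most link_count accepted links, in input order; acceptance means
    # English text (language_preprocess) plus the Gemini relevance check.
    accepted = filtering(urls, 'Philips led 7w bulb', 'Generic 7W LED bulb', link_count=1)
    print(accepted)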