Files changed (1) hide show
  1. preprocess.py +168 -168
preprocess.py CHANGED
@@ -1,168 +1,168 @@
1
- import requests
2
- import json
3
- import random
4
- import concurrent.futures
5
- from concurrent.futures import ThreadPoolExecutor
6
- from langchain_community.document_loaders import PyPDFLoader
7
- from langdetect import detect_langs
8
- import requests
9
- from PyPDF2 import PdfReader
10
- from io import BytesIO
11
- from langchain_community.document_loaders import WebBaseLoader
12
- from langchain_google_genai import ChatGoogleGenerativeAI
13
- import logging
14
-
15
import os

data = False
# Links already handed to process_link; consulted there so that a link is not
# processed a second time.
seen = set()

# API Urls -----

# main_url = "http://127.0.0.1:5000/search/all"
main_url = "http://127.0.0.1:8000/search/all"
# main_product = "Samsung Galaxy s23 ultra"

# Relevance Checking Models -----
# SECURITY: the API keys that used to be hard-coded here were committed to
# source control and must be treated as compromised (revoke and rotate them).
# Keys are now read from the environment: GOOGLE_API_KEY is the primary key,
# and GOOGLE_API_KEY_1..3 optionally supply extra keys for the other clients
# (each falls back to the primary key when unset).
_GEMINI_MODEL = "gemini-1.0-pro-001"
_PRIMARY_GOOGLE_KEY = os.environ.get("GOOGLE_API_KEY")


def _build_gemini(env_var):
    """Create a Gemini chat client using the key stored in ``env_var``."""
    return ChatGoogleGenerativeAI(
        model=_GEMINI_MODEL,
        google_api_key=os.environ.get(env_var, _PRIMARY_GOOGLE_KEY),
        temperature=0.1,
    )


gemini = _build_gemini("GOOGLE_API_KEY")
gemini1 = _build_gemini("GOOGLE_API_KEY_1")
gemini2 = _build_gemini("GOOGLE_API_KEY_2")
gemini3 = _build_gemini("GOOGLE_API_KEY_3")

# Hugging Face Inference API settings; the token comes from the HF_API_TOKEN
# environment variable instead of being embedded in the source.
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
headers = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', '')}"}

# Error Debug
logging.basicConfig(level=logging.INFO)
36
-
37
-
38
def get_links(main_product, api_key, timeout=120):
    """Fetch search results for a product and store them in ``data.json``.

    Sends a GET request to ``main_url`` with the API key and product name as
    query parameters. On HTTP 200 the decoded JSON body is written to
    ``data.json`` in the current working directory; for any other status a
    message is printed and the file is left untouched.

    Args:
        main_product: Product name sent as the ``product`` query parameter.
        api_key: Key sent as the ``API_KEY`` query parameter.
        timeout: Seconds to wait for the search service. Without a timeout
            the call could block forever on an unresponsive server.

    Returns:
        None. The results are only persisted to disk.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    params = {
        "API_KEY": str(api_key),
        "product": str(main_product),
    }

    # Flask
    response = requests.get(main_url, params=params, timeout=timeout)

    # FastAPI
    # response = requests.post(main_url, json=params)

    if response.status_code != 200:
        print(f"Failed to fetch results: {response.status_code}")
        return

    # Decode before opening the file so a malformed body cannot leave a
    # truncated data.json behind.
    results = response.json()
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(results, f)
57
-
58
-
59
-
60
def language_preprocess(text):
    """Report whether ``text`` is detected as English.

    The verdict is the ``lang`` attribute of the first candidate returned by
    ``detect_langs``.

    Args:
        text: The text to classify. Empty, whitespace-only or ``None`` input
            is rejected without calling the detector.

    Returns:
        True if the first detected language is ``'en'``; False otherwise,
        including when detection fails for any reason.
    """
    try:
        if not text or not text.strip():
            return False
        return detect_langs(text)[0].lang == 'en'
    except Exception:
        # Best-effort filter: text that cannot be classified is treated as
        # non-English. (Exception, not a bare except, so Ctrl-C still works.)
        return False
67
-
68
-
69
def relevant(product, similar_product, content):
    """Ask a Gemini model whether ``content`` relates to both products.

    One of the module-level Gemini clients is picked at random and prompted
    to answer "True" or "False". The verdict is taken from the text of the
    reply: whichever of the words "true" / "false" appears first decides.

    Args:
        product: Name of the main product.
        similar_product: Name of the product the content was fetched for.
        content: Text extracted from the candidate document.

    Returns:
        True only if the model's reply says "True". False if it says
        "False", gives no recognisable verdict, or the call fails.
    """
    prompt = f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}'''

    try:
        model = random.choice([gemini, gemini1, gemini2, gemini3])
        result = model.invoke(prompt)
    except Exception as exc:
        logging.warning("Relevance check failed for %s: %s", similar_product, exc)
        return False

    # bool(result) would test the reply *object*, which is truthy whatever
    # the model answered; the actual verdict lives in the reply text.
    verdict = str(getattr(result, "content", result)).lower()
    true_at = verdict.find("true")
    if true_at == -1:
        return False
    false_at = verdict.find("false")
    return false_at == -1 or true_at < false_at
84
-
85
-
86
-
87
def download_pdf(url, timeout=10):
    """Download ``url`` and wrap the response body in an in-memory stream.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` if the request
        failed or returned an HTTP error status (the error is logged).
    """
    try:
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()
        payload = BytesIO(reply.content)
    except requests.RequestException as e:
        logging.error(f"PDF download error: {e}")
        payload = None
    return payload
96
-
97
def extract_text_from_pdf(pdf_file, pages):
    """Extract the text of selected pages from a PDF.

    Args:
        pdf_file: Source handed to ``PdfReader`` (e.g. the stream returned by
            ``download_pdf``). ``None`` is treated as an unreadable document.
        pages: Iterable of page indices to extract. Indices past the end of
            the document are reported and skipped.

    Returns:
        The text of every requested page that exists, each followed by a
        newline, in the order requested. An empty string if the document is
        missing or cannot be parsed, so callers never receive placeholder
        text in place of real content.
    """
    if pdf_file is None:
        return ""

    try:
        reader = PdfReader(pdf_file)
        page_count = len(reader.pages)
        chunks = []
        for page_num in pages:
            if page_num < page_count:
                # Guard against a None result so one text-less page does not
                # discard the whole document.
                chunks.append((reader.pages[page_num].extract_text() or "") + "\n")
            else:
                print(f"Page {page_num} does not exist in the document.")
        return "".join(chunks)
    except Exception as exc:
        logging.warning("PDF text extraction failed: %s", exc)
        return ""
115
-
116
def extract_text_online(link):
    """Load a web page and return the joined text of its first three chunks.

    Args:
        link: URL handed to ``WebBaseLoader``.

    Returns:
        The ``page_content`` of at most the first three documents produced
        by ``load_and_split()``, concatenated without a separator.
    """
    documents = WebBaseLoader(link).load_and_split()
    return ''.join(doc.page_content for doc in documents[:3])
127
-
128
-
129
def process_link(link, main_product, similar_product):
    """Return ``link`` if its content is English and relevant, else ``None``.

    Links already recorded in the module-level ``seen`` set are skipped.
    Markdown files (``.md``) and hosts starting with ``en.`` are read as web
    pages; every other link is downloaded and parsed as a PDF (pages 0, 2
    and 4). The outcome is printed for each link.

    Args:
        link: URL of the candidate document.
        main_product: Name of the main product.
        similar_product: Name of the product the link was found for.

    Returns:
        ``link`` when the text passes both the language and relevance
        checks; ``None`` when it was seen before, fails a check, or cannot
        be fetched or parsed.
    """
    from urllib.parse import urlparse

    # NOTE(review): this check-then-add is not atomic; filtering() calls this
    # function from several threads, so a duplicate could slip through.
    if link in seen:
        return None
    seen.add(link)

    try:
        # Inspect the parsed hostname rather than a fixed character offset,
        # which only matched URLs starting with exactly "https://".
        host = urlparse(link).hostname or ''
        if link.endswith('.md') or host.startswith('en.'):
            text = extract_text_online(link)
        else:
            pdf_file = download_pdf(link)
            # download_pdf returns None on failure; nothing to parse then.
            text = extract_text_from_pdf(pdf_file, [0, 2, 4]) if pdf_file is not None else ''

        if language_preprocess(text) and relevant(main_product, similar_product, text):
            print("Accepted", link)
            return link
    except Exception as exc:
        # Still best-effort, but record why the link was dropped.
        logging.warning("Failed to process %s: %s", link, exc)

    print("NOT Accepted", link)
    return None
148
-
149
def filtering(urls, main_product, similar_product):
    """Run ``process_link`` over ``urls`` in a thread pool and keep the hits.

    Args:
        urls: Iterable of candidate links.
        main_product: Name of the main product.
        similar_product: Name of the product the links belong to.

    Returns:
        A list of the non-``None`` results of ``process_link``, in the order
        the worker tasks completed.
    """
    print(f"Filtering Links of ---- {similar_product}")
    # Main Preprocess ------------------------------
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(process_link, url, main_product, similar_product)
            for url in urls
        ]
        outcomes = (task.result() for task in concurrent.futures.as_completed(pending))
        accepted = [outcome for outcome in outcomes if outcome is not None]

    return accepted
162
-
163
-
164
- # Main Functions -------------------------------------------------->
165
-
166
- # get_links()
167
- # preprocess()
168
-
 
1
+ import requests
2
+ import json
3
+ import random
4
+ import concurrent.futures
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from langchain_community.document_loaders import PyPDFLoader
7
+ from langdetect import detect_langs
8
+ import requests
9
+ from PyPDF2 import PdfReader
10
+ from io import BytesIO
11
+ from langchain_community.document_loaders import WebBaseLoader
12
+ from langchain_google_genai import ChatGoogleGenerativeAI
13
+ import logging
14
+
15
import os

data = False
# Links already handed to process_link; consulted there so that a link is not
# processed a second time.
seen = set()

# API Urls -----

# main_url = "http://127.0.0.1:5000/search/all"
main_url = "http://127.0.0.1:8000/search/all"
# main_product = "Samsung Galaxy s23 ultra"

# Relevance Checking Models -----
# SECURITY: the API keys that used to be hard-coded here were committed to
# source control and must be treated as compromised (revoke and rotate them).
# Keys are now read from the environment: GOOGLE_API_KEY is the primary key,
# and GOOGLE_API_KEY_1..3 optionally supply extra keys for the other clients
# (each falls back to the primary key when unset).
_GEMINI_MODEL = "gemini-1.0-pro-001"
_PRIMARY_GOOGLE_KEY = os.environ.get("GOOGLE_API_KEY")


def _build_gemini(env_var):
    """Create a Gemini chat client using the key stored in ``env_var``."""
    return ChatGoogleGenerativeAI(
        model=_GEMINI_MODEL,
        google_api_key=os.environ.get(env_var, _PRIMARY_GOOGLE_KEY),
        temperature=0.1,
    )


gemini = _build_gemini("GOOGLE_API_KEY")
gemini1 = _build_gemini("GOOGLE_API_KEY_1")
gemini2 = _build_gemini("GOOGLE_API_KEY_2")
gemini3 = _build_gemini("GOOGLE_API_KEY_3")

# Hugging Face Inference API alternative (disabled). If it is re-enabled,
# supply the token through the HF_API_TOKEN environment variable rather than
# embedding it in the source.
# API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
# headers = {"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}

# Error Debug
logging.basicConfig(level=logging.INFO)
36
+
37
+
38
def get_links(main_product, api_key, timeout=120):
    """Fetch search results for a product and store them in ``data.json``.

    Sends a GET request to ``main_url`` with the API key and product name as
    query parameters. On HTTP 200 the decoded JSON body is written to
    ``data.json`` in the current working directory; for any other status a
    message is printed and the file is left untouched.

    Args:
        main_product: Product name sent as the ``product`` query parameter.
        api_key: Key sent as the ``API_KEY`` query parameter.
        timeout: Seconds to wait for the search service. Without a timeout
            the call could block forever on an unresponsive server.

    Returns:
        None. The results are only persisted to disk.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    params = {
        "API_KEY": str(api_key),
        "product": str(main_product),
    }

    # Flask
    response = requests.get(main_url, params=params, timeout=timeout)

    # FastAPI
    # response = requests.post(main_url, json=params)

    if response.status_code != 200:
        print(f"Failed to fetch results: {response.status_code}")
        return

    # Decode before opening the file so a malformed body cannot leave a
    # truncated data.json behind.
    results = response.json()
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(results, f)
57
+
58
+
59
+
60
def language_preprocess(text):
    """Report whether ``text`` is detected as English.

    The verdict is the ``lang`` attribute of the first candidate returned by
    ``detect_langs``.

    Args:
        text: The text to classify. Empty, whitespace-only or ``None`` input
            is rejected without calling the detector.

    Returns:
        True if the first detected language is ``'en'``; False otherwise,
        including when detection fails for any reason.
    """
    try:
        if not text or not text.strip():
            return False
        return detect_langs(text)[0].lang == 'en'
    except Exception:
        # Best-effort filter: text that cannot be classified is treated as
        # non-English. (Exception, not a bare except, so Ctrl-C still works.)
        return False
67
+
68
+
69
def relevant(product, similar_product, content):
    """Ask a Gemini model whether ``content`` relates to both products.

    One of the module-level Gemini clients is picked at random and prompted
    to answer "True" or "False". The verdict is taken from the text of the
    reply: whichever of the words "true" / "false" appears first decides.

    Args:
        product: Name of the main product.
        similar_product: Name of the product the content was fetched for.
        content: Text extracted from the candidate document.

    Returns:
        True only if the model's reply says "True". False if it says
        "False", gives no recognisable verdict, or the call fails.
    """
    prompt = f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}'''

    try:
        model = random.choice([gemini, gemini1, gemini2, gemini3])
        result = model.invoke(prompt)
    except Exception as exc:
        logging.warning("Relevance check failed for %s: %s", similar_product, exc)
        return False

    # bool(result) would test the reply *object*, which is truthy whatever
    # the model answered; the actual verdict lives in the reply text.
    verdict = str(getattr(result, "content", result)).lower()
    true_at = verdict.find("true")
    if true_at == -1:
        return False
    false_at = verdict.find("false")
    return false_at == -1 or true_at < false_at
84
+
85
+
86
+
87
def download_pdf(url, timeout=10):
    """Download ``url`` and wrap the response body in an in-memory stream.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` if the request
        failed or returned an HTTP error status (the error is logged).
    """
    try:
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()
        payload = BytesIO(reply.content)
    except requests.RequestException as e:
        logging.error(f"PDF download error: {e}")
        payload = None
    return payload
96
+
97
def extract_text_from_pdf(pdf_file, pages):
    """Extract the text of selected pages from a PDF.

    Args:
        pdf_file: Source handed to ``PdfReader`` (e.g. the stream returned by
            ``download_pdf``). ``None`` is treated as an unreadable document.
        pages: Iterable of page indices to extract. Indices past the end of
            the document are reported and skipped.

    Returns:
        The text of every requested page that exists, each followed by a
        newline, in the order requested. An empty string if the document is
        missing or cannot be parsed, so callers never receive placeholder
        text in place of real content.
    """
    if pdf_file is None:
        return ""

    try:
        reader = PdfReader(pdf_file)
        page_count = len(reader.pages)
        chunks = []
        for page_num in pages:
            if page_num < page_count:
                # Guard against a None result so one text-less page does not
                # discard the whole document.
                chunks.append((reader.pages[page_num].extract_text() or "") + "\n")
            else:
                print(f"Page {page_num} does not exist in the document.")
        return "".join(chunks)
    except Exception as exc:
        logging.warning("PDF text extraction failed: %s", exc)
        return ""
115
+
116
def extract_text_online(link):
    """Load a web page and return the joined text of its first three chunks.

    Args:
        link: URL handed to ``WebBaseLoader``.

    Returns:
        The ``page_content`` of at most the first three documents produced
        by ``load_and_split()``, concatenated without a separator.
    """
    documents = WebBaseLoader(link).load_and_split()
    return ''.join(doc.page_content for doc in documents[:3])
127
+
128
+
129
def process_link(link, main_product, similar_product):
    """Return ``link`` if its content is English and relevant, else ``None``.

    Links already recorded in the module-level ``seen`` set are skipped.
    Markdown files (``.md``) and hosts starting with ``en.`` are read as web
    pages; every other link is downloaded and parsed as a PDF (pages 0, 2
    and 4). The outcome is printed for each link.

    Args:
        link: URL of the candidate document.
        main_product: Name of the main product.
        similar_product: Name of the product the link was found for.

    Returns:
        ``link`` when the text passes both the language and relevance
        checks; ``None`` when it was seen before, fails a check, or cannot
        be fetched or parsed.
    """
    from urllib.parse import urlparse

    # NOTE(review): this check-then-add is not atomic; filtering() calls this
    # function from several threads, so a duplicate could slip through.
    if link in seen:
        return None
    seen.add(link)

    try:
        # Inspect the parsed hostname rather than a fixed character offset,
        # which only matched URLs starting with exactly "https://".
        host = urlparse(link).hostname or ''
        if link.endswith('.md') or host.startswith('en.'):
            text = extract_text_online(link)
        else:
            pdf_file = download_pdf(link)
            # download_pdf returns None on failure; nothing to parse then.
            text = extract_text_from_pdf(pdf_file, [0, 2, 4]) if pdf_file is not None else ''

        if language_preprocess(text) and relevant(main_product, similar_product, text):
            print("Accepted", link)
            return link
    except Exception as exc:
        # Still best-effort, but record why the link was dropped.
        logging.warning("Failed to process %s: %s", link, exc)

    print("NOT Accepted", link)
    return None
148
+
149
def filtering(urls, main_product, similar_product):
    """Run ``process_link`` over ``urls`` in a thread pool and keep the hits.

    Args:
        urls: Iterable of candidate links.
        main_product: Name of the main product.
        similar_product: Name of the product the links belong to.

    Returns:
        A list of the non-``None`` results of ``process_link``, in the order
        the worker tasks completed.
    """
    print(f"Filtering Links of ---- {similar_product}")
    # Main Preprocess ------------------------------
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(process_link, url, main_product, similar_product)
            for url in urls
        ]
        outcomes = (task.result() for task in concurrent.futures.as_completed(pending))
        accepted = [outcome for outcome in outcomes if outcome is not None]

    return accepted
162
+
163
+
164
+ # Main Functions -------------------------------------------------->
165
+
166
+ # get_links()
167
+ # preprocess()
168
+