Prathmesh48 committed on
Commit
73f4358
1 Parent(s): fa1e477

Update app.py

Files changed (1)
  1. app.py +172 -327
app.py CHANGED
@@ -1,327 +1,172 @@
- from flask import Flask, request, jsonify, render_template
- import requests
- from bs4 import BeautifulSoup
- from googlesearch import search
- from duckduckgo_search import DDGS
- import concurrent.futures
- import re
-
-
- app = Flask(__name__)
-
- API_KEY_DEFAULT = '12345'
-
- # Function to search DuckDuckGo
- def duckduckgo_search(query):
-     try:
-         results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
-         return [res['href'] for res in results]
-     except:
-         return []
-
- # Function to search Google
- def google_search(query):
-
-     links = []
-     try:
-         api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
-         search_engine_id = 'c4ca951b9fc6949cb'
-
-         url = f"https://www.googleapis.com/customsearch/v1"
-         params = {
-             "key": api_key,
-             "cx": search_engine_id,
-             "q": query + " manual filetype:pdf"
-         }
-
-         response = requests.get(url, params=params)
-         results = response.json()
-
-         for item in results.get('items', []):
-             links.append(item['link'])
-     except:
-         pass
-
-     try:
-         extension = "ext:pdf"
-         for result in search(query + " manual " + extension, num_results=5):
-             if result.endswith('.pdf'):
-                 links.append(result)
-     except:
-         pass
-
-
-     return links
-
- # Function to search Internet Archive
- def archive_search(query):
-
-     try:
-         url = "https://archive.org/advancedsearch.php"
-         params = {
-             'q': f'{query} manual',
-             'fl[]': ['identifier', 'title', 'format'],
-             'rows': 50,
-             'page': 1,
-             'output': 'json'
-         }
-
-         # Make the request
-         response = requests.get(url, params=params)
-         data = response.json()
-
-         # Function to extract hyperlinks from a webpage
-         def extract_hyperlinks(url):
-             # Send a GET request to the URL
-             response = requests.get(url)
-
-             # Check if the request was successful
-             if response.status_code == 200:
-                 # Parse the HTML content of the page
-                 soup = BeautifulSoup(response.text, 'html.parser')
-
-                 # Find all <a> tags (hyperlinks)
-                 for link in soup.find_all('a', href=True):
-                     href = link['href']
-                     if href.endswith('.pdf'):
-                         pdf_files.append(url+'/'+href)
-                     if href.endswith('.iso'):
-                         # If the link ends with .iso, follow the link and extract .pdf hyperlinks
-                         extract_pdf_from_iso(url+'/'+href+'/')
-
-         # Function to extract .pdf hyperlinks from an .iso file
-         def extract_pdf_from_iso(iso_url):
-             # Send a GET request to the ISO URL
-             iso_response = requests.get(iso_url)
-
-             # Check if the request was successful
-             if iso_response.status_code == 200:
-                 # Parse the HTML content of the ISO page
-                 iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
-
-                 # Find all <a> tags (hyperlinks) in the ISO page
-                 for link in iso_soup.find_all('a', href=True):
-                     href = link['href']
-                     if href.endswith('.pdf'):
-                         pdf_files.append('https:'+href)
-
-         pdf_files = []
-
-         def process_doc(doc):
-             identifier = doc.get('identifier', 'N/A')
-             # title = doc.get('title', 'N/A')
-             # format = doc.get('format', 'N/A')
-             pdf_link = f"https://archive.org/download/{identifier}"
-             extract_hyperlinks(pdf_link)
-
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
-
-             # Optionally, wait for all futures to complete and handle any exceptions
-             for future in concurrent.futures.as_completed(futures):
-                 try:
-                     future.result()  # This will raise an exception if the function call raised
-                 except Exception as exc:
-                     print(f'Generated an exception: {exc}')
-
-
-         return pdf_files
-
-     except:
-         return []
-
- def github_search(query):
-
-     try:
-         # GitHub Search API endpoint
-         url = f"https://api.github.com/search/code?q={query}+extension:md"
-
-         headers = {
-             'Authorization': 'Token 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'
-         }
-
-         # Make the request
-         response = requests.get(url, headers=headers)
-         data = response.json()
-         links = [item['html_url'].replace('/blob','').replace('//github','//raw.github') for item in data['items']]
-
-         return links
-
-     except:
-         return []
-
-
- # Similarity Check
-
- def extract_similar_products(query):
-     results = DDGS().chat(f'{query} Similar Products')
-
-     pattern = r'^\d+\.\s(.+)$'
-     matches = re.findall(pattern, results, re.MULTILINE)
-     matches = [item.split(': ')[0] for item in matches]
-     print(matches)
-
-     return matches[:5] if matches else []
-
-
- # Define API routes -------------------------------------------------------
-
- @app.route('/')
- def home():
-     return render_template('index.html')
-
-
- @app.route('/search/google', methods=['GET','POST'])
- def search_google():
-
-     if request.method == 'POST':
-         data = request.get_json()
-         api_key = data.get('API_KEY')
-         product = data.get('product')
-     else:
-         product = request.args.get('product')
-         api_key = request.args.get('API_KEY')
-
-     similar_products = extract_similar_products(product)
-     if api_key == API_KEY_DEFAULT:
-         results = {product: google_search(product)}
-         for p in similar_products:
-             results[p] = google_search(p)
-         return jsonify(results)
-
-     else:
-         return jsonify({'error': 'Invalid API key'}), 401
-
- @app.route('/search/duckduckgo', methods=['GET','POST'])
- def search_duckduckgo():
-
-     if request.method == 'POST':
-         data = request.get_json()
-         api_key = data.get('API_KEY')
-         product = data.get('product')
-     else:
-         product = request.args.get('product')
-         api_key = request.args.get('API_KEY')
-
-     similar_products = extract_similar_products(product)
-
-
-     if api_key == API_KEY_DEFAULT:
-         results = {product: duckduckgo_search(product)}
-         for p in similar_products:
-             results[p] = duckduckgo_search(p)
-         return jsonify(results)
-     else:
-         return jsonify({'error': 'Invalid API key'}), 401
-
-
- @app.route('/search/archive', methods=['GET','POST'])
- def search_archive():
-
-     if request.method == 'POST':
-         data = request.get_json()
-         api_key = data.get('API_KEY')
-         product = data.get('product')
-     else:
-         product = request.args.get('product')
-         api_key = request.args.get('API_KEY')
-
-     # Retrieve custom headers if any
-
-     similar_products = extract_similar_products(product)
-
-     if api_key == API_KEY_DEFAULT:
-         results = {product: archive_search(product)}
-
-         def process_product(product):
-             return product, archive_search(product)
-
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             # Map the process_product function to similar_products
-             future_to_product = {executor.submit(process_product, p): p for p in similar_products}
-
-             # Collect results as they complete
-             for future in concurrent.futures.as_completed(future_to_product):
-                 product, result = future.result()
-                 results[product] = result
-
-         return jsonify(results)
-
-     else:
-         return jsonify({'error': 'Invalid API key'}), 401
-
-
- @app.route('/search/github', methods=['GET','POST'])
- def search_github():
-
-     if request.method == 'POST':
-         data = request.get_json()
-         api_key = data.get('API_KEY')
-         product = data.get('product')
-     else:
-         product = request.args.get('product')
-         api_key = request.args.get('API_KEY')
-
-     similar_products = extract_similar_products(product)
-
-     if api_key == API_KEY_DEFAULT:
-         results = {product: github_search(product)}
-         for p in similar_products:
-             results[p] = github_search(p)
-         return jsonify(results)
-
-     else:
-         return jsonify({'error': 'Invalid API key'}), 401
-
-
- @app.route('/search/all', methods=['GET','POST'])
- def search_all():
-
-     if request.method == 'POST':
-         data = request.get_json()
-         api_key = data.get('API_KEY')
-         product = data.get('product')
-     else:
-         product = request.args.get('product')
-         api_key = request.args.get('API_KEY')
-
-     similar_products = extract_similar_products(product)
-
-
-     if api_key == API_KEY_DEFAULT:
-
-         results = {
-             product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
-         }
-
-         def search_product(p):
-             return {
-                 'product': p,
-                 'duckduckgo': duckduckgo_search(p),
-                 'google': google_search(p),
-                 'github': github_search(p),
-                 'archive': archive_search(p)
-             }
-
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             future_to_product = {executor.submit(search_product, p): p for p in similar_products}
-
-             for future in concurrent.futures.as_completed(future_to_product):
-                 result = future.result()
-                 product = result['product']
-                 results[product] = [
-                     {'duckduckgo': result['duckduckgo']},
-                     {'google': result['google']},
-                     {'github': result['github']},
-                     {'archive': result['archive']}
-                 ]
-
-         return jsonify(results)
-
-     else:
-         return jsonify({'error': 'Invalid API key'}), 401
-
- # Run the Flask app
- if __name__ == '__main__':
-     app.run(debug=True)
-
 
+ # file: app.py
+
+ import gradio as gr
+ import requests
+ import json
+ import concurrent.futures
+ from concurrent.futures import ThreadPoolExecutor
+ from langchain_community.document_loaders import PyPDFLoader
+ from langdetect import detect_langs
+ from PyPDF2 import PdfReader
+ from io import BytesIO
+ import logging
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+ data = False
+ seen = set()
+
+ main_url = "https://similar-products-api.vercel.app/search/all"
+ main_product = "Samsung Galaxy"
+
+ API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
+ headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
+
+ logging.basicConfig(level=logging.INFO)
+
+ def get_links(product):
+     params = {
+         "API_KEY": "12345",
+         "product": f"{product}",
+     }
+     response = requests.get(main_url, params=params)
+     if response.status_code == 200:
+         results = response.json()
+         return results
+     else:
+         return {}
+
+ def language_preprocess(text):
+     try:
+         if detect_langs(text)[0].lang == 'en':
+             return True
+         return False
+     except Exception as e:
+         logging.error(f"Language detection error: {e}")
+         return False
+
+ def relevant(product, similar_product, content):
+     try:
+         payload = {"inputs": f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content[:700]}'''}
+         response = requests.post(API_URL, headers=headers, json=payload)
+         output = response.json()
+         # Parse the model's True/False answer; bool() on a non-empty string is always True.
+         return 'true' in output[0]['generated_text'].strip().lower()
+     except Exception as e:
+         logging.error(f"Relevance checking error: {e}")
+         return False
+
+ def download_pdf(url, timeout=10):
+     try:
+         response = requests.get(url, timeout=timeout)
+         response.raise_for_status()
+         return BytesIO(response.content)
+     except requests.RequestException as e:
+         logging.error(f"PDF download error: {e}")
+         return None
+
+ def extract_text_from_pages(pdf_file, pages):
+     reader = PdfReader(pdf_file)
+     extracted_text = ""
+     try:
+         for page_num in pages:
+             if page_num < len(reader.pages):
+                 page = reader.pages[page_num]
+                 extracted_text += (page.extract_text() or "") + "\n"
+             else:
+                 logging.warning(f"Page {page_num} does not exist in the document.")
+         return extracted_text
+     except Exception as e:
+         logging.error(f"PDF text extraction error: {e}")
+         return ""  # empty text fails the downstream language check, so the link is skipped
+
+ def process_link(link, similar_product):
+     if link in seen:
+         return None
+     seen.add(link)
+     try:
+         pdf_file = download_pdf(link)
+         if pdf_file:
+             text = extract_text_from_pages(pdf_file, [0, 2, 4])
+             if language_preprocess(text):
+                 if relevant(main_product, similar_product, text):
+                     return link
+     except Exception as e:
+         logging.error(f"Error processing link: {e}")
+     return None
+
+ def filtering(urls, similar_product):
+     res = []
+     with ThreadPoolExecutor() as executor:
+         futures = {executor.submit(process_link, link, similar_product): link for link in urls}
+         for future in concurrent.futures.as_completed(futures):
+             result = future.result()
+             if result is not None:
+                 res.append(result)
+     return res
+
+ def wikipedia_url(product):
+     api_url = "https://en.wikipedia.org/w/api.php"
+     params = {
+         "action": "opensearch",
+         "search": product,
+         "limit": 5,
+         "namespace": 0,
+         "format": "json"
+     }
+     try:
+         response = requests.get(api_url, params=params)
+         response.raise_for_status()
+         data = response.json()
+         if data and len(data) > 3 and len(data[3]) > 0:
+             return data[3]
+         else:
+             return []
+     except requests.RequestException as e:
+         logging.error(f"Error fetching Wikipedia URLs: {e}")
+         return []
+
+ def preprocess_initial(product):
+     return get_links(product)
+
+ def preprocess_filter(product, data):
+     for similar_product in data:
+         # if similar_product != product:
+         if list(data[similar_product][0])[0] == 'duckduckgo':
+             s = set(('duckduckgo', 'google', 'archive'))
+             temp = []
+
+             for idx, item in enumerate(data[similar_product]):
+                 if list(item)[0] in s:
+                     urls = data[similar_product][idx][list(item)[0]]
+                     temp += filtering(urls, similar_product)
+                 else:
+                     temp += data[similar_product][idx][list(item)[0]]
+
+             data[similar_product] = temp
+             data[similar_product] += wikipedia_url(similar_product)
+         else:
+             urls = data[similar_product]
+             data[similar_product] = filtering(urls, similar_product)
+             data[similar_product] += wikipedia_url(similar_product)
+     logging.info('Filtering completed')
+     return data
+
+ def main(product_name):
+     return preprocess_initial(product_name)
+
+ def filter_links(product_name, initial_data):
+     return preprocess_filter(product_name, initial_data)
+
+ with gr.Blocks() as demo:
+     product_name = gr.Textbox(label="Product Name")
+     get_links_btn = gr.Button("Get Links")
+     initial_links_output = gr.JSON()
+     filter_btn = gr.Button("Filter Links")
+     filtered_links_output = gr.JSON()
+
+     get_links_btn.click(fn=main, inputs=product_name, outputs=initial_links_output)
+     filter_btn.click(fn=filter_links, inputs=[product_name, initial_links_output], outputs=filtered_links_output)
+
+ if __name__ == "__main__":
+     demo.launch()
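
For reference, a minimal sketch of driving the new pipeline from a script instead of the Gradio UI. This is a sketch under assumptions: the file above is saved as app.py, its dependencies (gradio, requests, langdetect, PyPDF2, python-dotenv, langchain-community) are installed, HUGGINGFACE_API_TOKEN is set in the environment, and `usage_sketch.py` is a hypothetical file name that is not part of this commit.

# usage_sketch.py -- hypothetical helper, not part of the commit.
# Runs the two steps wired to the Gradio buttons above: main() fetches raw links
# from the similar-products API, filter_links() keeps English, relevant PDFs and
# appends Wikipedia pages for each similar product.
import json

import app  # the app.py added in this commit; importing builds the Blocks UI but does not launch it

if __name__ == "__main__":
    product = "Samsung Galaxy"  # sample product; note relevance is still checked against app.main_product
    initial = app.main(product)                    # step 1: raw links per similar product
    filtered = app.filter_links(product, initial)  # step 2: filtered links
    print(json.dumps(filtered, indent=2))

The two-step split mirrors the UI: the first call only hits the similar-products endpoint, so the slower PDF download and relevance filtering run only when the second step is invoked.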