kensvin committed on
Commit
470bdb8
1 Parent(s): 6289423
Files changed (1)
  1. app.py +88 -111
app.py CHANGED
@@ -1,12 +1,8 @@
 import spaces
 import os
-os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"
-
 from dotenv import load_dotenv
-load_dotenv()
-
 import re
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import urlparse
 import pandas as pd
 import unicodedata as uni
 import emoji
@@ -16,15 +12,15 @@ from langchain_community.document_loaders import DataFrameLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain.chains import RetrievalQA
-# from tokopedia import request_product_id, request_product_review
 import gradio as gr
+import logging
+import requests
 
-shop_id = ""
-item_id = ""
-item = {}
-LIMIT = 1000  # Limit to 1000 reviews so that processing does not take too long
+# Load environment variables
+load_dotenv()
 
-import logging
+# Set command line arguments for Gradio
+os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"
 
 # Configure logging
 logging.basicConfig(
@@ -32,13 +28,21 @@ logging.basicConfig(
     format="%(asctime)s [%(levelname)s] %(message)s",
     handlers=[logging.StreamHandler()],
 )
-
 logger = logging.getLogger(__name__)
 
-
-import requests
+# Constants
+LIMIT = 1000  # Limit to 1000 reviews to avoid long processing times
+OpenAIModel = "gpt-3.5-turbo"
+shop_id = ""
+item_id = ""
+item = {}
+cache_URL = ""
+db = None
+qa = None
+cache = {}
 
 
+# Function to request product ID from Tokopedia
 def request_product_id(shop_domain, product_key):
     ENDPOINT = "https://gql.tokopedia.com/graphql/PDPGetLayoutQuery"
     payload = {
@@ -81,30 +85,22 @@ def request_product_id(shop_domain, product_key):
         }
         """,
     }
-
     headers = {
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
         "Referer": "https://www.tokopedia.com",
         "X-TKPD-AKAMAI": "pdpGetLayout",
     }
-
     try:
-        response = requests.request(
-            method="POST",
-            url=ENDPOINT,
-            json=payload,
-            headers=headers,
-            timeout=60
-        )
-        response.raise_for_status()  # Raise an exception for non-2xx status codes
+        response = requests.post(ENDPOINT, json=payload, headers=headers, timeout=60)
+        response.raise_for_status()
         logger.info(f"Request successful. Status code: {response.status_code}")
-        # Process the response data
+        return response
     except requests.exceptions.RequestException as e:
         logger.error(f"Request failed: {e}")
-    else:
-        return response
+        return None
 
 
+# Function to request product reviews from Tokopedia
 def request_product_review(product_id, page=1, limit=20):
     ENDPOINT = "https://gql.tokopedia.com/graphql/productReviewList"
     payload = {
@@ -168,40 +164,36 @@ def request_product_review(product_id, page=1, limit=20):
         }
         """,
     }
-
     headers = {
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
         "Referer": "https://www.tokopedia.com",
         "X-TKPD-AKAMAI": "productReviewList",
     }
-
     try:
-        response = requests.request(
-            method="POST", url=ENDPOINT, json=payload, headers=headers, timeout=60
-        )
-        response.raise_for_status()  # Raise an exception for non-2xx status codes
+        response = requests.post(ENDPOINT, json=payload, headers=headers, timeout=60)
+        response.raise_for_status()
         logger.info(f"Request successful. Status code: {response.status_code}")
-        # Process the response data
+        return response
     except requests.exceptions.RequestException as e:
         logger.error(f"Request failed: {e}")
-    else:
-        return response
+        return None
 
 
+# Function to scrape reviews for a product
 def scrape(product_id, max_reviews=LIMIT):
     all_reviews = []
     page = 1
     has_next = True
-
     logger.info("Extracting product reviews...")
-    # while has_next and len(all_reviews) <= max_reviews:
-    response = request_product_review(product_id, page=page)
-    data = response.json()["data"]["productrevGetProductReviewList"]
-    reviews = data["list"]
-    all_reviews.extend(reviews)
-    has_next = data["hasNext"]
-    page += 1
-
+    while has_next and len(all_reviews) < max_reviews:
+        response = request_product_review(product_id, page=page)
+        if not response:
+            break
+        data = response.json()["data"]["productrevGetProductReviewList"]
+        reviews = data["list"]
+        all_reviews.extend(reviews)
+        has_next = data["hasNext"]
+        page += 1
     reviews_df = pd.json_normalize(all_reviews)
     reviews_df.rename(columns={"message": "comment"}, inplace=True)
     reviews_df = reviews_df[["comment"]]
@@ -209,97 +201,82 @@ def scrape(product_id, max_reviews=LIMIT):
     return reviews_df
 
 
+# Function to extract product ID from URL
 def get_product_id(URL):
     parsed_url = urlparse(URL)
-    *_, SHOP, PRODUCT_KEY = parsed_url.path.split("/")
-    product_id = request_product_id(SHOP, PRODUCT_KEY).json()["data"]["pdpGetLayout"][
-        "basicInfo"
-    ]["id"]
-    logger.info(product_id)
-    return product_id
+    _, shop, product_key = parsed_url.path.strip("/").split("/")
+    response = request_product_id(shop, product_key)
+    if response:
+        product_id = response.json()["data"]["pdpGetLayout"]["basicInfo"]["id"]
+        logger.info(f"Product ID: {product_id}")
+        return product_id
+    else:
+        logger.error("Failed to get product ID")
+        return None
 
 
-# Clean
+# Function to clean the reviews DataFrame
 def clean(df):
-    df = df.dropna().copy().reset_index(drop=True)  # drop reviews with empty comments
-    df = df[df["comment"] != ""].reset_index(drop=True)  # remove empty reviews
-    df["comment"] = df["comment"].apply(lambda x: clean_text(x))  # clean text
-    df = df[df["comment"] != ""].reset_index(drop=True)  # remove empty reviews
-    logger.info("cleaned")
+    df = df.dropna().copy().reset_index(drop=True)  # Drop reviews with empty comments
+    df = df[df["comment"] != ""].reset_index(drop=True)  # Remove empty reviews
+    df["comment"] = df["comment"].apply(lambda x: clean_text(x))  # Clean text
+    df = df[df["comment"] != ""].reset_index(drop=True)  # Remove empty reviews
+    logger.info("Cleaned reviews DataFrame")
     return df
 
 
+# Function to clean individual text entries
 def clean_text(text):
-    text = uni.normalize("NFKD", text)  # normalise characters
-    text = emoji.replace_emoji(text, "")  # remove emoji
-    text = re.sub(r"(\w)\1{2,}", r"\1", text)  # repeated chars
-    text = re.sub(r"[ ]+", " ", text).strip()  # remove extra spaces
+    text = uni.normalize("NFKD", text)  # Normalize characters
+    text = emoji.replace_emoji(text, "")  # Remove emoji
+    text = re.sub(r"(\w)\1{2,}", r"\1", text)  # Remove repeated characters
+    text = re.sub(r"[ ]+", " ", text).strip()  # Remove extra spaces
    return text
 
 
-# LLM
-OpenAIModel = "gpt-3.5-turbo"
+# Initialize LLM and embeddings
 llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)
-
-# Embeddings
 embeddings = HuggingFaceEmbeddings(model_name="LazarusNLP/all-indobert-base-v2")
 
-cache_URL = ""
-db = None
-qa = None
-cache = {}
-
 
+# Function to generate a summary or answer based on reviews
 @spaces.GPU
 async def generate(URL, query):
     global cache_URL, db, qa, cache
 
-    if URL == "" or query == "":
+    if not URL or not query:
         return "Input kosong"
-    else:
-        try:
-            product_id = get_product_id(URL)
-
-            if URL not in cache:
-                # Get reviews
-                try:
-                    reviews = scrape(product_id)
-                    # Clean reviews
-                    cleaned_reviews = clean(reviews)
-                    # Load data
-                    loader = DataFrameLoader(
-                        cleaned_reviews, page_content_column="comment"
-                    )
-                    documents = loader.load()
-                except Exception as e:
-                    return "Error getting reviews: " + str(e)
-                else:
-                    # Split text
-                    text_splitter = RecursiveCharacterTextSplitter(
-                        chunk_size=1000, chunk_overlap=50
-                    )
-                    docs = text_splitter.split_documents(documents)
-
-                    # Vector store
-                    db = FAISS.from_documents(docs, embeddings)
-
-                    # Store in cache
-                    cache[URL] = (docs, db)
-
-            # Retrieve from cache
+    try:
+        product_id = get_product_id(URL)
+        if not product_id:
+            return "Gagal mendapatkan product ID"
+
+        if URL not in cache:
+            reviews = scrape(product_id)
+            if reviews.empty:
+                return "Tidak ada ulasan ditemukan"
+
+            cleaned_reviews = clean(reviews)
+            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
+            documents = loader.load()
+            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=50
            )
+            docs = text_splitter.split_documents(documents)
+            db = FAISS.from_documents(docs, embeddings)
+            cache[URL] = (docs, db)
+        else:
             docs, db = cache[URL]
 
-            # Chain to answer questions
-            qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
-            res = await qa.ainvoke(query)
-
-            # Process result
-            return res["result"]
-        except:
-            return "Gagal mendapatkan review dari URL"
+        qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
+        res = await qa.ainvoke(query)
+        return res["result"]
+    except Exception as e:
+        logger.error(f"Error in generating response: {e}")
+        return "Gagal mendapatkan review dari URL"
 
 
-# Gradio
+# Set up Gradio interface
 product_box = gr.Textbox(label="URL Produk", placeholder="URL produk dari Tokopedia")
 query_box = gr.Textbox(
     lines=2,
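
For reference, a minimal sketch of how the refactored helpers compose after this commit. This is hypothetical usage, not part of the diff: the placeholder URL and example query are invented, and it assumes app.py's module-level setup (LLM and embeddings) can load in your environment.

# Hypothetical smoke test; uses only functions defined in app.py above.
import asyncio

from app import generate, get_product_id, scrape

URL = "https://www.tokopedia.com/<shop-domain>/<product-key>"  # placeholder

product_id = get_product_id(URL)  # now returns None on a failed request instead of raising
if product_id:
    reviews = scrape(product_id)  # paginates until hasNext is False or LIMIT is reached
    print(f"Fetched {len(reviews)} reviews")
    print(asyncio.run(generate(URL, "Bagaimana kualitas produk ini?")))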