kensvin committed
Commit 3885d1d
1 Parent(s): 8464276
Files changed (1)
  1. app.py +63 -53
app.py CHANGED
@@ -20,30 +20,34 @@ item_id = ""
 item = {}
 LIMIT = 1000 # Limit to 1000 reviews so that processing does not take too long
 
-def scrape(URL, max_reviews=LIMIT):
-    try:
-        parsed_url = urlparse(URL)
-        *_, SHOP, PRODUCT_KEY = parsed_url.path.split("/")
-        product_id = request_product_id(SHOP, PRODUCT_KEY).json()["data"]["pdpGetLayout"][
-            "basicInfo"
-        ]["id"]
-    except:
-        return "Invalid URL"
-    else:
-        all_reviews = []
-        page = 1
-        has_next = True
-
-        while has_next and len(all_reviews) <= max_reviews:
-            response = request_product_review(product_id, page=page)
-            data = response.json()["data"]["productrevGetProductReviewList"]
-            reviews = data["list"]
-            all_reviews.extend(reviews)
-            has_next = data["hasNext"]
-            page += 1
-
-        reviews_df = pd.json_normalize(all_reviews)
-        return reviews_df
+
+def scrape(product_id, max_reviews=LIMIT):
+    all_reviews = []
+    page = 1
+    has_next = True
+
+    while has_next and len(all_reviews) <= max_reviews:
+        response = request_product_review(product_id, page=page)
+        data = response.json()["data"]["productrevGetProductReviewList"]
+        reviews = data["list"]
+        all_reviews.extend(reviews)
+        has_next = data["hasNext"]
+        page += 1
+
+    reviews_df = pd.json_normalize(all_reviews)
+    reviews_df.rename(columns={"message": "comment"}, inplace=True)
+    reviews_df = reviews_df[["comment"]]
+    return reviews_df
+
+
+def get_product_id(URL):
+    parsed_url = urlparse(URL)
+    *_, SHOP, PRODUCT_KEY = parsed_url.path.split("/")
+    product_id = request_product_id(SHOP, PRODUCT_KEY).json()["data"]["pdpGetLayout"][
+        "basicInfo"
+    ]["id"]
+    return product_id
+
 
 # Clean
 def clean(df):
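For orientation, a minimal sketch of how the two helpers introduced above would presumably be used together; the example URL is made up, and the request_* wrappers are defined elsewhere in app.py:

    # Illustrative usage only; the URL below is a placeholder, not a real product.
    url = "https://www.tokopedia.com/example-shop/example-product"
    product_id = get_product_id(url)                  # resolve the numeric product id from the URL
    reviews_df = scrape(product_id, max_reviews=100)  # paginate reviews into a one-column DataFrame
    print(reviews_df["comment"].head())               # scrape() keeps only the "comment" column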
@@ -74,41 +78,48 @@ db = None
 qa = None
 
 
-def generate(URL, query):
+async def generate(URL, query):
     global cache_URL, db, qa
-    if URL != cache_URL:
-        # Get reviews
+    if URL == "" or query == "":
+        return "Empty input"
+    else:
         try:
-            reviews = scrape(URL)
-            if (reviews is None) or (len(reviews) == 0):
-                return "No reviews found"
-            elif reviews == "Invalid URL":
-                return "Invalid URL"
-            # Clean reviews
-            cleaned_reviews = clean(reviews)
-            # Load data
-            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
-            documents = loader.load()
-        except Exception as e:
-            return "Error getting reviews: " + str(e)
-        else:
-            # Split text
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000, chunk_overlap=50
-            )
-            docs = text_splitter.split_documents(documents)
-            cache_URL = URL
-            # Vector store
-            db = FAISS.from_documents(docs, embeddings)
-            # Chain to answer questions
-            qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
-    return qa.run(query)
+            product_id = get_product_id(URL)
+            if URL != cache_URL:
+                # Get reviews
+                try:
+                    reviews = scrape(product_id)
+                    # Clean reviews
+                    cleaned_reviews = clean(reviews)
+                    # Load data
+                    loader = DataFrameLoader(
+                        cleaned_reviews, page_content_column="comment"
+                    )
+                    documents = loader.load()
+                except Exception as e:
+                    return "Error getting reviews: " + str(e)
+                else:
+                    # Split text
+                    text_splitter = RecursiveCharacterTextSplitter(
+                        chunk_size=1000, chunk_overlap=50
+                    )
+                    docs = text_splitter.split_documents(documents)
+                    cache_URL = URL
+                    # Vector store
+                    db = FAISS.from_documents(docs, embeddings)
+                    # Chain to answer questions
+                    qa = RetrievalQA.from_chain_type(
+                        llm=llm, retriever=db.as_retriever()
+                    )
+            res = await qa.ainvoke(query)
+            # Process result
+            return res["result"]
+        except:
+            return "URL tidak valid"
 
 
 # Gradio
-product_box = gr.Textbox(
-    label="URL Produk", placeholder="URL produk dari Tokopedia"
-)
+product_box = gr.Textbox(label="URL Produk", placeholder="URL produk dari Tokopedia")
 query_box = gr.Textbox(
     lines=2,
     label="Kueri",
@@ -122,5 +133,4 @@ gr.Interface(
     title="RingkasUlas",
     description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia Indonesia (https://tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
     allow_flagging="never",
-
 ).launch()
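Only the title, description, and allow_flagging arguments of gr.Interface appear in the last hunk, so the rest of the wiring is outside this diff; a minimal sketch of how the async handler and the two textboxes would plausibly be connected (the output component is an assumption):

    # Assumed wiring; fn/inputs/outputs are not shown in this commit's hunks.
    gr.Interface(
        fn=generate,                          # Gradio accepts async functions as handlers
        inputs=[product_box, query_box],      # product URL and user query
        outputs=gr.Textbox(label="Jawaban"),  # hypothetical output textbox
        title="RingkasUlas",
        allow_flagging="never",
    ).launch()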
 