File size: 9,854 Bytes
fb6b415
1ee2db4
 
fb6b415
9cb4e8f
 
 
286516b
 
 
 
 
c66df08
27e0ec6
 
286516b
27e0ec6
286516b
f87f358
286516b
 
 
 
 
 
 
d050f83
 
 
 
a33de26
d050f83
 
 
 
 
 
36ab1a9
f87f358
 
 
 
95f9443
f87f358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95f9443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f87f358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95f9443
 
 
 
 
 
 
 
 
 
 
f87f358
 
3885d1d
 
 
 
 
2577f1f
f964a19
 
 
 
 
 
 
3885d1d
 
 
 
d050f83
3885d1d
 
 
 
 
 
 
 
 
d050f83
3885d1d
36ab1a9
286516b
 
 
 
 
 
 
d050f83
286516b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333b8cd
286516b
 
 
 
1977bde
286516b
 
fb6b415
3885d1d
1977bde
 
3885d1d
c034fa1
3885d1d
286516b
3885d1d
1977bde
 
3885d1d
 
5425a02
3885d1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1977bde
3885d1d
 
1977bde
 
 
 
 
 
 
 
 
 
 
 
 
3885d1d
c034fa1
286516b
 
 
3885d1d
286516b
 
 
 
 
 
 
 
 
8464276
286516b
 
7f75ebb
1ee2db4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import spaces
import os
os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"

from dotenv import load_dotenv
load_dotenv()

import re
from urllib.parse import urlparse, parse_qs
import pandas as pd
import unicodedata as uni
import emoji
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
# from tokopedia import request_product_id, request_product_review
import gradio as gr

# NOTE(review): shop_id, item_id and item are not referenced anywhere else in
# the visible part of this file - presumably leftovers from an earlier
# version; confirm before removing.
shop_id = ""
item_id = ""
item = {}
# Default value of scrape()'s max_reviews parameter.
LIMIT = 1000  # Limit to 1000 reviews so that processing does not take too long

import logging

# Configure the root logger: records at DEBUG level and above are emitted,
# prefixed with a timestamp and the level name, through a single StreamHandler.
# NOTE(review): because this sets DEBUG on the root logger, debug output from
# third-party libraries that propagate to root will presumably be printed as
# well - confirm this verbosity is intended outside of development.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)

# Module-level logger used by the request, scraping and cleaning helpers below.
logger = logging.getLogger(__name__)


import requests


def request_product_id(shop_domain, product_key):
    """Send the ``PDPGetLayoutQuery`` GraphQL request for one product page.

    Args:
        shop_domain: Value sent as the ``shopDomain`` query variable.
        product_key: Value sent as the ``productKey`` query variable.

    Returns:
        The ``requests`` response object when the POST completes and
        ``raise_for_status()`` does not raise. ``None`` when a
        ``requests.exceptions.RequestException`` occurred; the failure is
        logged and not re-raised.
    """
    endpoint_url = "https://gql.tokopedia.com/graphql/PDPGetLayoutQuery"

    # The query text is sent verbatim, so its whitespace is left untouched.
    graphql_query = """fragment ProductVariant on pdpDataProductVariant {
                        errorCode
                        parentID
                        defaultChild
                        children {
                            productID
                        }
                        __typename
                        }

                        query PDPGetLayoutQuery($shopDomain: String, $productKey: String, $layoutID: String, $apiVersion: Float, $userLocation: pdpUserLocation, $extParam: String, $tokonow: pdpTokoNow, $deviceID: String) {
                        pdpGetLayout(shopDomain: $shopDomain, productKey: $productKey, layoutID: $layoutID, apiVersion: $apiVersion, userLocation: $userLocation, extParam: $extParam, tokonow: $tokonow, deviceID: $deviceID) {
                            requestID
                            name
                            pdpSession
                            basicInfo {
                              id: productID
                            }
                            components {
                            name
                            type
                            position
                            data {
                                ...ProductVariant
                                __typename
                            }
                            __typename
                            }
                            __typename
                        }
                        }
                        """

    query_variables = {
        "shopDomain": f"{shop_domain}",
        "productKey": f"{product_key}",
        "apiVersion": 1,
    }
    request_body = {
        "operationName": "PDPGetLayoutQuery",
        "variables": query_variables,
        "query": graphql_query,
    }
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.tokopedia.com",
        "X-TKPD-AKAMAI": "pdpGetLayout",
    }

    try:
        response = requests.request(
            method="POST",
            url=endpoint_url,
            json=request_body,
            headers=request_headers,
            timeout=30,
        )
        # Turn HTTP error statuses into exceptions so they share the
        # handler below with connection-level failures.
        response.raise_for_status()
        logger.info(f"Request successful. Status code: {response.status_code}")
    except requests.exceptions.RequestException as exc:
        logger.error(f"Request failed: {exc}")
        return None

    return response


def request_product_review(product_id, page=1, limit=20):
    """Send the ``productReviewList`` GraphQL request for one page of reviews.

    Args:
        product_id: Value sent as the ``productID`` query variable.
        page: Page number sent as the ``page`` query variable.
        limit: Page size sent as the ``limit`` query variable.

    Returns:
        The ``requests`` response object when the POST completes and
        ``raise_for_status()`` does not raise. ``None`` when a
        ``requests.exceptions.RequestException`` occurred; the failure is
        logged and not re-raised.
    """
    endpoint_url = "https://gql.tokopedia.com/graphql/productReviewList"

    # The query text is sent verbatim, so its whitespace is left untouched.
    graphql_query = """query productReviewList($productID: String!, $page: Int!, $limit: Int!, $sortBy: String, $filterBy: String) {
  productrevGetProductReviewList(productID: $productID, page: $page, limit: $limit, sortBy: $sortBy, filterBy: $filterBy) {
    productID
    list {
      id: feedbackID
      variantName
      message
      productRating
      reviewCreateTime
      reviewCreateTimestamp
      isReportable
      isAnonymous
      reviewResponse {
        message
        createTime
        __typename
      }
      user {
        userID
        fullName
        image
        url
        __typename
      }
      likeDislike {
        totalLike
        likeStatus
        __typename
      }
      stats {
        key
        formatted
        count
        __typename
      }
      badRatingReasonFmt
      __typename
    }
    shop {
      shopID
      name
      url
      image
      __typename
    }
    hasNext
    totalReviews
    __typename
  }
}
                        """

    query_variables = {
        "productID": f"{product_id}",
        "page": page,
        "limit": limit,
        "sortBy": "",
        "filterBy": "",
    }
    request_body = {
        "operationName": "productReviewList",
        "variables": query_variables,
        "query": graphql_query,
    }
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.tokopedia.com",
        "X-TKPD-AKAMAI": "productReviewList",
    }

    try:
        response = requests.request(
            method="POST",
            url=endpoint_url,
            json=request_body,
            headers=request_headers,
            timeout=30,
        )
        # Turn HTTP error statuses into exceptions so they share the
        # handler below with connection-level failures.
        response.raise_for_status()
        logger.info(f"Request successful. Status code: {response.status_code}")
    except requests.exceptions.RequestException as exc:
        logger.error(f"Request failed: {exc}")
        return None

    return response


def scrape(product_id, max_reviews=LIMIT, max_pages=1):
    """Download review comments for a product and return them as a DataFrame.

    Pages are requested through ``request_product_review`` until the API
    reports no further page, ``max_pages`` pages have been read, or
    ``max_reviews`` reviews have been collected, whichever comes first.

    Args:
        product_id: Product ID passed on to ``request_product_review``.
        max_reviews: Upper bound on the number of reviews returned.
        max_pages: Maximum number of review pages to request. The default of
            1 keeps the long-standing behaviour of reading only the first
            page; raise it to collect more reviews per product.

    Returns:
        A DataFrame with a single ``comment`` column holding each review's
        ``message`` field. The frame is empty (but still has the ``comment``
        column) when no reviews were collected.

    Raises:
        RuntimeError: If the very first page could not be fetched.
    """
    all_reviews = []
    page = 1
    has_next = True

    logger.info("Extracting product reviews...")
    while has_next and page <= max_pages and len(all_reviews) < max_reviews:
        response = request_product_review(product_id, page=page)
        if response is None:
            # request_product_review logs the failure and returns None.
            if not all_reviews:
                raise RuntimeError("Failed to fetch product reviews")
            # Keep what was already collected instead of discarding it.
            logger.warning("Stopping early: review page %s could not be fetched", page)
            break

        data = response.json()["data"]["productrevGetProductReviewList"]
        reviews = data["list"] or []
        if not reviews:
            break
        all_reviews.extend(reviews)
        has_next = data["hasNext"]
        page += 1

    # The last page may overshoot the requested bound.
    del all_reviews[max_reviews:]

    if not all_reviews:
        # json_normalize([]) has no columns, so selecting "comment" would fail.
        return pd.DataFrame(columns=["comment"])

    reviews_df = pd.json_normalize(all_reviews)
    reviews_df = reviews_df.rename(columns={"message": "comment"})[["comment"]]
    logger.info(reviews_df.head())
    return reviews_df


def get_product_id(URL):
    """Resolve a product page URL to its product ID.

    The last two non-empty path segments of the URL are taken as the shop
    domain and the product key, so a trailing slash (``.../shop/product/``)
    no longer shifts the product key into the shop position.

    Args:
        URL: Product page URL whose path ends in ``<shop>/<product-key>``.

    Returns:
        The ``id`` value found under ``data.pdpGetLayout.basicInfo`` in the
        response of ``request_product_id``.

    Raises:
        ValueError: If the URL path has fewer than two non-empty segments.
        RuntimeError: If ``request_product_id`` returned no response.
    """
    segments = [part for part in urlparse(URL).path.split("/") if part]
    if len(segments) < 2:
        raise ValueError(f"Cannot find shop and product key in URL: {URL!r}")
    shop, product_key = segments[-2:]

    response = request_product_id(shop, product_key)
    if response is None:
        # request_product_id logs the failure and returns None.
        raise RuntimeError("Failed to fetch product information")

    product_id = response.json()["data"]["pdpGetLayout"]["basicInfo"]["id"]
    logger.info(product_id)
    return product_id


# Clean
def clean(df):
    """Return a copy of the review frame with normalised, non-empty comments.

    Rows containing missing values are dropped, empty comments are removed,
    every remaining comment is passed through ``clean_text``, and comments
    that became empty after cleaning are removed as well. The index is reset
    after each filtering step.
    """
    without_missing = df.dropna().copy().reset_index(drop=True)

    has_text = without_missing["comment"] != ""
    non_empty = without_missing[has_text].reset_index(drop=True)

    non_empty["comment"] = non_empty["comment"].apply(clean_text)

    # Cleaning can reduce a comment (e.g. one made only of emoji) to "".
    still_has_text = non_empty["comment"] != ""
    cleaned = non_empty[still_has_text].reset_index(drop=True)

    logger.info("cleaned")
    return cleaned


def clean_text(text):
    """Normalise a single review comment.

    Steps, in order: Unicode NFKD normalisation, emoji removal, collapsing of
    letters repeated three or more times in a row, and collapsing of runs of
    spaces followed by stripping leading/trailing whitespace.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment; may be an empty string.
    """
    text = uni.normalize("NFKD", text)  # normalise characters
    text = emoji.replace_emoji(text, "")  # remove emoji
    # Collapse elongated words ("baguuuus" -> "bagus"). Digits are excluded
    # on purpose: with a plain \w, a number such as "100000" was rewritten to
    # "10", corrupting prices and quantities mentioned in reviews.
    text = re.sub(r"([^\W\d])\1{2,}", r"\1", text)
    text = re.sub(r"[ ]+", " ", text).strip()  # remove extra spaces
    return text


# LLM
# Chat model handed to the RetrievalQA chain in generate().
# NOTE(review): credentials are presumably picked up from the environment
# populated by load_dotenv() - confirm.
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)

# Embeddings
# Embedding model used when building the FAISS vector store in generate().
embeddings = HuggingFaceEmbeddings(model_name="LazarusNLP/all-indobert-base-v2")

# Module-level state shared across generate() calls.
# cache_URL is not assigned anywhere in the visible code.
cache_URL = ""
# db and qa are rebound by generate() on each successful call.
db = None
qa = None
# Maps a product URL to its (split documents, FAISS store) tuple.
# NOTE(review): entries are never evicted in the visible code.
cache = {}


@spaces.GPU
async def generate(URL, query):
    """Answer a question about a product using its scraped reviews.

    On the first request for a URL the product ID is resolved, the reviews
    are scraped, cleaned, split and indexed in a FAISS vector store, and the
    result is stored in the module-level ``cache``. Later questions about the
    same URL reuse the cached store and make no scraping requests.

    Args:
        URL: Product page URL entered by the user.
        query: Question to answer from the reviews.

    Returns:
        The ``result`` text of the retrieval QA chain, or a short user-facing
        message when an input is empty, no review text is available, or a
        step fails.
    """
    # db and qa are rebound below; they stay module globals so the module
    # state observable after a call is the same as before this rewrite.
    global db, qa

    if URL == "" or query == "":
        return "Input kosong"

    try:
        if URL not in cache:
            # Resolve the product only on a cache miss; a cache hit needs no
            # network round trip to the shop.
            product_id = get_product_id(URL)

            try:
                reviews = scrape(product_id)
                cleaned_reviews = clean(reviews)
                loader = DataFrameLoader(
                    cleaned_reviews, page_content_column="comment"
                )
                documents = loader.load()
            except Exception as e:
                logger.exception("Failed to load reviews for %s", URL)
                return "Error getting reviews: " + str(e)

            if not documents:
                # Nothing to index or retrieve from; answer directly instead
                # of building a vector store from an empty document list.
                return "Tidak ada ulasan yang dapat diproses"

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=50
            )
            docs = text_splitter.split_documents(documents)
            cache[URL] = (docs, FAISS.from_documents(docs, embeddings))

        docs, db = cache[URL]

        # Chain to answer questions
        qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
        res = await qa.ainvoke(query)
        return res["result"]
    except Exception:
        # Exception (not a bare except) lets task cancellation and
        # interpreter-exit signals propagate; the traceback is logged so the
        # generic message shown to the user can still be diagnosed.
        logger.exception("Failed to answer query for %s", URL)
        return "Gagal mendapatkan review dari URL"


# Gradio
# Two text inputs (product URL and question) are passed positionally to
# generate(URL, query); its return value is shown in the "Jawaban" textbox.
product_box = gr.Textbox(label="URL Produk", placeholder="URL produk dari Tokopedia")
query_box = gr.Textbox(
    lines=2,
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
)

# The interface is built and launched at import time.
# NOTE(review): there is no `if __name__ == "__main__"` guard, so importing
# this module starts the app - presumably intended for the hosting
# environment; confirm.
gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=[gr.Textbox(label="Jawaban")],
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia Indonesia (https://tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
    allow_flagging="never",
).launch(debug=True)