File size: 8,911 Bytes
fb6b415
1ee2db4
9cb4e8f
286516b
470bdb8
286516b
 
 
c66df08
27e0ec6
 
286516b
27e0ec6
286516b
 
470bdb8
 
286516b
470bdb8
 
286516b
470bdb8
 
d050f83
 
 
a33de26
d050f83
 
 
 
 
8099d9b
 
 
 
 
 
 
470bdb8
 
 
 
 
 
 
 
 
 
f87f358
2ee750c
f87f358
470bdb8
c2122c6
d996c80
dd2752f
60e788e
 
d996c80
 
60e788e
 
38baeb2
dd2752f
d996c80
f87f358
d996c80
 
 
f87f358
2ee750c
d996c80
 
 
f87f358
 
470bdb8
f87f358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95f9443
470bdb8
 
95f9443
470bdb8
95f9443
 
470bdb8
f87f358
 
470bdb8
3885d1d
 
 
 
2577f1f
470bdb8
 
 
 
 
 
 
 
 
3885d1d
 
 
d050f83
3885d1d
 
 
470bdb8
3885d1d
 
f32ae2a
c2122c6
470bdb8
 
 
 
 
 
 
36ab1a9
286516b
470bdb8
286516b
470bdb8
 
 
 
 
286516b
 
 
470bdb8
286516b
470bdb8
 
 
 
286516b
 
 
470bdb8
286516b
333b8cd
286516b
 
470bdb8
fb6b415
3885d1d
1977bde
 
470bdb8
c034fa1
470bdb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1977bde
 
470bdb8
 
 
 
 
 
286516b
 
470bdb8
3885d1d
286516b
 
 
 
 
 
 
 
 
8464276
286516b
 
7f75ebb
1ee2db4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import spaces
import os
from dotenv import load_dotenv
import re
from urllib.parse import urlparse
import pandas as pd
import unicodedata as uni
import emoji
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr
import logging
import requests

# Load environment variables via python-dotenv.
# NOTE(review): presumably this supplies API credentials (e.g. the OpenAI key)
# from a local .env file — nothing in this file reads them directly; confirm.
load_dotenv()

# Set command line arguments for Gradio.
# NOTE(review): nothing in this file reads COMMANDLINE_ARGS; presumably the
# hosting runtime consumes it to disable the request queue — TODO confirm.
os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"

# Configure logging: root logger at DEBUG, timestamped records, one stream handler.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)
# Module-level logger shared by every function below.
logger = logging.getLogger(__name__)

import http.client

# Verbose HTTP tracing: switch on http.client's debug output and raise the
# urllib3 logger (as vendored under requests) to DEBUG, propagating to the
# root handler configured above.
# NOTE(review): this is very noisy and may echo request headers into the logs;
# consider turning it off outside of debugging sessions.
http.client.HTTPConnection.debuglevel = 1
req_log = logging.getLogger("requests.packages.urllib3")
req_log.setLevel(logging.DEBUG)
req_log.propagate = True

# Constants and module-level state
LIMIT = 1000  # Default cap on reviews fetched per product, to avoid long processing times
OpenAIModel = "gpt-3.5-turbo"  # Chat model name passed to ChatOpenAI below
# NOTE(review): shop_id, item_id, item and cache_URL are not read by any
# function visible in this file — possibly leftovers; confirm before removing.
shop_id = ""
item_id = ""
item = {}
cache_URL = ""
db = None  # Vector store used by the most recent generate() call
qa = None  # RetrievalQA chain built by the most recent generate() call
cache = {}  # Product URL -> (split documents, vector store), filled by generate()

import json

# Function to request product ID from Tokopedia
def request_product_id(shop_domain, product_key, url):
    """Ask Tokopedia's PDP layout GraphQL endpoint about a product.

    Args:
        shop_domain: Shop slug taken from the product URL path.
        product_key: Product slug taken from the product URL path.
        url: Original product URL. Kept for backward compatibility with
            existing callers; it is not sent to the endpoint.

    Returns:
        The ``requests.Response`` of the POST. No status check is applied
        here; callers test the response's truthiness, which is false for
        error statuses. Returns ``None`` when the request itself could not
        be completed (connection error, timeout, ...), matching the
        behaviour of ``request_product_review``.
    """
    endpoint = "https://gql.tokopedia.com/graphql/PDPGetLayoutQuery"
    payload = {
        "operationName": "PDPGetLayoutQuery",
        "variables": {
            # The GraphQL variables are declared as String, so coerce explicitly.
            "shopDomain": str(shop_domain),
            "productKey": str(product_key),
            "apiVersion": 1,
        },
        "query": "fragment ProductVariant on pdpDataProductVariant { errorCode parentID defaultChild children { productID } __typename } query PDPGetLayoutQuery($shopDomain: String, $productKey: String, $layoutID: String, $apiVersion: Float, $userLocation: pdpUserLocation, $extParam: String, $tokonow: pdpTokoNow, $deviceID: String) { pdpGetLayout(shopDomain: $shopDomain, productKey: $productKey, layoutID: $layoutID, apiVersion: $apiVersion, userLocation: $userLocation, extParam: $extParam, tokonow: $tokonow, deviceID: $deviceID) { requestID name pdpSession basicInfo { id: productID } components { name type position data { ...ProductVariant __typename } __typename } __typename } }",
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.tokopedia.com",
        "X-TKPD-AKAMAI": "pdpGetLayout",
    }

    try:
        return requests.post(endpoint, json=payload, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        # Transport-level failure: report it the same way the review request does
        # instead of letting the exception escape to the caller.
        logger.error(f"Request failed: {e}")
        return None


# Function to request product reviews from Tokopedia
def request_product_review(product_id, page=1, limit=20):
    """Fetch one page of reviews for a product from Tokopedia's GraphQL API.

    Args:
        product_id: Product ID whose reviews are requested.
        page: Page number to request.
        limit: Number of reviews requested per page.

    Returns:
        The ``requests.Response`` when the POST succeeds with a non-error
        status, otherwise ``None`` (the failure is logged).
    """
    # GraphQL document sent verbatim as the "query" field of the request body.
    review_query = """query productReviewList($productID: String!, $page: Int!, $limit: Int!, $sortBy: String, $filterBy: String) {
  productrevGetProductReviewList(productID: $productID, page: $page, limit: $limit, sortBy: $sortBy, filterBy: $filterBy) {
    productID
    list {
      id: feedbackID
      variantName
      message
      productRating
      reviewCreateTime
      reviewCreateTimestamp
      isReportable
      isAnonymous
      reviewResponse {
        message
        createTime
        __typename
      }
      user {
        userID
        fullName
        image
        url
        __typename
      }
      likeDislike {
        totalLike
        likeStatus
        __typename
      }
      stats {
        key
        formatted
        count
        __typename
      }
      badRatingReasonFmt
      __typename
    }
    shop {
      shopID
      name
      url
      image
      __typename
    }
    hasNext
    totalReviews
    __typename
  }
}
                        """
    query_variables = {
        "productID": f"{product_id}",
        "page": page,
        "limit": limit,
        "sortBy": "",
        "filterBy": "",
    }
    request_body = {
        "operationName": "productReviewList",
        "variables": query_variables,
        "query": review_query,
    }
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.tokopedia.com",
        "X-TKPD-AKAMAI": "productReviewList",
    }
    target = "https://gql.tokopedia.com/graphql/productReviewList"

    try:
        reply = requests.post(
            target, json=request_body, headers=request_headers, timeout=60
        )
        # Turn HTTP error statuses into exceptions so they share the failure path.
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        logger.error(f"Request failed: {err}")
        return None

    logger.info(f"Request successful. Status code: {reply.status_code}")
    return reply


# Function to scrape reviews for a product
def scrape(product_id, max_reviews=LIMIT):
    """Download review comments for a product, page by page.

    Paging stops when the API reports no further page, when ``max_reviews``
    reviews have been collected, when a request fails, or when a page comes
    back empty or malformed. Reviews gathered before a failure are kept.

    Args:
        product_id: Product ID passed on to ``request_product_review``.
        max_reviews: Upper bound on the number of reviews returned.

    Returns:
        A DataFrame with a single ``comment`` column holding the review
        messages. When nothing could be fetched the frame is empty but still
        has the ``comment`` column, so callers can test ``.empty`` safely.
    """
    all_reviews = []
    page = 1
    has_next = True
    logger.info("Extracting product reviews...")
    while has_next and len(all_reviews) < max_reviews:
        response = request_product_review(product_id, page=page)
        if not response:
            break
        try:
            data = response.json()["data"]["productrevGetProductReviewList"]
            reviews = data["list"]
        except (ValueError, KeyError, TypeError) as e:
            # Non-JSON body, or a payload without the expected structure
            # (e.g. "data" is null): keep what we have.
            logger.error(f"Unexpected review response on page {page}: {e}")
            break
        if not reviews:
            # An empty page means there is nothing more to read; stopping here
            # also prevents an endless loop should hasNext stay true.
            break
        all_reviews.extend(reviews)
        has_next = data.get("hasNext", False)
        page += 1

    # Pages arrive in fixed-size batches, so the last one may overshoot the cap.
    all_reviews = all_reviews[:max_reviews]
    if not all_reviews:
        # json_normalize([]) yields a frame without columns; selecting
        # "comment" from it would raise KeyError.
        return pd.DataFrame(columns=["comment"])

    reviews_df = pd.json_normalize(all_reviews)
    reviews_df = reviews_df.rename(columns={"message": "comment"})[["comment"]]
    logger.info(reviews_df.head())
    return reviews_df


# Function to extract product ID from URL
def get_product_id(URL):
    """Resolve a Tokopedia product URL to its product ID.

    The last two non-empty path segments of the URL are taken as the shop
    slug and the product key; the query string and any trailing slash are
    ignored.

    Args:
        URL: Product page URL whose path ends in ``<shop>/<product-key>``.

    Returns:
        The product ID found in the endpoint's response, or ``None`` when the
        URL has no shop/product path, the request fails, or the response
        does not contain an ID.
    """
    # Drop empty segments so leading/trailing slashes cannot shift the slugs.
    segments = [part for part in urlparse(URL).path.split("/") if part]
    if len(segments) < 2:
        logger.error("Failed to get product ID")
        return None
    shop, product_key = segments[-2:]

    response = request_product_id(shop, product_key, URL)
    if not response:
        logger.error("Failed to get product ID")
        return None

    try:
        product_id = response.json()["data"]["pdpGetLayout"]["basicInfo"]["id"]
    except (ValueError, KeyError, TypeError) as e:
        # Non-JSON body or a payload missing one of the expected levels.
        logger.error(f"Failed to get product ID: {e}")
        return None

    logger.info(f"Product ID: {product_id}")
    return product_id


# Function to clean the reviews DataFrame
def clean(df):
    """Return the reviews with missing and blank comments removed and text cleaned.

    Rows containing missing values are dropped, every remaining ``comment``
    is passed through ``clean_text``, and rows whose comment is an empty
    string (before or after cleaning) are discarded. The returned frame has a
    fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame is left untouched.
    present = df.dropna().copy().reset_index(drop=True)

    has_text = present["comment"] != ""
    non_blank = present[has_text].reset_index(drop=True)

    non_blank["comment"] = non_blank["comment"].apply(clean_text)

    # Cleaning can reduce a comment to nothing (e.g. emoji only); filter again.
    still_has_text = non_blank["comment"] != ""
    result = non_blank[still_has_text].reset_index(drop=True)

    logger.info("Cleaned reviews DataFrame")
    return result


# Function to clean individual text entries
def clean_text(text):
    """Normalize a single review comment.

    Steps, in order: Unicode NFKD normalization, emoji removal, collapsing
    of elongated letters (three or more of the same letter in a row become
    one, e.g. ``baguuuus`` -> ``bagus``), and collapsing of runs of spaces
    followed by stripping leading/trailing whitespace.

    Only letters are collapsed: digits and underscores are left alone so
    that numbers such as ``100000`` are not turned into ``10``.

    Args:
        text: Raw comment text.

    Returns:
        The cleaned text; may be an empty string.
    """
    text = uni.normalize("NFKD", text)  # Normalize characters
    text = emoji.replace_emoji(text, "")  # Remove emoji
    # [^\W\d_] is "a word character that is neither a digit nor an underscore",
    # i.e. a letter; plain \w would also squash repeated digits in prices.
    text = re.sub(r"([^\W\d_])\1{2,}", r"\1", text)  # Collapse elongated letters
    text = re.sub(r"[ ]+", " ", text).strip()  # Remove extra spaces
    return text


# Initialize LLM and embeddings once at import time; generate() reuses both
# for every request.
# Chat model named by OpenAIModel, with a low temperature (0.1).
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)
# Sentence-embedding model used to index review chunks in FAISS.
# NOTE(review): the model name suggests an Indonesian BERT variant — confirm
# it matches the language of the reviews being indexed.
embeddings = HuggingFaceEmbeddings(model_name="LazarusNLP/all-indobert-base-v2")


# Function to generate a summary or answer based on reviews
@spaces.GPU
async def generate(URL, query):
    """Answer a question about a Tokopedia product using its reviews.

    On the first request for a URL the reviews are scraped, cleaned, split
    into chunks and indexed in a FAISS store, which is then cached under the
    URL. Later requests for the same URL reuse the cached store and skip all
    Tokopedia requests.

    Args:
        URL: Tokopedia product page URL.
        query: Question to answer from the reviews.

    Returns:
        The answer text produced by the retrieval QA chain, or one of the
        user-facing messages: ``"Input kosong"`` when an input is empty,
        ``"Gagal mendapatkan product ID"`` when the product cannot be
        resolved, ``"Tidak ada ulasan ditemukan"`` when there are no usable
        reviews, and ``"Gagal mendapatkan review dari URL"`` on any other
        failure.
    """
    # db and qa are module-level and rebound here; cache is only mutated, so
    # it needs no global declaration.
    global db, qa

    if not URL or not query:
        return "Input kosong"
    try:
        if URL in cache:
            # Cache hit: no need to contact Tokopedia at all.
            _, db = cache[URL]
        else:
            product_id = get_product_id(URL)
            if not product_id:
                return "Gagal mendapatkan product ID"

            reviews = scrape(product_id)
            if reviews.empty:
                return "Tidak ada ulasan ditemukan"

            cleaned_reviews = clean(reviews)
            if cleaned_reviews.empty:
                # Every comment was blank after cleaning; there is nothing
                # to index, and building the vector store would fail.
                return "Tidak ada ulasan ditemukan"

            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=50
            )
            docs = text_splitter.split_documents(documents)
            db = FAISS.from_documents(docs, embeddings)
            # NOTE(review): the cache is unbounded; consider an eviction policy
            # if many distinct products are queried.
            cache[URL] = (docs, db)

        qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
        res = await qa.ainvoke(query)
        return res["result"]
    except Exception as e:
        # Top-level boundary for the UI: log with traceback, show a generic message.
        logger.exception(f"Error in generating response: {e}")
        return "Gagal mendapatkan review dari URL"


# Set up Gradio interface: two text inputs (product URL and question) wired to
# generate(), with a single text output for the answer.
product_box = gr.Textbox(label="URL Produk", placeholder="URL produk dari Tokopedia")
query_box = gr.Textbox(
    lines=2,
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
)

# Build the app and start serving immediately. This runs at import time: the
# module has no __main__ guard, so importing it launches the UI.
gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=[gr.Textbox(label="Jawaban")],
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia Indonesia (https://tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
    allow_flagging="never",
).launch(debug=True)