File size: 4,209 Bytes
286516b
 
 
 
 
27e0ec6
 
 
286516b
27e0ec6
286516b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re
from urllib.parse import urlparse, parse_qs
import pandas as pd
import unicodedata as uni
import emoji
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from tokopedia import request_product_id, request_product_review
import gradio as gr

# Module-level placeholders. Nothing in the visible part of this file reads
# shop_id, item_id or item.
# NOTE(review): confirm they are unused elsewhere before removing them.
shop_id = ""
item_id = ""
item = {}
# Default for scrape()'s max_reviews parameter.
LIMIT = 1000  # Limit to 1000 reviews so that processing does not take too long

def scrape(URL, max_reviews=LIMIT):
    """Download up to ``max_reviews`` reviews of a Tokopedia product.

    The shop and product keys are the last two non-empty segments of the
    URL path. They are resolved to a product id with ``request_product_id``
    and the reviews are then fetched page by page with
    ``request_product_review`` until the service reports no further page or
    the limit is reached.

    Args:
        URL: Product page URL whose path ends in ``/<shop>/<product>``.
            A trailing slash is tolerated.
        max_reviews: Maximum number of reviews to return.

    Returns:
        A DataFrame produced by ``pandas.json_normalize`` with one row per
        review and at most ``max_reviews`` rows.

    Raises:
        ValueError: If the URL path has fewer than two non-empty segments.
    """
    # Drop empty segments so ".../shop/product/" does not yield an empty
    # product key (and the product slug in place of the shop).
    segments = [part for part in urlparse(URL).path.split("/") if part]
    if len(segments) < 2:
        raise ValueError(f"Cannot find shop and product in URL: {URL!r}")
    shop, product_key = segments[-2:]

    product_id = request_product_id(shop, product_key).json()["data"][
        "pdpGetLayout"
    ]["basicInfo"]["id"]

    all_reviews = []
    page = 1
    has_next = True
    # Strict "<": once the limit is reached no further page is requested.
    while has_next and len(all_reviews) < max_reviews:
        response = request_product_review(product_id, page=page)
        data = response.json()["data"]["productrevGetProductReviewList"]
        all_reviews.extend(data["list"])
        has_next = data["hasNext"]
        page += 1

    # The final page may overshoot the limit, so trim before normalising.
    return pd.json_normalize(all_reviews[:max_reviews])

# Clean
def clean(df):
    """Return the reviews whose comment still contains text after cleaning.

    Rows with a missing or empty ``comment`` are removed, every remaining
    comment is passed through ``clean_text``, and rows whose comment became
    empty as a result are removed as well.

    Args:
        df: DataFrame of reviews with a ``comment`` column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; the input is not modified.

    Raises:
        KeyError: If ``df`` has no ``comment`` column.
    """
    # Only a missing comment disqualifies a review. A plain dropna() would
    # also discard reviews that merely lack a value in some other column.
    df = df.dropna(subset=["comment"]).copy().reset_index(drop=True)
    df = df[df["comment"] != ""].reset_index(drop=True)  # remove empty reviews
    df["comment"] = df["comment"].apply(clean_text)  # clean text
    # Cleaning can empty a comment (e.g. one that was only emoji).
    df = df[df["comment"] != ""].reset_index(drop=True)
    return df


def clean_text(text):
    """Normalise a single review comment.

    Steps, in order: Unicode NFKD normalisation, emoji removal, collapsing
    any word character repeated three or more times in a row down to one
    occurrence, and squeezing runs of spaces into a single space with the
    ends stripped.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment; it may be an empty string.
    """
    normalised = uni.normalize("NFKD", text)
    without_emoji = emoji.replace_emoji(normalised, "")
    # "baguuuus" -> "bagus": the back-reference matches the same character.
    deduplicated = re.sub(r"(\w)\1{2,}", r"\1", without_emoji)
    single_spaced = re.sub(r"[ ]+", " ", deduplicated)
    return single_spaced.strip()


# LLM: chat model handed to the RetrievalQA chain built in generate().
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)

# Embeddings: model used by generate() to build the FAISS vector store.
embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")

# Cache of the most recently processed product; generate() reads and
# rebinds all three names.
cache_URL = ""  # URL the cached store/chain were built for ("" = none yet)
db = None  # FAISS vector store for cache_URL
qa = None  # RetrievalQA chain backed by db


def generate(URL, query):
    """Answer ``query`` using the reviews of the product at ``URL``.

    The reviews are scraped, cleaned, split into chunks, indexed in a FAISS
    vector store and wrapped in a RetrievalQA chain. The store and chain are
    cached in the module globals ``db`` and ``qa`` and reused while ``URL``
    stays equal to ``cache_URL``.

    Args:
        URL: Product page URL passed on to ``scrape``.
        query: Question to run against the product's reviews.

    Returns:
        The chain's answer, or a string starting with
        ``"Error getting reviews: "`` when the reviews cannot be retrieved
        or none of them contain text.
    """
    global cache_URL, db, qa
    # "qa is None" covers a first call whose URL equals the initial "" cache
    # value, which would otherwise skip the build and call None.run().
    if URL != cache_URL or qa is None:
        # Get reviews
        try:
            reviews = scrape(URL)
            # Clean reviews
            cleaned_reviews = clean(reviews)
            # Load data
            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
        except Exception as e:
            return "Error getting reviews: " + str(e)

        # With no documents there is nothing to index or retrieve from.
        if not documents:
            return "Error getting reviews: no reviews with text were found"

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=50
        )
        docs = text_splitter.split_documents(documents)
        # Vector store
        new_db = FAISS.from_documents(docs, embeddings)
        # Chain to answer questions
        new_qa = RetrievalQA.from_chain_type(llm=llm, retriever=new_db.as_retriever())
        # Publish the cache only after every step succeeded. If building
        # fails, cache_URL must not claim that the old chain belongs to the
        # new URL.
        db, qa, cache_URL = new_db, new_qa, URL
    return qa.run(query)


# Gradio UI: two text inputs wired to generate(), one text output.
# The user-facing strings are in Indonesian.

# First input: the product URL, passed to generate() as URL.
product_box = gr.Textbox(
    label="URL Produk", placeholder="URL produk dari Tokopedia"
)
# Second input: the question, passed to generate() as query.
query_box = gr.Textbox(
    lines=2,
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
)

# Build the interface and launch it immediately; this runs whenever the
# module is executed or imported.
gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia Indonesia (https://tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
    allow_flagging="never",
    # Each example is a [URL, query] pair matching the order of `inputs`.
    examples=[
        [
            "https://www.tokopedia.com/benitashop/telur-asin-powder-madam-kwan-golden-salted-egg-powder",
            "Berapa lama produknya bisa bertahan?",
        ],
        [
            "https://www.tokopedia.com/benitashop/telur-asin-powder-madam-kwan-golden-salted-egg-powder",
            "Produknya bisa dipakai untuk memasak apa?",
        ],
    ],
).launch()