File size: 5,031 Bytes
9cb4e8f
 
 
286516b
 
 
 
 
c66df08
27e0ec6
 
286516b
27e0ec6
286516b
 
 
 
 
 
 
 
 
d050f83
 
 
 
a33de26
d050f83
 
 
 
 
 
36ab1a9
3885d1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d050f83
3885d1d
 
 
 
 
 
 
 
 
d050f83
3885d1d
36ab1a9
286516b
 
 
 
 
 
 
d050f83
286516b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333b8cd
286516b
 
 
 
 
 
3885d1d
286516b
3885d1d
 
 
286516b
3885d1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d050f83
3885d1d
 
 
 
 
 
 
 
d050f83
3885d1d
 
 
 
286516b
 
 
3885d1d
286516b
 
 
 
 
 
 
 
 
8464276
286516b
 
7f75ebb
6e16551
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from dotenv import load_dotenv
load_dotenv()

import re
from urllib.parse import urlparse, parse_qs
import pandas as pd
import unicodedata as uni
import emoji
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from tokopedia import request_product_id, request_product_review
import gradio as gr

# Module-level placeholders.
# NOTE(review): shop_id, item_id and item are not referenced anywhere else in
# the visible part of this file -- confirm whether they are still needed.
shop_id = ""
item_id = ""
item = {}
# Default value of scrape()'s ``max_reviews`` parameter.
LIMIT = 1000  # Limit to 1000 reviews so that processing does not take too long

import logging

# Configure logging: the root logger is set to DEBUG and every record is
# formatted as "<timestamp> [<LEVEL>] <message>" and sent to a StreamHandler.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)

# Logger named after this module.
logger = logging.getLogger(__name__)


def scrape(product_id, max_reviews=LIMIT):
    """Download at most ``max_reviews`` reviews of a product as a DataFrame.

    Pages are requested through ``request_product_review`` starting at page 1
    and fetching stops as soon as the response reports no further page, a page
    comes back empty, or the cap has been reached.

    Args:
        product_id: Identifier forwarded to ``request_product_review``.
        max_reviews: Maximum number of reviews to keep.

    Returns:
        A DataFrame with a single ``comment`` column holding the ``message``
        field of each review. When no review was found the frame is empty but
        still has the ``comment`` column.
    """
    all_reviews = []
    page = 1
    has_next = True

    # Strict "<": once the cap is reached no further page is requested.
    while has_next and len(all_reviews) < max_reviews:
        response = request_product_review(product_id, page=page)
        data = response.json()["data"]["productrevGetProductReviewList"]
        reviews = data["list"]
        if not reviews:
            # An empty page that still claims "hasNext" would never terminate.
            break
        all_reviews.extend(reviews)
        has_next = data["hasNext"]
        page += 1

    # The last page fetched may overshoot the cap; keep only the first ones.
    all_reviews = all_reviews[:max_reviews]

    if not all_reviews:
        # json_normalize([]) has no columns, so selecting "comment" would fail.
        reviews_df = pd.DataFrame(columns=["comment"])
    else:
        reviews_df = pd.json_normalize(all_reviews)
        reviews_df = reviews_df.rename(columns={"message": "comment"})
        reviews_df = reviews_df[["comment"]]
    logger.info(reviews_df.head())
    return reviews_df


def get_product_id(URL):
    """Resolve a product page URL to its product id.

    The last two non-empty segments of the URL path are taken as the shop and
    the product key and passed to ``request_product_id``; the id is read from
    ``data.pdpGetLayout.basicInfo.id`` of the JSON response.

    Args:
        URL: Product page URL whose path ends in ``/<shop>/<product-key>``.

    Returns:
        The product id found in the response.

    Raises:
        ValueError: If the URL path has fewer than two non-empty segments.
    """
    parsed_url = urlparse(URL)
    # Ignore empty segments so a trailing (or doubled) slash does not shift
    # the product key into the shop position.
    segments = [part for part in parsed_url.path.split("/") if part]
    if len(segments) < 2:
        raise ValueError(f"URL has no shop/product path: {URL!r}")
    shop, product_key = segments[-2:]

    response = request_product_id(shop, product_key)
    product_id = response.json()["data"]["pdpGetLayout"]["basicInfo"]["id"]
    logger.info(product_id)
    return product_id


# Clean
def clean(df):
    """Return a cleaned copy of a review frame with a ``comment`` column.

    Rows containing missing values or an empty comment are dropped, each
    remaining comment is passed through ``clean_text``, and comments that end
    up empty after cleaning are dropped too. The index is renumbered from 0.
    """

    def without_blank_comments(frame):
        # Keep only the rows whose comment is not the empty string.
        return frame[frame["comment"] != ""].reset_index(drop=True)

    present = df.dropna().copy().reset_index(drop=True)
    kept = without_blank_comments(present)
    kept["comment"] = kept["comment"].apply(clean_text)
    result = without_blank_comments(kept)
    logger.info("cleaned")
    return result


def clean_text(text):
    """Normalise a single review comment.

    Steps, in order: NFKD Unicode normalisation, emoji removal, collapsing a
    character repeated three or more times into one occurrence, and squeezing
    runs of spaces (with leading/trailing whitespace stripped).

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string; may be empty.
    """
    text = uni.normalize("NFKD", text)  # normalise characters
    text = emoji.replace_emoji(text, "")  # remove emoji
    # "[^\W\d]" is "\w" minus digits: "baguuus" -> "bagus", but numbers such
    # as "1000" are left intact instead of being truncated to "10".
    text = re.sub(r"([^\W\d])\1{2,}", r"\1", text)  # repeated chars
    text = re.sub(r"[ ]+", " ", text).strip()  # remove extra spaces
    return text


# LLM
# Chat model handed to the RetrievalQA chain built in generate().
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)

# Embeddings
# Embedding model used when the FAISS index is built in generate().
embeddings = HuggingFaceEmbeddings(model_name="LazarusNLP/all-indobert-base-v2")

# State shared between generate() calls (declared ``global`` there).
cache_URL = ""  # product URL the current index was built for
db = None  # FAISS vector store built from that product's reviews
qa = None  # RetrievalQA chain backed by ``db``


async def generate(URL, query):
    """Answer ``query`` about the reviews of the product at ``URL``.

    The review index and QA chain are rebuilt only when ``URL`` differs from
    the one cached in the module globals ``cache_URL``/``db``/``qa``; repeated
    questions about the same product reuse the cached chain.

    Args:
        URL: Product page URL.
        query: Question to answer from the reviews.

    Returns:
        The answer text, or a short human-readable error message. Failures are
        reported as strings rather than raised so the UI can display them.
    """
    global cache_URL, db, qa
    if URL == "" or query == "":
        return "Empty input"

    if URL != cache_URL or qa is None:
        # Only a failure to resolve the URL is reported as an invalid URL.
        try:
            product_id = get_product_id(URL)
        except Exception:
            logger.exception("could not resolve product id")
            return "URL tidak valid"

        # Get, clean and load the reviews.
        try:
            reviews = scrape(product_id)  # scrape expects the id, not the URL
            cleaned_reviews = clean(reviews)
            loader = DataFrameLoader(
                cleaned_reviews, page_content_column="comment"
            )
            documents = loader.load()
        except Exception as e:
            logger.exception("could not get reviews")
            return "Error getting reviews: " + str(e)
        if not documents:
            # Nothing to index; building a vector store would fail.
            return "Error getting reviews: no reviews found"

        # Split, embed and build the QA chain.
        try:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=50
            )
            docs = text_splitter.split_documents(documents)
            logger.info("split")
            new_db = FAISS.from_documents(docs, embeddings)
            new_qa = RetrievalQA.from_chain_type(
                llm=llm, retriever=new_db.as_retriever()
            )
        except Exception as e:
            logger.exception("could not build review index")
            return "Error preparing reviews: " + str(e)

        # Publish the cache only after everything succeeded, so a failed build
        # never leaves cache_URL pointing at a stale or missing chain.
        db, qa, cache_URL = new_db, new_qa, URL

    try:
        res = await qa.ainvoke(query)
    except Exception as e:
        logger.exception("could not generate answer")
        return "Error generating answer: " + str(e)
    logger.info("generated")
    return res["result"]

# Gradio UI: a product URL box and a question box feed `generate`; its return
# value is shown in a single answer box. The app is launched on import.
product_box = gr.Textbox(
    placeholder="URL produk dari Tokopedia",
    label="URL Produk",
)
query_box = gr.Textbox(
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
    lines=2,
)
answer_box = gr.Textbox(label="Jawaban")

demo = gr.Interface(
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia Indonesia (https://tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
    fn=generate,
    inputs=[product_box, query_box],
    outputs=[answer_box],
    allow_flagging="never",
)
demo.launch(debug=True)