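"""RingkasUlas: a Gradio chatbot that answers questions about product reviews
on Shopee Indonesia.

Pipeline: scrape up to LIMIT reviews from Shopee's public ratings endpoint,
clean the text, embed it with LaBSE sentence embeddings, index it in FAISS,
and answer queries with a LangChain RetrievalQA chain backed by gpt-3.5-turbo.
"""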
import re
import requests
import pandas as pd
import unicodedata as uni
import emoji
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr

SHOPEE_API_URL = """https://shopee.co.id/api/v2/item/get_ratings?filter=0&flag=1&itemid={item_id}&limit=20&offset={offset}&shopid={shop_id}&type=0"""
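# The public ratings endpoint serves at most 20 reviews per request, paged via
# `offset`. The code below relies on responses shaped like
# {"data": {"ratings": [{"comment": "...", ...}, ...]}} (see scrape()).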
shop_id = ""
item_id = ""
item = {}
LIMIT = 1000  # Limit to 1000 reviews so that processing does not take too long


def get_product_id(URL):
    # Get shop id and item id from input URL
    r = re.search(r"i\.(\d+)\.(\d+)", URL)
    shop_id, item_id = r[1], r[2]
    return shop_id, item_id
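# Illustration, using the demo URL from the examples at the bottom of this file:
# "https://shopee.co.id/...-i.2392232.8965506?..." -> shop_id "2392232", item_id "8965506".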


def scrape(URL):
    try:
        shop_id, item_id = get_product_id(URL)
    except Exception:
        # URL does not contain a valid "i.<shop_id>.<item_id>" segment
        return None

    offset = 0
    reviews = []
    while True:
        # Get JSON data using shop_id and item_id from input URL
        data = requests.get(
            SHOPEE_API_URL.format(shop_id=shop_id, item_id=item_id, offset=offset)
        ).json()

        ratings = data["data"]["ratings"]
        for review in ratings:
            reviews.append(review["comment"])

        # A full page holds 20 reviews; a shorter (or empty) page is the last one
        if len(ratings) < 20:
            break

        offset += 20
        if offset >= LIMIT:
            break

    df = pd.DataFrame(reviews, columns=["comment"])

    return df


# Clean
def clean(df):
    df = df.dropna().copy().reset_index(drop=True)  # drop rows with missing comments
    df = df[df["comment"] != ""].reset_index(drop=True)  # remove empty reviews
    df["comment"] = df["comment"].apply(clean_text)  # clean review text
    df = df[df["comment"] != ""].reset_index(drop=True)  # remove reviews left empty after cleaning
    return df


def clean_text(text):
    text = uni.normalize("NFKD", text)  # normalise characters
    text = emoji.replace_emoji(text, "")  # remove emoji
    text = re.sub(r"(\w)\1{2,}", r"\1", text)  # repeated chars
    text = re.sub(r"[ ]+", " ", text).strip()  # remove extra spaces
    return text
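# Illustrative example of the cleaning steps above:
# "Baguuuus bangettt 👍👍"  ->  "Bagus banget"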


# LLM
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)

# Embeddings
embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")
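# LaBSE is a multilingual sentence-embedding model, so the (mostly Indonesian)
# review text can be embedded without translation.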

cache_URL = ""
db = None
qa = None
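# The vector store and QA chain are cached per product URL (cache_URL) so that
# repeated questions about the same product do not re-scrape and re-index reviews.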


def generate(URL, query):
    global cache_URL, db, qa
    if URL != cache_URL:
        # Get reviews
        try:
            reviews = scrape(URL)
            if reviews is None:
                return "Could not find a Shopee product ID in the given URL."
            # Clean reviews
            cleaned_reviews = clean(reviews)
            # Load data
            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
        except Exception as e:
            return "Error getting reviews: " + str(e)

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=50
        )
        docs = text_splitter.split_documents(documents)
        cache_URL = URL
        # Vector store
        db = FAISS.from_documents(docs, embeddings)
        # Chain to answer questions
        qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
    if qa is None:
        return "No reviews have been loaded yet; please enter a valid Shopee product URL."
    return qa.run(query)


# Gradio
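# The UI strings below are in Indonesian. English translations:
#   "URL Produk" / "URL produk dari Shopee Indonesia" -> "Product URL" / "Product URL from Shopee Indonesia"
#   "Kueri" -> "Query"; "Jawaban" -> "Answer"
#   Query placeholder/examples -> "What do people say about the product's quality?",
#     "What do people who are less satisfied with the product think?"
#   Description -> "A chatbot that can summarize product reviews on Shopee Indonesia
#     (https://shopee.co.id/). Please be patient, this bot can take a while to fetch
#     reviews from Shopee and prepare its answer."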
product_box = gr.Textbox(
    label="URL Produk", placeholder="URL produk dari Shopee Indonesia"
)
query_box = gr.Textbox(
    lines=2,
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
)

gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Shopee Indonesia (https://shopee.co.id/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Shopee dan menyiapkan jawabannya.",
    allow_flagging="never",
    examples=[
        [
            "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe",
            "Apa yang orang katakan tentang kualitas produknya?",
        ],
        [
            "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe",
            "Bagaimana pendapat orang yang kurang puas dengan produknya?",
        ],
    ],
).launch()