kensvin committed on
Commit
8be6571
·
1 Parent(s): 1b16ea0
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +141 -0
  3. requirements.txt +0 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Ringkas Ulas
3
  emoji: 🐢
4
- colorFrom: blue
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 3.47.1
8
  app_file: app.py
 
1
  ---
2
  title: Ringkas Ulas
3
  emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.47.1
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ import pandas as pd
4
+ import unicodedata as uni
5
+ import emoji
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.document_loaders import DataFrameLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.vectorstores import FAISS
11
+ from langchain.chains import RetrievalQA
12
+ import gradio as gr
13
+
14
# Shopee Indonesia ratings endpoint. `item_id`, `shop_id` and `offset` are
# filled in per request; the page size is fixed at 20 by `limit=20`.
SHOPEE_API_URL = (
    "https://shopee.co.id/api/v2/item/get_ratings"
    "?filter=0&flag=1&itemid={item_id}&limit=20&offset={offset}"
    "&shopid={shop_id}&type=0"
)

# Module-level placeholders (NOTE(review): no code visible in this file reads
# them; the functions below use their own local variables of the same names).
shop_id = ""
item_id = ""
item = {}

# Upper bound on the number of reviews fetched per product, so that
# processing does not take too long.
LIMIT = 1000
19
+
20
+
21
def get_product_id(URL):
    """Extract the shop id and item id from a Shopee product URL.

    Args:
        URL: Product URL containing an ``i.<shop_id>.<item_id>`` segment, e.g.
            ``https://shopee.co.id/Some-Product-i.2392232.8965506``.

    Returns:
        Tuple ``(shop_id, item_id)``; both are strings of digits.

    Raises:
        ValueError: If ``URL`` contains no ``i.<shop_id>.<item_id>`` segment.
        TypeError: If ``URL`` is not a string (raised by ``re.search``).
    """
    match = re.search(r"i\.(\d+)\.(\d+)", URL)
    if match is None:
        # Fail with a clear message instead of the opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            f"No 'i.<shop_id>.<item_id>' segment found in URL: {URL!r}"
        )
    return match[1], match[2]
26
+
27
+
28
def scrape(URL):
    """Download review comments for the Shopee product at ``URL``.

    Pages through the ratings endpoint 20 reviews at a time and stops at the
    first short (or empty) page, or once ``LIMIT`` reviews have been requested.

    Args:
        URL: Product page URL containing an ``i.<shop_id>.<item_id>`` segment.

    Returns:
        A DataFrame with a single ``comment`` column, one row per review
        (possibly zero rows), or ``None`` when no shop/item id can be
        extracted from ``URL``.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
    """
    try:
        shop_id, item_id = get_product_id(URL)
    except (TypeError, ValueError):
        # URL is not a string, or has no "i.<shop_id>.<item_id>" segment.
        return None

    page_size = 20  # must match the `limit=20` hard-coded in SHOPEE_API_URL
    reviews = []
    for offset in range(0, LIMIT, page_size):
        # Get JSON data using shop_id and item_id from input URL
        response = requests.get(
            SHOPEE_API_URL.format(shop_id=shop_id, item_id=item_id, offset=offset),
            timeout=30,  # never hang the UI forever on a stalled connection
        )
        response.raise_for_status()
        payload = response.json()

        # Guard against a null "data" or "ratings" field so that a product
        # without (further) reviews ends the loop instead of raising.
        ratings = (payload.get("data") or {}).get("ratings") or []
        reviews.extend(review["comment"] for review in ratings)

        # A page shorter than page_size is the last one.
        if len(ratings) < page_size:
            break

    return pd.DataFrame(reviews, columns=["comment"])
56
+
57
+
58
+ # Clean
59
def clean(df):
    """Return a cleaned copy of the review frame.

    Rows whose comment is missing or empty are dropped, every remaining
    comment is passed through ``clean_text``, and rows that became empty
    after cleaning are dropped as well. The result has a fresh 0..n-1 index.
    """
    present = df.dropna().copy()  # reviews without a comment are discarded
    has_text = present["comment"] != ""
    kept = present[has_text].reset_index(drop=True)

    kept["comment"] = kept["comment"].apply(clean_text)

    # Cleaning can reduce a comment to nothing (e.g. emoji-only reviews).
    still_has_text = kept["comment"] != ""
    return kept[still_has_text].reset_index(drop=True)
65
+
66
+
67
def clean_text(text):
    """Normalise a single review comment.

    Steps, in order: Unicode NFKD normalisation, emoji removal, collapsing of
    letters repeated three or more times, and squeezing of repeated spaces
    (the result is also stripped of leading/trailing whitespace).

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string; may be empty.
    """
    text = uni.normalize("NFKD", text)  # normalise characters
    text = emoji.replace_emoji(text, "")  # remove emoji
    # Collapse runs of 3+ identical characters ("bagusss" -> "bagus").
    # Digits are excluded on purpose: with a plain \w, a number such as
    # "1000" would be rewritten to "10", corrupting prices and quantities.
    text = re.sub(r"([^\W\d])\1{2,}", r"\1", text)
    text = re.sub(r"[ ]+", " ", text).strip()  # remove extra spaces
    return text
73
+
74
+
75
# Chat model handed to the question-answering chain
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(temperature=0.1, model=OpenAIModel)

# Embedding model used when building the vector store
embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")

# State shared between generate() calls: the last product URL processed,
# plus the vector store and QA chain built for it.
cache_URL, db, qa = "", None, None
85
+
86
+
87
def generate(URL, query):
    """Answer ``query`` about the reviews of the Shopee product at ``URL``.

    The vector store and QA chain are rebuilt only when ``URL`` differs from
    the last successfully processed one (or nothing has been built yet);
    otherwise the cached chain is reused.

    Args:
        URL: Shopee product URL.
        query: Question to ask about the product's reviews.

    Returns:
        The chain's answer, or a string starting with
        ``"Error getting reviews: "`` when the reviews could not be loaded.
    """
    global cache_URL, db, qa
    if URL != cache_URL or qa is None:
        try:
            # Get reviews
            reviews = scrape(URL)
            if reviews is None:
                return (
                    "Error getting reviews: "
                    "no shop id and item id found in the product URL"
                )
            # Clean reviews
            cleaned_reviews = clean(reviews)
            if cleaned_reviews.empty:
                # An empty document list cannot be indexed; report it here
                # instead of failing later while building the vector store.
                return "Error getting reviews: no review text found for this product"
            # Load data
            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
        except Exception as e:
            return "Error getting reviews: " + str(e)

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=50
        )
        docs = text_splitter.split_documents(documents)
        # Vector store
        new_db = FAISS.from_documents(docs, embeddings)
        # Chain to answer questions
        new_qa = RetrievalQA.from_chain_type(llm=llm, retriever=new_db.as_retriever())
        # Publish the cache only after everything was built, so a failure
        # above never leaves cache_URL pointing at a stale or missing chain.
        db, qa, cache_URL = new_db, new_qa, URL
    return qa.run(query)
112
+
113
+
114
# Gradio: input widgets
product_box = gr.Textbox(
    placeholder="URL produk dari Shopee Indonesia",
    label="URL Produk",
)
query_box = gr.Textbox(
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
    lines=2,
)

# Both example rows use the same product URL with a different question.
_EXAMPLE_URL = "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe"
_EXAMPLES = [
    [_EXAMPLE_URL, "Apa yang orang katakan tentang kualitas produknya?"],
    [_EXAMPLE_URL, "Bagaimana pendapat orang yang kurang puas dengan produknya?"],
]

# Build the interface around generate() and start serving it.
demo = gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Shopee Indonesia (https://shopee.co.id/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Shopee dan menyiapkan jawabannya.",
    allow_flagging="never",
    examples=_EXAMPLES,
)
demo.launch()
requirements.txt ADDED
Binary file (228 Bytes). View file