kensvin committed on
Commit
cd4d249
1 Parent(s): 0e6112b

Switch e-commerce platform

Browse files
Files changed (3) hide show
  1. app.py +125 -141
  2. requirements.txt +0 -0
  3. tokopedia.py +126 -0
app.py CHANGED
@@ -1,141 +1,125 @@
1
- import re
2
- import requests
3
- import pandas as pd
4
- import unicodedata as uni
5
- import emoji
6
- from langchain.chat_models import ChatOpenAI
7
- from langchain.embeddings import HuggingFaceEmbeddings
8
- from langchain.document_loaders import DataFrameLoader
9
- from langchain.text_splitter import RecursiveCharacterTextSplitter
10
- from langchain.vectorstores import FAISS
11
- from langchain.chains import RetrievalQA
12
- import gradio as gr
13
-
14
- SHOPEE_API_URL = """https://shopee.co.id/api/v2/item/get_ratings?filter=0&flag=1&itemid={item_id}&limit=20&offset={offset}&shopid={shop_id}&type=0"""
15
- shop_id = ""
16
- item_id = ""
17
- item = {}
18
- LIMIT = 1000 # Limit to 1000 reviews so that processing does not take too long
19
-
20
-
21
- def get_product_id(URL):
22
- # Get shop id and item id from input URL
23
- r = re.search(r"i\.(\d+)\.(\d+)", URL)
24
- shop_id, item_id = r[1], r[2]
25
- return shop_id, item_id
26
-
27
-
28
- def scrape(URL):
29
- try:
30
- shop_id, item_id = get_product_id(URL)
31
- except:
32
- return None
33
-
34
- offset = 0
35
- reviews = []
36
- while True:
37
- # Get JSON data using shop_id and item_id from input URL
38
- data = requests.get(
39
- SHOPEE_API_URL.format(shop_id=shop_id, item_id=item_id, offset=offset)
40
- ).json()
41
-
42
- i = 1
43
- for i, review in enumerate(data["data"]["ratings"], 1):
44
- reviews.append(review["comment"])
45
-
46
- if i % 20:
47
- break
48
-
49
- offset += 20
50
- if offset >= LIMIT:
51
- break
52
-
53
- df = pd.DataFrame(reviews, columns=["comment"])
54
-
55
- return df
56
-
57
-
58
- # Clean
59
- def clean(df):
60
- df = df.dropna().copy().reset_index(drop=True) # drop reviews with empty comments
61
- df = df[df["comment"] != ""].reset_index(drop=True) # remove empty reviews
62
- df["comment"] = df["comment"].apply(lambda x: clean_text(x)) # clean text
63
- df = df[df["comment"] != ""].reset_index(drop=True) # remove empty reviews
64
- return df
65
-
66
-
67
- def clean_text(text):
68
- text = uni.normalize("NFKD", text) # normalise characters
69
- text = emoji.replace_emoji(text, "") # remove emoji
70
- text = re.sub(r"(\w)\1{2,}", r"\1", text) # repeated chars
71
- text = re.sub(r"[ ]+", " ", text).strip() # remove extra spaces
72
- return text
73
-
74
-
75
- # LLM
76
- OpenAIModel = "gpt-3.5-turbo"
77
- llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)
78
-
79
- # Embeddings
80
- embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")
81
-
82
- cache_URL = ""
83
- db = None
84
- qa = None
85
-
86
-
87
- def generate(URL, query):
88
- global cache_URL, db, qa
89
- if URL != cache_URL:
90
- # Get reviews
91
- try:
92
- reviews = scrape(URL)
93
- # Clean reviews
94
- cleaned_reviews = clean(reviews)
95
- # Load data
96
- loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
97
- documents = loader.load()
98
- except Exception as e:
99
- return "Error getting reviews: " + str(e)
100
-
101
- # Split text
102
- text_splitter = RecursiveCharacterTextSplitter(
103
- chunk_size=1000, chunk_overlap=50
104
- )
105
- docs = text_splitter.split_documents(documents)
106
- cache_URL = URL
107
- # Vector store
108
- db = FAISS.from_documents(docs, embeddings)
109
- # Chain to answer questions
110
- qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
111
- return qa.run(query)
112
-
113
-
114
- # Gradio
115
- product_box = gr.Textbox(
116
- label="URL Produk", placeholder="URL produk dari Shopee Indonesia"
117
- )
118
- query_box = gr.Textbox(
119
- lines=2,
120
- label="Kueri",
121
- placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
122
- )
123
-
124
- gr.Interface(
125
- fn=generate,
126
- inputs=[product_box, query_box],
127
- outputs=gr.Textbox(label="Jawaban"),
128
- title="RingkasUlas",
129
- description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Shopee Indonesia (https://shopee.co.id/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Shopee dan menyiapkan jawabannya.",
130
- allow_flagging="never",
131
- examples=[
132
- [
133
- "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe",
134
- "Apa yang orang katakan tentang kualitas produknya?",
135
- ],
136
- [
137
- "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe",
138
- "Bagaimana pendapat orang yang kurang puas dengan produknya?",
139
- ],
140
- ],
141
- ).launch()
 
1
# NOTE: runtime package installation removed — "!pip install ..." is
# IPython/notebook-only syntax and is a SyntaxError in a plain .py module.
# Dependencies belong in requirements.txt.
import re
from urllib.parse import urlparse, parse_qs

import pandas as pd
import unicodedata as uni
import emoji
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr

from tokopedia import request_product_id, request_product_review

shop_id = ""
item_id = ""
item = {}
LIMIT = 1000  # Limit to 1000 reviews so that processing does not take too long
20
+
21
def scrape(URL, max_reviews=LIMIT):
    """Collect reviews for a Tokopedia product.

    Parameters
    ----------
    URL : str
        Tokopedia product URL; the last two path segments are assumed to be
        the shop domain and the product key
        (e.g. https://www.tokopedia.com/<shop>/<product-key>).
    max_reviews : int, optional
        Hard cap on the number of reviews collected (default ``LIMIT``).

    Returns
    -------
    pandas.DataFrame
        Flattened review records with the review text in a ``comment`` column.
    """
    parsed_url = urlparse(URL)
    # Last two path segments: shop domain and product key.
    *_, SHOP, PRODUCT_KEY = parsed_url.path.split("/")
    product_id = request_product_id(SHOP, PRODUCT_KEY).json()["data"]["pdpGetLayout"][
        "basicInfo"
    ]["id"]

    all_reviews = []
    page = 1
    has_next = True
    # Bug fix: the previous condition (<=) fetched one page past the cap and
    # never truncated, so the result could exceed max_reviews by up to a page.
    while has_next and len(all_reviews) < max_reviews:
        response = request_product_review(product_id, page=page)
        data = response.json()["data"]["productrevGetProductReviewList"]
        all_reviews.extend(data["list"])
        has_next = data["hasNext"]
        page += 1

    reviews_df = pd.json_normalize(all_reviews[:max_reviews])
    # Bug fix: the GraphQL schema exposes the review text as "message", but the
    # rest of the pipeline (clean(), DataFrameLoader) reads a "comment" column;
    # without this rename every downstream step raised KeyError.
    reviews_df = reviews_df.rename(columns={"message": "comment"})
    return reviews_df
41
+
42
# Clean
def clean(df):
    """Return a copy of *df* keeping only non-empty, cleaned review comments."""
    # Discard rows with missing values, then rows whose comment is blank.
    cleaned = df.dropna().copy().reset_index(drop=True)
    cleaned = cleaned[cleaned["comment"] != ""].reset_index(drop=True)
    # Normalise each comment, then drop any that became empty after cleaning.
    cleaned["comment"] = cleaned["comment"].apply(lambda x: clean_text(x))
    cleaned = cleaned[cleaned["comment"] != ""].reset_index(drop=True)
    return cleaned
49
+
50
+
51
def clean_text(text):
    """Normalise unicode, strip emoji, and squash repeated chars/spaces."""
    normalised = uni.normalize("NFKD", text)          # normalise characters
    without_emoji = emoji.replace_emoji(normalised, "")  # remove emoji
    # Collapse any character repeated 3+ times down to a single occurrence.
    deduped = re.sub(r"(\w)\1{2,}", r"\1", without_emoji)
    # Collapse runs of spaces and trim the ends.
    return re.sub(r"[ ]+", " ", deduped).strip()
57
+
58
+
59
# LLM
# Chat model used by the RetrievalQA chain; low temperature keeps the
# summaries close to the retrieved review text.
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)

# Embeddings
# NOTE(review): LaBSE is a multilingual sentence embedding model — presumably
# chosen because the reviews are in Indonesian; confirm before swapping it out.
embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")

# One-entry cache shared with generate(): the vector store (db) and QA chain
# (qa) are rebuilt only when the requested product URL changes.
cache_URL = ""
db = None
qa = None
69
+
70
+
71
def generate(URL, query):
    """Answer *query* about the product at *URL* using its scraped reviews.

    Builds (and caches, one entry) a FAISS vector store and RetrievalQA chain
    per product URL. Returns the chain's answer string, or an error message
    string if scraping/indexing fails.
    """
    global cache_URL, db, qa
    if URL != cache_URL:
        try:
            # Get reviews
            reviews = scrape(URL)
            # Clean reviews
            cleaned_reviews = clean(reviews)
            # Load data
            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
            # Split text
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=50
            )
            docs = text_splitter.split_documents(documents)
            # Vector store
            db = FAISS.from_documents(docs, embeddings)
            # Chain to answer questions
            qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
            # Bug fix: commit the cache only AFTER db/qa are fully rebuilt.
            # Previously cache_URL was set before FAISS construction, so a
            # failure there left the new URL cached against the previous
            # product's index, and later calls answered from the wrong data.
            cache_URL = URL
        except Exception as e:
            return "Error getting reviews: " + str(e)
    return qa.run(query)
96
+
97
+
98
# Gradio
# UI text updated for the platform switch: the scraper now targets Tokopedia,
# but the labels, description and example URLs still referenced Shopee. The
# old Shopee example URLs could never work — scrape() parses Tokopedia-style
# /<shop>/<product-key> paths.
product_box = gr.Textbox(
    label="URL Produk", placeholder="URL produk dari Tokopedia"
)
query_box = gr.Textbox(
    lines=2,
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
)

gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia (https://www.tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
    allow_flagging="never",
    examples=[
        [
            # TODO(review): replace with a known-live Tokopedia product URL.
            "https://www.tokopedia.com/unilever/rinso-anti-noda-deterjen-bubuk-1-8kg",
            "Apa yang orang katakan tentang kualitas produknya?",
        ],
        [
            "https://www.tokopedia.com/unilever/rinso-anti-noda-deterjen-bubuk-1-8kg",
            "Bagaimana pendapat orang yang kurang puas dengan produknya?",
        ],
    ],
).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
tokopedia.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
# GraphQL endpoint used by request_product_review() below.
ENDPOINT = "https://gql.tokopedia.com/graphql/productReviewList"
4
+
5
def request_product_id(shop_domain, product_key):
    """POST a PDPGetLayoutQuery to Tokopedia's GraphQL API.

    Parameters:
        shop_domain: shop segment of a product URL
            (tokopedia.com/<shop_domain>/<product_key>).
        product_key: product segment of the same URL.

    Returns:
        The raw ``requests.Response``. The numeric product ID is found at
        ``response.json()["data"]["pdpGetLayout"]["basicInfo"]["id"]``
        (``productID`` is aliased to ``id`` in the query below).
    """
    endpoint = "https://gql.tokopedia.com/graphql/PDPGetLayoutQuery"
    payload = {
        "operationName": "PDPGetLayoutQuery",
        "variables": {
            "shopDomain": f"{shop_domain}",
            "productKey": f"{product_key}",
            "apiVersion": 1,
        },
        # GraphQL document requesting the product layout; only basicInfo.id is
        # consumed by callers, the rest mirrors the site's own request shape.
        "query": """fragment ProductVariant on pdpDataProductVariant {
  errorCode
  parentID
  defaultChild
  children {
    productID
  }
  __typename
}

query PDPGetLayoutQuery($shopDomain: String, $productKey: String, $layoutID: String, $apiVersion: Float, $userLocation: pdpUserLocation, $extParam: String, $tokonow: pdpTokoNow, $deviceID: String) {
  pdpGetLayout(shopDomain: $shopDomain, productKey: $productKey, layoutID: $layoutID, apiVersion: $apiVersion, userLocation: $userLocation, extParam: $extParam, tokonow: $tokonow, deviceID: $deviceID) {
    requestID
    name
    pdpSession
    basicInfo {
      id: productID
    }
    components {
      name
      type
      position
      data {
        ...ProductVariant
        __typename
      }
      __typename
    }
    __typename
  }
}
""",
    }

    # NOTE(review): the X-TKPD-AKAMAI header presumably satisfies Tokopedia's
    # Akamai bot check — confirm; requests without it may be rejected.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.tokopedia.com",
        "X-TKPD-AKAMAI": "pdpGetLayout",
    }

    return requests.request(method="POST", url=endpoint, json=payload, headers=headers)
55
+
56
+
57
def request_product_review(product_id, page=1, limit=20):
    """POST a productReviewList query to Tokopedia's GraphQL API.

    Parameters:
        product_id: numeric product ID (as returned via request_product_id).
        page: 1-based page index of the review listing.
        limit: number of reviews requested per page (default 20).

    Returns:
        The raw ``requests.Response``. Callers read
        ``json()["data"]["productrevGetProductReviewList"]`` — its ``list``
        holds the reviews (text under ``message``) and ``hasNext`` signals
        whether further pages exist.
    """
    payload = {
        "operationName": "productReviewList",
        "variables": {
            "productID": f"{product_id}",
            "page": page,
            "limit": limit,
            "sortBy": "",
            "filterBy": "",
        },
        # GraphQL document mirroring the site's own review-list request.
        "query": """query productReviewList($productID: String!, $page: Int!, $limit: Int!, $sortBy: String, $filterBy: String) {
  productrevGetProductReviewList(productID: $productID, page: $page, limit: $limit, sortBy: $sortBy, filterBy: $filterBy) {
    productID
    list {
      id: feedbackID
      variantName
      message
      productRating
      reviewCreateTime
      reviewCreateTimestamp
      isReportable
      isAnonymous
      reviewResponse {
        message
        createTime
        __typename
      }
      user {
        userID
        fullName
        image
        url
        __typename
      }
      likeDislike {
        totalLike
        likeStatus
        __typename
      }
      stats {
        key
        formatted
        count
        __typename
      }
      badRatingReasonFmt
      __typename
    }
    shop {
      shopID
      name
      url
      image
      __typename
    }
    hasNext
    totalReviews
    __typename
  }
}
""",
    }

    # NOTE(review): the X-TKPD-AKAMAI header presumably satisfies Tokopedia's
    # Akamai bot check — confirm; requests without it may be rejected.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.tokopedia.com",
        "X-TKPD-AKAMAI": "productReviewList",
    }

    return requests.request(method="POST", url=ENDPOINT, json=payload, headers=headers)