File size: 4,475 Bytes
8be6571 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import re
import requests
import pandas as pd
import unicodedata as uni
import emoji
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr
# URL template for the review-listing request. ``scrape`` fills in
# ``item_id``, ``shop_id`` and ``offset``; the page size is fixed by the
# ``limit=20`` query parameter.
# NOTE(review): the template has no scheme or host, so it looks truncated --
# confirm the full endpoint before relying on it.
SHOPEE_API_URL = """{item_id}&limit=20&offset={offset}&shopid={shop_id}&type=0"""
# Module-level placeholders. The functions visible here bind their own local
# ``shop_id`` / ``item_id`` and do not read these.
shop_id = ""
item_id = ""
item = {}
# ``scrape`` stops paging once its request offset reaches this value.
LIMIT = 1000 # Limit to 1000 reviews so that processing does not take too long
def get_product_id(URL):
    """Extract the shop id and item id from a product URL.

    The ids are taken from the first ``i.<shop_id>.<item_id>`` fragment
    found anywhere in ``URL``.

    Args:
        URL: String containing the ``i.<digits>.<digits>`` fragment.

    Returns:
        A ``(shop_id, item_id)`` tuple of digit strings.

    Raises:
        ValueError: If ``URL`` contains no ``i.<digits>.<digits>`` fragment.
    """
    match = re.search(r"i\.(\d+)\.(\d+)", URL)
    if match is None:
        # Fail with a readable message instead of the opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(f"Could not find shop and item ids in URL: {URL!r}")
    shop_id, item_id = match[1], match[2]
    return shop_id, item_id
def scrape(URL):
    """Download the review comments of the product behind ``URL``.

    Requests the ratings endpoint page by page (20 reviews per request)
    until a page comes back short or the request offset reaches ``LIMIT``.

    Args:
        URL: Product URL understood by ``get_product_id``.

    Returns:
        A DataFrame with a single ``"comment"`` column, one row per review,
        or ``None`` when no shop/item id can be extracted from ``URL``.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        KeyError, TypeError: If the response JSON does not have the
            ``data -> ratings`` structure.
    """
    try:
        shop_id, item_id = get_product_id(URL)
    except (TypeError, ValueError):
        # Unparseable URL: report "no result" rather than raising.
        return None

    page_size = 20  # must match the ``limit=20`` baked into SHOPEE_API_URL
    offset = 0
    reviews = []
    while True:
        # Get JSON data using shop_id and item_id from input URL
        response = requests.get(
            SHOPEE_API_URL.format(shop_id=shop_id, item_id=item_id, offset=offset),
            timeout=30,  # never hang the UI on a stalled connection
        )
        response.raise_for_status()
        ratings = response.json()["data"]["ratings"]

        # NOTE(review): the line that collected each review was missing from
        # this block as received; the ``"comment"`` key is inferred from the
        # ``columns=["comment"]`` frame built below -- confirm against the
        # API payload.
        reviews.extend(rating["comment"] for rating in ratings)

        # A short page means there is nothing left to fetch.
        if len(ratings) < page_size:
            break

        offset += page_size
        if offset >= LIMIT:
            break

    return pd.DataFrame(reviews, columns=["comment"])
# Clean
def clean(df):
    """Drop empty reviews and normalise the text of the remaining ones.

    Args:
        df: DataFrame with a ``"comment"`` column of review strings
            (missing values allowed).

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index that keeps only the
        rows whose cleaned comment is non-empty. The input is not modified.
    """
    # Only a missing *comment* disqualifies a review; a NaN in some other
    # column must not discard the row.
    df = df.dropna(subset=["comment"]).copy()
    df["comment"] = df["comment"].apply(clean_text)
    # ``clean_text`` strips whitespace and emoji, so comments that were blank
    # or became blank are all empty strings at this point.
    df = df[df["comment"] != ""].reset_index(drop=True)
    return df
def clean_text(text):
    """Normalise a single review comment.

    Steps, in order: Unicode NFKD normalisation, emoji removal, collapsing
    of letters repeated three or more times ("bagusss" -> "bagus"), and
    squeezing runs of spaces plus trimming both ends.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string; may be empty.
    """
    text = uni.normalize("NFKD", text)  # normalise characters
    text = emoji.replace_emoji(text, "")  # remove emoji
    # Collapse elongated *letters* only. Matching ``\w`` also squeezed digits,
    # which turned quantities and prices such as "1000" into "10".
    text = re.sub(r"([^\W\d_])\1{2,}", r"\1", text)
    text = re.sub(r"[ ]+", " ", text).strip()  # remove extra spaces
    return text
# Chat model handed to the RetrievalQA chain in ``generate``; it is created
# once at import time with temperature 0.1.
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)
# Embedding model passed to ``FAISS.from_documents`` in ``generate``; also
# created once at import time.
embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")
# State shared between ``generate`` calls (declared ``global`` there).
cache_URL = ""  # compared with the incoming URL to decide whether to rebuild
db = None  # FAISS vector store assigned by ``generate``
qa = None  # RetrievalQA chain assigned by ``generate``
def generate(URL, query):
    """Answer ``query`` from the reviews of the product at ``URL``.

    The reviews are scraped, cleaned, split and indexed only when ``URL``
    differs from the one indexed by the previous call (or when nothing has
    been indexed yet); otherwise the cached chain is reused.

    Args:
        URL: Product URL passed to ``scrape``.
        query: Question to ask about the reviews.

    Returns:
        The chain's answer, or a string starting with
        ``"Error getting reviews: "`` when scraping, cleaning or loading the
        reviews fails.
    """
    global cache_URL, db, qa
    # ``qa is None`` covers the first call: an empty URL equals the initial
    # ``cache_URL`` and would otherwise skip the build and crash below.
    if qa is None or URL != cache_URL:
        try:
            # Get reviews
            reviews = scrape(URL)
            # Clean reviews
            cleaned_reviews = clean(reviews)
            # Load data
            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
        except Exception as e:
            return "Error getting reviews: " + str(e)

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=50
        )
        docs = text_splitter.split_documents(documents)

        # Vector store
        new_db = FAISS.from_documents(docs, embeddings)
        # Chain to answer questions
        new_qa = RetrievalQA.from_chain_type(llm=llm, retriever=new_db.as_retriever())

        # Publish the cache only after every step succeeded, so a failed
        # build can never leave a stale chain associated with the new URL.
        db, qa, cache_URL = new_db, new_qa, URL

    # NOTE(review): the return statement was missing from this block as
    # received; ``RetrievalQA.run`` is the single-input entry point of the
    # legacy LangChain API imported by this file -- confirm against the
    # installed version.
    return qa.run(query)
# Gradio
# Input widgets for the Gradio UI: the product URL and the user's question.
# Both calls were missing their closing parenthesis; all strings are unchanged.
product_box = gr.Textbox(
    label="URL Produk",
    placeholder="URL produk dari Shopee Indonesia",
)
query_box = gr.Textbox(
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
)
inputs=[product_box, query_box],
description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Shopee Indonesia ( Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Shopee dan menyiapkan jawabannya.",
"Apa yang orang katakan tentang kualitas produknya?",
"Bagaimana pendapat orang yang kurang puas dengan produknya?",