import time
import urllib
import spacy
import pandas as pd
import unicodedata
import requests
import json
import os
import tiktoken
from bs4 import BeautifulSoup
from openai import OpenAI
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
# from langchain.vectorstores.deeplake import DeepLake
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableParallel
from urllib.parse import quote
from urllib.request import Request
class MLSalesPitch:
    """Generate sales pitches for Mercado Livre products.

    The class covers the whole pipeline: downloading product descriptions,
    enriching them with Google result snippets, cleaning and truncating the
    text, loading DeepLake vector stores and prompting a chat model with the
    retrieved context.

    NOTE(review): the reviewed copy of this class had lost its indentation
    and a number of statements (request URLs, prompt placeholders,
    ``else``/``break``/``try`` lines, ``@staticmethod`` decorators, call
    arguments).  They were reconstructed from the surrounding code; every
    spot where a guess was required carries a ``NOTE(review)`` comment and
    must be confirmed against the upstream version.
    """

    # NOTE(review): both base URLs were missing from the source; confirm.
    _ML_API_URL = 'https://api.mercadolibre.com'
    _GOOGLE_SEARCH_URL = 'https://www.google.com/search?q='
    _CATEGORIES_JSON = '/data/ml_categories.json'
    _PRODUCTS_CSV = '/data/mercado_livre_products.csv'
    _ENRICHED_CSV = '/data/mercado_livre_products_enriched_with_google_about.csv'
    # The enriched CSV is flushed to disk every this many products.
    _CHECKPOINT_EVERY = 50

    def __init__(self):
        """Create the OpenAI clients, the chat model and the prompt template.

        Raises:
            KeyError: if the ``OPENAI_KEY`` environment variable is not set.
        """
        api_key = os.environ['OPENAI_KEY']
        self.retriever_sales_pitch = None
        self.retriever_about = None
        # Optional Mercado Livre token; stays None when the variable is unset.
        self.TOKEN_ML = os.environ.get('TOKEN_ML')
        # Mirror the key under the OPENAI_API_KEY name as well.
        os.environ['OPENAI_API_KEY'] = api_key
        self.client = OpenAI(api_key=api_key)
        # The spaCy pipeline is loaded lazily by __text_to_chunks.
        self.nlp = None
        OpenAI.api_key = api_key
        self.output_parser = StrOutputParser()
        self.model = ChatOpenAI(openai_api_key=api_key, model="gpt-3.5-turbo")
        # NOTE(review): the four placeholder lines were missing from the
        # source; they are restored from the keys that generate_sales_pitch
        # feeds into the prompt.  Confirm their position inside the text.
        template = (
            "Com base nas seguintes informações de produtos fornecidas abaixo:\n"
            "{about}\n"
            "Crie um discurso muito convincente e interessante de venda para os seguintes produtos:\n"
            "{products}\n"
            "Que fazem parte da sub_categoria:\n"
            "{sub_category}\n"
            "Pontue bem as vantagens dos produtos e suas caracteristicas bem como a grande oportunidade que o cliente está tendo em adquiri-los\n"
            "Tenha como base os seguintes discursos de venda:\n"
            "{sales_pitch}\n"
            "Não fique preso apenas um discurso de venda. Leve mais em consideração a construção dos discrusos bem como as vantagens, caracterisitcas e descrição dos produtos.\n"
            "Adicione preços para os produtos como variáveis com prefixo _PRECO_\n"
            "Não se identifique e não coloque o nome da empresa\n"
        )
        self.prompt = ChatPromptTemplate.from_template(template)

    def get_ml_product_descriptions(self):
        """Download titles and descriptions for every sub-category.

        Reads the category tree from the categories JSON file, pages through
        the product search of each sub-category (50 per page, offsets below
        1000) and appends the results to the products CSV after every page.
        """
        with open(self._CATEGORIES_JSON, 'r') as f:
            categories_json = json.load(f)
        # Start from an empty frame on the very first run.
        if os.path.exists(self._PRODUCTS_CSV):
            df = pd.read_csv(self._PRODUCTS_CSV)
        else:
            df = pd.DataFrame()
        # Do not send "Bearer None" when no token is configured.
        headers = {'Authorization': f'Bearer {self.TOKEN_ML}'} if self.TOKEN_ML else {}
        limit = 50
        for category in categories_json.values():
            for sub_category in category['children_categories']:
                offset = 0
                while offset < 1000:
                    # NOTE(review): endpoint, site id (MLB) and query
                    # parameters were missing from the source; confirm.
                    ans = requests.get(
                        f"{self._ML_API_URL}/sites/MLB/search",
                        params={'category': sub_category['id'], 'offset': offset, 'limit': limit},
                        headers=headers)
                    if not ans.ok:
                        print(f'FAIL! Error {ans.status_code}: {ans.content}')
                        offset += limit
                        continue
                    data_ans = ans.json()
                    total = int(data_ans['paging']['total'])
                    # Guard against a sub-category that reports zero products.
                    progress = 100.0 * (offset / total) if total else 100.0
                    print(f"[{sub_category['name']}]: {progress}"
                          f" downloaded...")
                    if len(data_ans['results']) == 0:
                        break
                    lt_prod_info = [{'id': info['id'], 'title': info['title']}
                                    for info in data_ans['results']]
                    for info in lt_prod_info:
                        # NOTE(review): URL prefix reconstructed; confirm.
                        resp = requests.get(f"{self._ML_API_URL}/items/{info['id']}/description",
                                            headers=headers)
                        if resp.ok:
                            data_resp = resp.json()
                            if 'plain_text' in data_resp:
                                info['description'] = data_resp['plain_text']
                    df_tmp = pd.DataFrame.from_dict(lt_prod_info)
                    df_tmp['category'] = category['name']
                    df_tmp['sub_category'] = sub_category['name']
                    df = pd.concat([df, df_tmp])
                    df.to_csv(self._PRODUCTS_CSV, header=True, index=False)
                    # Advance by exactly one page; "+ 1" skipped a product.
                    offset += limit

    @staticmethod
    def enrich_with_google_search():
        """Add an ``about`` column built from Google result snippets.

        Every distinct product title is searched once; the snippets found are
        joined with ``', '`` and stored in the rows of that title whose
        ``about`` value is still missing.  The enriched CSV is written
        periodically and once more at the end.
        """
        from urllib.error import HTTPError

        df = pd.read_csv(MLSalesPitch._PRODUCTS_CSV, low_memory=False)
        if 'about' not in df.columns:
            df['about'] = None
        # Keep the column as object so text can be stored in it.
        df['about'] = df['about'].astype(object)
        df.to_csv(MLSalesPitch._ENRICHED_CSV, header=True, index=False)
        # Missing titles cannot be searched for.
        p_names = list(set(df['title'].dropna().to_list()))
        for k, name in enumerate(p_names, start=1):
            url = MLSalesPitch._GOOGLE_SEARCH_URL + quote('sobre ou descrição: ' + str(name))
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            lt_text = []
            try:
                with urllib.request.urlopen(req) as response:
                    content = response.read().decode('UTF-8').replace(u'\xa0', u' ')
                soup = BeautifulSoup(content, 'html.parser')
                div_bs4 = soup.find_all('div', {"class": "BNeawe s3v9rd AP7Wnd"})
                lt_text = [p.get_text() for p in div_bs4]
            except HTTPError as error:
                # urlopen raises on 4xx, so rate limiting is handled here.
                if error.code == 429:
                    retry_after = str(error.headers.get('Retry-After', ''))
                    delay = int(retry_after) if retry_after.isdigit() else 60
                    print(f'Sleeping {delay} seconds...')
                    time.sleep(delay)
                else:
                    print(error)
            except Exception as error:
                # Best effort: a failed lookup must not abort the whole run.
                print(error)
            if lt_text:
                # pandas stores missing values as NaN, so test with isna().
                missing = (df['title'] == name) & df['about'].isna()
                df.loc[missing, 'about'] = ', '.join(lt_text)
            if k % MLSalesPitch._CHECKPOINT_EVERY == 0 or k == len(p_names):
                df.to_csv(MLSalesPitch._ENRICHED_CSV, header=True, index=False)
            print(f'[{k} of {len(p_names)}]: {(k / len(p_names)) * 100.0}% completed')

    def cleans_and_preprocesses_the_data(self) -> pd.DataFrame:
        """Load the enriched CSV and build the texts that get embedded.

        Rows without ``description`` or ``about`` are dropped, the description
        is cleaned, both texts are truncated to the token limit and the rows
        are ordered by description size (largest first).

        Returns:
            A frame with the columns ``title``, ``category``,
            ``sub_category``, ``sales_pitch`` and ``about``.
        """
        def truncate(text):
            return text[0: self.__find_best_position_to_cut(text) + 1]

        df_ml = pd.read_csv(self._ENRICHED_CSV, low_memory=False)
        # Work on a copy so the column assignments below are unambiguous.
        df_ml = df_ml.dropna(subset=['description', 'about']).copy()
        df_ml['description'] = df_ml['description'].astype(str).apply(self.__clean_txt).apply(truncate)
        df_ml['about'] = df_ml['about'].astype(str).apply(truncate)
        df_ml['size'] = df_ml['description'].apply(self.__count_tokens)
        df_ml = df_ml.sort_values(by=['size'], ascending=False).reset_index(drop=True)
        # Vectorised concatenation also works when the frame is empty.
        header = ('Nome do produto:' + df_ml['title'].astype(str)
                  + '\nCategoria do produto:' + df_ml['sub_category'].astype(str))
        df_ml['sales_pitch'] = header + '\nSugestão de como vender:' + df_ml['description']
        df_ml['about'] = header + '\nSobre o produto:' + df_ml['about']
        return df_ml[['title', 'category', 'sub_category', 'sales_pitch', 'about']]

    def embedding(self, df_ml: pd.DataFrame = pd.DataFrame(), add_docs=False):
        """Open the DeepLake stores and create the two retrievers.

        Args:
            df_ml: frame with ``sales_pitch`` and ``about`` columns; only
                read when ``add_docs`` is true.
            add_docs: when true, split the texts of ``df_ml`` into chunks and
                add them to the stores before the retrievers are created.
        """
        docs_sales_pitch = None
        docs_about = None
        if add_docs:
            text_splitter = CharacterTextSplitter(chunk_size=2000, separator='\n', chunk_overlap=0)
            documents_sales_pitch = DataFrameLoader(df_ml, page_content_column="sales_pitch").load()
            documents_about = DataFrameLoader(df_ml, page_content_column="about").load()
            docs_sales_pitch = text_splitter.split_documents(documents_sales_pitch)
            # The "about" chunks were previously built from the sales pitches.
            docs_about = text_splitter.split_documents(documents_about)
        embeddings = HuggingFaceEmbeddings()
        from langchain.vectorstores.deeplake import DeepLake
        # A read-only store cannot take new documents.
        read_only = not add_docs
        vector_store_sales_pitch = DeepLake(
            dataset_path="data/my_deeplake/sales_pitch/", embedding_function=embeddings, read_only=read_only)
        vector_store_about = DeepLake(
            dataset_path="data/my_deeplake/about/", embedding_function=embeddings, read_only=read_only)
        if add_docs:
            # NOTE(review): the body of this branch was missing from the
            # source; adding the split documents is the presumed intent.
            vector_store_sales_pitch.add_documents(docs_sales_pitch)
            vector_store_about.add_documents(docs_about)
        self.retriever_sales_pitch = vector_store_sales_pitch.as_retriever()
        self.retriever_about = vector_store_about.as_retriever()

    def generate_sales_pitch(self, query: dict) -> str:
        """Generate a sales pitch for ``query``.

        Args:
            query: mapping with the keys ``products`` and ``sub_category``.

        Returns:
            The text produced by the chat model.

        Raises:
            RuntimeError: if :meth:`embedding` has not been called yet.
        """
        if self.retriever_sales_pitch is None or self.retriever_about is None:
            raise RuntimeError('call embedding() before generate_sales_pitch()')
        chain = RunnableParallel({
            # Only the best match of each store is handed to the prompt.
            "sales_pitch": lambda x: self.retriever_sales_pitch.get_relevant_documents(x["products"])[0:1],
            "about": lambda x: self.retriever_about.get_relevant_documents(x["products"])[0:1],
            "products": lambda x: x["products"],
            "sub_category": lambda x: x["sub_category"]
        }) | self.prompt | self.model | self.output_parser
        return chain.invoke(query)

    @staticmethod
    def __count_tokens(text):
        """Return the number of gpt-3.5-turbo tokens in ``text``."""
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        return len(encoding.encode(text))

    def __text_to_chunks(self, text):
        """Group the sentences of ``text`` into chunks of about 2000 words.

        Returns:
            A list of chunks, each one a list of sentence strings.  Empty
            text yields ``[[]]``.
        """
        if self.nlp is None:
            # Model name taken from the disabled loader in __init__.
            self.nlp = spacy.load("pt_core_news_sm")
        chunks = [[]]
        chunk_total_words = 0
        for sentence in self.nlp(text).sents:
            words = len(sentence.text.split(" "))
            chunk_total_words += words
            # Open a new chunk, but never leave an empty one behind.
            if chunk_total_words > 2000 and chunks[-1]:
                chunks.append([])
                chunk_total_words = words
            chunks[-1].append(sentence.text)
        return chunks

    def __chat_gpt_summarize(self, text):
        """Ask the completions endpoint for a summary of ``text``."""
        prompt = f"Resuma o seguinte texto em no máximo 5 frases:\n{text}"
        # NOTE(review): the call arguments were missing from the source;
        # model name and max_tokens are placeholders to be confirmed.
        response = self.client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=256,
        )
        return response.choices[0].text

    def __summarize_text(self, text):
        """Summarise ``text`` chunk by chunk and join the partial summaries."""
        chunk_summaries = [
            self.__chat_gpt_summarize(" ".join(chunk))
            for chunk in self.__text_to_chunks(text)
            if chunk  # nothing to summarise in an empty chunk
        ]
        return " ".join(chunk_summaries)

    def __find_best_position_to_cut(self, text, max_tokens=1000):
        """Return the last index to keep so the prefix fits ``max_tokens``.

        Binary search over the prefix length; assumes the token count does
        not decrease as the prefix grows.

        Returns:
            The largest ``i`` for which ``text[:i + 1]`` has at most
            ``max_tokens`` tokens, i.e. ``len(text) - 1`` for short texts and
            ``-1`` for an empty text.
        """
        lo = 0
        hi = len(text) - 1
        best = -1
        while lo <= hi:
            mid = (lo + hi) >> 1
            if self.__count_tokens(text[0:mid + 1]) <= max_tokens:
                best = mid
                lo = mid + 1
            else:
                hi = mid - 1
        return best

    @staticmethod
    def __clean_txt(txt):
        """Collapse repeated newlines, dashes, spaces and underscores."""
        replacements = (
            ('\n\n', '\n'),
            ('--', '-'),
            # Two spaces: a one-space pattern would never terminate.
            ('  ', ' '),
            ('__', '_'),
            ('\n_\n', '\n'),
            ('\n \n', '\n'),
        )
        for old, new in replacements:
            # Repeat until stable: one pass can leave new adjacent pairs.
            while old in txt:
                txt = txt.replace(old, new)
        return txt