# climatechat / app.py
import streamlit as st
from dotenv import load_dotenv
import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS, Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
import pandas as pd
import glob
import os
# vector DB locations: load the existing index, save rebuilt indexes separately
dirload = '24feb24-openaiv2'
dirsave = "terbaru"
# embeddings (OpenAIEmbeddings reads OPENAI_API_KEY from the environment)
embeddings = OpenAIEmbeddings()
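# NOTE: the app always answers from the index in `dirload`; the
# "Re-Processing New Data" button below writes a fresh index to `dirsave`,
# so a rebuilt index is only used once `dirload` points at that directory.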
def import_text_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        return text
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return ""
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""
# list all PDF files in a directory
def list_pdf_files_and_save_titles(folder_path):
    pdf_file_titles = []
    try:
        files = os.listdir(folder_path)
        pdf_files = [file for file in files if file.lower().endswith('.pdf')]
        for pdf_file in pdf_files:
            pdf_file_titles.append(pdf_file)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return pdf_file_titles
# read a PDF and return its concatenated page text
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n"
    return text
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks
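# NOTE: with separator=" ", CharacterTextSplitter splits the text on spaces
# and merges the pieces back into ~1000-character chunks with a 200-character
# overlap, so neighbouring chunks share context when embedded and retrieved.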
def get_vectorstore(text_chunks):
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # NOTE: text_chunks is currently unused; this loads the persisted Chroma
    # index from dirload instead of building a new index from the chunks.
    vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings)
    return vectorstore
def get_conversation_chain(vectorstore):
    llm = ChatOpenAI()
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain
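# NOTE: ConversationalRetrievalChain first condenses the running chat history
# plus the new question into a standalone query, retrieves matching chunks
# from the vector store, and then answers with the LLM; ConversationBufferMemory
# accumulates the exchange under the 'chat_history' key used by the templates.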
def handle_userinput(user_question):
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    # chat_history alternates user/bot turns, so even indices are the user
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
def main():
    load_dotenv()
    # st.set_page_config must be the first Streamlit call on the page
    st.set_page_config(page_title="Selamat Datang Di Indonesian Climate Bot",
                       page_icon=":sun_behind_rain_cloud:")
    st.write(css, unsafe_allow_html=True)
    # build the conversation chain once per session so its memory survives reruns
    if "conversation" not in st.session_state:
        # load the persisted vector store
        vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings)
        st.session_state.conversation = get_conversation_chain(vectorstore)
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    st.header("Indonesian Climate Chatbot :sun_behind_rain_cloud:")
    user_question = st.text_input("Tanyakan padaku seputar perubahan iklim:")
    if user_question:
        handle_userinput(user_question)
    with st.sidebar:
        st.header(":blue[Jumlah Dokumen dan Berita]")
        banyakDokumen = import_text_file("banyakdokumen.txt")
        banyakBerita = import_text_file("banyakberita.txt")
        # show the regulation document titles
        with open("file_titles.txt", "r") as file:
            my_list = file.readlines()  # reads all lines into a list
        # strip trailing newlines
        file_titles = [item.strip() for item in my_list]
        # list the PDF files in use
        with st.container(height=300):
            s = ''
            for i in file_titles:
                s += "- " + i + "\n"
            st.markdown(s)
        st.write("jumlah dokumen regulasi: " + ":green[{}]".format(banyakDokumen))
        st.write("jumlah dokumen berita: " + ":green[{}]".format(banyakBerita))
        # st.subheader("Your documents")
        # pdf_docs = st.file_uploader(
        #     "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        # if st.button("Process"):
        #     with st.spinner("Processing"):
        #         # get pdf text
        #         raw_text = get_pdf_text(pdf_docs)
        #         # get the text chunks
        #         text_chunks = get_text_chunks(raw_text)
        #         # create vector store
        #         vectorstore = get_vectorstore(text_chunks)
        #         # create conversation chain
        #         st.session_state.conversation = get_conversation_chain(
        #             vectorstore)
if st.button("Re-Processing New Data"):
with st.spinner("Processing..."):
# BERITA
# Find a CSV files in the directory
sumber = glob.glob("berita/*.csv")
df = pd.read_csv(sumber[0])
banyakBerita = len(df)
print("sumber berita ditemukan")
#update banyak berita txt
with open("banyakBerita.txt", "w") as file:
file.write(str(banyakBerita))
print("update file text berita berhasil")
#combining and converting
df["combined"] = ""
for row in range(len(df)):
kombinasi = "berita ke-" + str(row+1) + " \n " + "judul: " + str(df['title'].loc[row]) + " \n " + "link: "+ str(df['url'].loc[row]) + " \n " + "tanggal rilis: " + str(df['datetime'].loc[row]) + " \n " + "penulis: " + str(df['author'].loc[row]) + " \n " + "isi berita: " + str(df['text'].loc[row]) + " \n " + "sumber: " + str(df['source'].loc[row]) + " \n "
df['combined'].loc[row] = kombinasi
listberita = df["combined"].tolist()
textberita = " ".join(listberita)
print("combining and converting berhasil")
# directory ke pdf regulasi
folder_path = 'pdf/'
file_titles = list_pdf_files_and_save_titles(folder_path)
banyakDokumen = len(file_titles)
#saving the file titles
with open("file_titles.txt", "w") as file:
for item in file_titles:
file.write(item + "\n")
#update banyak dokumen txt
with open("banyakDokumen.txt", "w") as file:
file.write(str(banyakDokumen))
print("update file text dokumen berhasil")
#converting ke text untuk pdf dokument
textdokumen=''
for doc in range(len(file_titles)):
judul = " \n " + "AWAL DOKUMEN KE- "+ str(doc+1) + " \n "
batas = "=========="
akhir = " \n " + "AKHIR DOKUMEN KE- "+ str(doc+1) + " \n "
textdokumen = textdokumen + "{}{}{}{}{}".format(judul,batas,extract_text_from_pdf('pdf/'+file_titles[doc]),batas,akhir)
print("converting ke text untuk pdf dokumen berhasil")
#combine text berita sama dokumen
final = textdokumen
# + textberita
print("combining 2 sumber pelatihan berhasil")
#splitting
texts = get_text_chunks(final)
print("splitting final text berhasil")
#save dengan chroma
vectorstore = Chroma.from_texts(texts,
embeddings,
persist_directory=dirsave)
# persist the db to disk
vectorstore.persist()
vectorstore = None
print("simpan hasil vektor ke chroma berhasil")
st.write(":orange[Pembaharuan Berhasil!]")
    # create an empty placeholder at the bottom of the sidebar
    placeholder = st.sidebar.empty()
    # add the label within the placeholder
    with placeholder:
        st.markdown("**by Oriza Nurfajri**")

if __name__ == '__main__':
    main()