# NOTE(review): removed non-code residue scraped from the Hugging Face
# Spaces page header ("Spaces: / Sleeping / Sleeping") — not Python source.
import streamlit as st | |
from dotenv import load_dotenv | |
import PyPDF2 | |
from PyPDF2 import PdfReader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain_openai import ChatOpenAI | |
from langchain.memory import ConversationBufferMemory | |
from langchain.chains import ConversationalRetrievalChain | |
from htmlTemplates import css, bot_template, user_template | |
from langchain_community.llms import HuggingFaceHub | |
from langchain_community.vectorstores import Chroma | |
import pandas as pd | |
import glob | |
import os | |
import re | |
from PyPDF2 import PdfReader | |
# Vector-store locations: `dirload` is the prebuilt Chroma DB read at startup,
# `dirsave` ("terbaru" = "latest") is where re-processing persists a new DB.
dirload = '24feb24-openaiv2'
dirsave = "terbaru"
# Shared OpenAI embedding function (requires OPENAI_API_KEY, loaded via dotenv).
embeddings = OpenAIEmbeddings()
def import_text_file(file_path):
    """Return the UTF-8 text content of *file_path*, or "" on any error.

    Errors are logged to stdout and swallowed on purpose: callers (the
    sidebar counters) treat "" as "no data" rather than crashing the app.

    NOTE(review): this function was defined twice, byte-for-byte identical,
    in the original file; the duplicate definition has been removed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return ""
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""
#list all PDFs in the directory
def list_pdf_files_and_save_titles(folder_path):
    """Return the filenames of all PDFs directly inside *folder_path*.

    The ".pdf" extension match is case-insensitive.  Returns an empty list
    (after logging) if the folder is missing or unreadable — callers use
    the length as the document count, so a soft failure is intended.
    """
    pdf_file_titles = []
    try:
        # Idiom fix: the original filtered into `pdf_files` and then copied
        # item-by-item into `pdf_file_titles`; one comprehension suffices.
        pdf_file_titles = [
            name for name in os.listdir(folder_path)
            if name.lower().endswith('.pdf')
        ]
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return pdf_file_titles
#read the document | |
def extract_text_from_pdf(pdf_path):
    """Concatenate the extracted text of every page in *pdf_path*.

    Each page's text is followed by a newline.

    Bug fix: PyPDF2's ``page.extract_text()`` can return None for pages
    with no extractable text; the original ``text + "\\n"`` concatenation
    would then raise TypeError.  Such pages now contribute just a newline.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page in pdf_reader.pages:
            text += (page.extract_text() or "") + "\n"
        return text
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Chunks are at most 1000 characters long with a 200-character overlap,
    split on single spaces (CharacterTextSplitter semantics).
    """
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    # NOTE(review): despite the name and signature, *text_chunks* is IGNORED —
    # this loads the pre-built Chroma store from `dirload` instead of embedding
    # the chunks.  The commented lines below are the original FAISS/instructor
    # build path; confirm whether the parameter should be wired back in.
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings)
    return vectorstore
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over *vectorstore*.

    Uses the default ChatOpenAI model and an in-memory buffer keyed as
    'chat_history' so the chain retains the running dialogue.
    """
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(),
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
def handle_userinput(user_question):
    """Run *user_question* through the conversation chain and render the
    whole chat history, alternating the user/bot HTML templates."""
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for idx, message in enumerate(st.session_state.chat_history):
        # Even indices are the user's turns, odd indices the bot's.
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
def _reprocess_new_data():
    """Rebuild the vector store from berita/*.csv (news) and pdf/ documents.

    Side effects: rewrites the banyakberita.txt / banyakdokumen.txt counter
    files and file_titles.txt, then persists a fresh Chroma DB in `dirsave`.
    """
    # --- news (berita) -----------------------------------------------------
    sumber = glob.glob("berita/*.csv")
    # Assumes at least one CSV exists in berita/ — TODO confirm with owner.
    df = pd.read_csv(sumber[0])
    banyak_berita = len(df)
    print("sumber berita ditemukan")
    # Fix: write the lowercase filename the sidebar reads; the original wrote
    # "banyakBerita.txt", a *different* file on case-sensitive filesystems.
    with open("banyakberita.txt", "w") as file:
        file.write(str(banyak_berita))
    print("update file text berita berhasil")
    # Fix: the original used chained assignment (df['combined'].loc[row] = ...)
    # which pandas may apply to a temporary copy; build the column in one pass.
    df["combined"] = [
        "berita ke-" + str(row + 1) + " \n "
        + "judul: " + str(df['title'].loc[row]) + " \n "
        + "link: " + str(df['url'].loc[row]) + " \n "
        + "tanggal rilis: " + str(df['datetime'].loc[row]) + " \n "
        + "penulis: " + str(df['author'].loc[row]) + " \n "
        + "isi berita: " + str(df['text'].loc[row]) + " \n "
        + "sumber: " + str(df['source'].loc[row]) + " \n "
        for row in range(len(df))
    ]
    textberita = " ".join(df["combined"].tolist())
    print("combining and converting berhasil")
    # --- regulation PDFs ----------------------------------------------------
    folder_path = 'pdf/'
    file_titles = list_pdf_files_and_save_titles(folder_path)
    banyak_dokumen = len(file_titles)
    # Persist the titles so the sidebar can list them on later runs.
    with open("file_titles.txt", "w") as file:
        for item in file_titles:
            file.write(item + "\n")
    # Fix: lowercase to match the read side ("banyakdokumen.txt").
    with open("banyakdokumen.txt", "w") as file:
        file.write(str(banyak_dokumen))
    print("update file text dokumen berhasil")
    # Concatenate every PDF's text, wrapped in begin/end document markers.
    textdokumen = ''
    for doc in range(len(file_titles)):
        judul = " \n " + "AWAL DOKUMEN KE- " + str(doc + 1) + " \n "
        batas = "=========="
        akhir = " \n " + "AKHIR DOKUMEN KE- " + str(doc + 1) + " \n "
        textdokumen = textdokumen + "{}{}{}{}{}".format(
            judul, batas,
            extract_text_from_pdf('pdf/' + file_titles[doc]),
            batas, akhir)
    print("converting ke text untuk pdf dokumen berhasil")
    # News text is deliberately excluded from the index (as in the original).
    final = textdokumen
    # + textberita
    print("combining 2 sumber pelatihan berhasil")
    texts = get_text_chunks(final)
    print("splitting final text berhasil")
    # Embed and persist the fresh store to disk under `dirsave`.
    vectorstore = Chroma.from_texts(texts, embeddings, persist_directory=dirsave)
    vectorstore.persist()
    vectorstore = None
    print("simpan hasil vektor ke chroma berhasil")


def main():
    """Streamlit entry point for the Indonesian climate chatbot."""
    load_dotenv()
    # Fix: st.set_page_config must be the FIRST Streamlit call of the script
    # run; the original issued session_state writes before it.
    st.set_page_config(page_title="Selamat Datang Di Indonesian Climate Bot",
                       page_icon=":sun_behind_rain_cloud:")
    st.write(css, unsafe_allow_html=True)
    # Fix: the original passed the *builtin* `dir` as persist_directory;
    # the intended constant is `dirload`.
    vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings)
    # Rebuilt on every rerun (the original's "not in session_state" guard was
    # dead code because the chain was always assigned first).
    st.session_state.conversation = get_conversation_chain(vectorstore)
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    st.header("Indonesian Climate Chatbot :sun_behind_rain_cloud:")
    user_question = st.text_input("Tanyakan padaku seputar perubahan iklim:")
    if user_question:
        handle_userinput(user_question)
    with st.sidebar:
        st.header(":blue[Jumlah Dokumen dan Berita]")
        banyakDokumen = import_text_file("banyakdokumen.txt")
        banyakBerita = import_text_file("banyakberita.txt")
        # Show the regulation document titles the bot was built from.
        with open("file_titles.txt", "r") as file:
            file_titles = [item.strip() for item in file.readlines()]
        with st.container(height=300):
            s = ''
            for title in file_titles:
                s += "- " + title + "\n"
            st.markdown(s)
        st.write("jumlah dokumen regulasi: " + ":green[{}]".format(banyakDokumen))
        st.write("jumlah dokumen berita: " + ":green[{}]".format(banyakBerita))
        if st.button("Re-Processing New Data"):
            with st.spinner("Processing..."):
                _reprocess_new_data()
                st.write(":orange[Pembaharuan Berhasil!]")
    # Author credit pinned at the bottom of the sidebar.
    placeholder = st.sidebar.empty()
    with placeholder:
        st.markdown("**by Oriza Nurfajri**")
# Standard script guard: launch the Streamlit app when executed directly.
if __name__ == '__main__':
    main()