# NOTE(review): removed non-code residue scraped from the Hugging Face
# Spaces page header ("Spaces: / Sleeping / Sleeping") — not Python source.
import streamlit as st | |
from dotenv import load_dotenv | |
import PyPDF2 | |
from PyPDF2 import PdfReader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain_openai import ChatOpenAI | |
from langchain.memory import ConversationBufferMemory | |
from langchain.chains import ConversationalRetrievalChain | |
from htmlTemplates import css, bot_template, user_template | |
from langchain_community.llms import HuggingFaceHub | |
from langchain_community.vectorstores import Chroma | |
import pandas as pd | |
import glob | |
import os | |
import re | |
from PyPDF2 import PdfReader | |
# Vector-store locations: `dirload` is the prebuilt Chroma DB read at startup,
# `dirsave` ("terbaru" = "latest") is where re-processing persists a new DB.
dirload = '24feb24-openaiv2'
dirsave = "terbaru"
# Shared OpenAI embedding function (requires OPENAI_API_KEY, loaded via dotenv).
embeddings = OpenAIEmbeddings()
def import_text_file(file_path):
    """Return the UTF-8 text content of *file_path*, or "" on any error.

    Errors are logged to stdout and swallowed on purpose: callers (the
    sidebar counters) treat "" as "no data" rather than crashing the app.

    NOTE(review): this function was defined twice, byte-for-byte identical,
    in the original file; the duplicate definition has been removed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return ""
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""
#list all PDFs in the directory
def list_pdf_files_and_save_titles(folder_path):
    """Return the filenames of all PDFs directly inside *folder_path*.

    The ".pdf" extension match is case-insensitive.  Returns an empty list
    (after logging) if the folder is missing or unreadable — callers use
    the length as the document count, so a soft failure is intended.
    """
    pdf_file_titles = []
    try:
        # Idiom fix: the original filtered into `pdf_files` and then copied
        # item-by-item into `pdf_file_titles`; one comprehension suffices.
        pdf_file_titles = [
            name for name in os.listdir(folder_path)
            if name.lower().endswith('.pdf')
        ]
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return pdf_file_titles
#read the document | |
def extract_text_from_pdf(pdf_path):
    """Concatenate the extracted text of every page in *pdf_path*.

    Each page's text is followed by a newline.

    Bug fix: PyPDF2's ``page.extract_text()`` can return None for pages
    with no extractable text; the original ``text + "\\n"`` concatenation
    would then raise TypeError.  Such pages now contribute just a newline.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page in pdf_reader.pages:
            text += (page.extract_text() or "") + "\n"
        return text
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Chunks are at most 1000 characters long with a 200-character overlap,
    split on single spaces (CharacterTextSplitter semantics).
    """
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    # NOTE(review): despite the name and signature, *text_chunks* is IGNORED —
    # this loads the pre-built Chroma store from `dirload` instead of embedding
    # the chunks.  The commented lines below are the original FAISS/instructor
    # build path; confirm whether the parameter should be wired back in.
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings)
    return vectorstore
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over *vectorstore*.

    Uses the default ChatOpenAI model and an in-memory buffer keyed as
    'chat_history' so the chain retains the running dialogue.
    """
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(),
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
def handle_userinput(user_question):
    """Run *user_question* through the conversation chain and render the
    whole chat history, alternating the user/bot HTML templates."""
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for idx, message in enumerate(st.session_state.chat_history):
        # Even indices are the user's turns, odd indices the bot's.
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
def _reprocess_new_data():
    """Rebuild the vector store from berita/*.csv (news) and pdf/ documents.

    Side effects: rewrites the banyakberita.txt / banyakdokumen.txt counter
    files and file_titles.txt, then persists a fresh Chroma DB in `dirsave`.
    """
    # --- news (berita) -----------------------------------------------------
    sumber = glob.glob("berita/*.csv")
    # Assumes at least one CSV exists in berita/ — TODO confirm with owner.
    df = pd.read_csv(sumber[0])
    banyak_berita = len(df)
    print("sumber berita ditemukan")
    # Fix: write the lowercase filename the sidebar reads; the original wrote
    # "banyakBerita.txt", a *different* file on case-sensitive filesystems.
    with open("banyakberita.txt", "w") as file:
        file.write(str(banyak_berita))
    print("update file text berita berhasil")
    # Fix: the original used chained assignment (df['combined'].loc[row] = ...)
    # which pandas may apply to a temporary copy; build the column in one pass.
    df["combined"] = [
        "berita ke-" + str(row + 1) + " \n "
        + "judul: " + str(df['title'].loc[row]) + " \n "
        + "link: " + str(df['url'].loc[row]) + " \n "
        + "tanggal rilis: " + str(df['datetime'].loc[row]) + " \n "
        + "penulis: " + str(df['author'].loc[row]) + " \n "
        + "isi berita: " + str(df['text'].loc[row]) + " \n "
        + "sumber: " + str(df['source'].loc[row]) + " \n "
        for row in range(len(df))
    ]
    textberita = " ".join(df["combined"].tolist())
    print("combining and converting berhasil")
    # --- regulation PDFs ----------------------------------------------------
    folder_path = 'pdf/'
    file_titles = list_pdf_files_and_save_titles(folder_path)
    banyak_dokumen = len(file_titles)
    # Persist the titles so the sidebar can list them on later runs.
    with open("file_titles.txt", "w") as file:
        for item in file_titles:
            file.write(item + "\n")
    # Fix: lowercase to match the read side ("banyakdokumen.txt").
    with open("banyakdokumen.txt", "w") as file:
        file.write(str(banyak_dokumen))
    print("update file text dokumen berhasil")
    # Concatenate every PDF's text, wrapped in begin/end document markers.
    textdokumen = ''
    for doc in range(len(file_titles)):
        judul = " \n " + "AWAL DOKUMEN KE- " + str(doc + 1) + " \n "
        batas = "=========="
        akhir = " \n " + "AKHIR DOKUMEN KE- " + str(doc + 1) + " \n "
        textdokumen = textdokumen + "{}{}{}{}{}".format(
            judul, batas,
            extract_text_from_pdf('pdf/' + file_titles[doc]),
            batas, akhir)
    print("converting ke text untuk pdf dokumen berhasil")
    # News text is deliberately excluded from the index (as in the original).
    final = textdokumen
    # + textberita
    print("combining 2 sumber pelatihan berhasil")
    texts = get_text_chunks(final)
    print("splitting final text berhasil")
    # Embed and persist the fresh store to disk under `dirsave`.
    vectorstore = Chroma.from_texts(texts, embeddings, persist_directory=dirsave)
    vectorstore.persist()
    vectorstore = None
    print("simpan hasil vektor ke chroma berhasil")


def main():
    """Streamlit entry point for the Indonesian climate chatbot."""
    load_dotenv()
    # Fix: st.set_page_config must be the FIRST Streamlit call of the script
    # run; the original issued session_state writes before it.
    st.set_page_config(page_title="Selamat Datang Di Indonesian Climate Bot",
                       page_icon=":sun_behind_rain_cloud:")
    st.write(css, unsafe_allow_html=True)
    # Fix: the original passed the *builtin* `dir` as persist_directory;
    # the intended constant is `dirload`.
    vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings)
    # Rebuilt on every rerun (the original's "not in session_state" guard was
    # dead code because the chain was always assigned first).
    st.session_state.conversation = get_conversation_chain(vectorstore)
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    st.header("Indonesian Climate Chatbot :sun_behind_rain_cloud:")
    user_question = st.text_input("Tanyakan padaku seputar perubahan iklim:")
    if user_question:
        handle_userinput(user_question)
    with st.sidebar:
        st.header(":blue[Jumlah Dokumen dan Berita]")
        banyakDokumen = import_text_file("banyakdokumen.txt")
        banyakBerita = import_text_file("banyakberita.txt")
        # Show the regulation document titles the bot was built from.
        with open("file_titles.txt", "r") as file:
            file_titles = [item.strip() for item in file.readlines()]
        with st.container(height=300):
            s = ''
            for title in file_titles:
                s += "- " + title + "\n"
            st.markdown(s)
        st.write("jumlah dokumen regulasi: " + ":green[{}]".format(banyakDokumen))
        st.write("jumlah dokumen berita: " + ":green[{}]".format(banyakBerita))
        if st.button("Re-Processing New Data"):
            with st.spinner("Processing..."):
                _reprocess_new_data()
                st.write(":orange[Pembaharuan Berhasil!]")
    # Author credit pinned at the bottom of the sidebar.
    placeholder = st.sidebar.empty()
    with placeholder:
        st.markdown("**by Oriza Nurfajri**")
# Standard script guard: launch the Streamlit app when executed directly.
if __name__ == '__main__':
    main()