import streamlit as st from dotenv import load_dotenv import PyPDF2 from PyPDF2 import PdfReader from langchain.text_splitter import CharacterTextSplitter from langchain_openai import OpenAIEmbeddings from langchain_community.vectorstores import FAISS from langchain_openai import ChatOpenAI from langchain.memory import ConversationBufferMemory from langchain.chains import ConversationalRetrievalChain from htmlTemplates import css, bot_template, user_template from langchain_community.llms import HuggingFaceHub from langchain_community.vectorstores import Chroma import pandas as pd import glob import os import re from PyPDF2 import PdfReader #tempat vectordb dirload = '24feb24-openaiv2' dirsave = "terbaru" #embeddings embeddings = OpenAIEmbeddings() def import_text_file(file_path): try: with open(file_path, "r", encoding="utf-8") as file: text = file.read() return text except FileNotFoundError: print(f"Error: File not found at path: {file_path}") return "" except Exception as e: print(f"Error reading file: {e}") return "" def import_text_file(file_path): try: with open(file_path, "r", encoding="utf-8") as file: text = file.read() return text except FileNotFoundError: print(f"Error: File not found at path: {file_path}") return "" except Exception as e: print(f"Error reading file: {e}") return "" #list semua pdf dalam direktori def list_pdf_files_and_save_titles(folder_path): pdf_file_titles = [] try: files = os.listdir(folder_path) pdf_files = [file for file in files if file.lower().endswith('.pdf')] for pdf_file in pdf_files: pdf_file_titles.append(pdf_file) except FileNotFoundError: print(f"Folder not found: {folder_path}") except Exception as e: print(f"An error occurred: {e}") return pdf_file_titles #read the document def extract_text_from_pdf(pdf_path): with open(pdf_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) text = '' for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] text += page.extract_text() + "\n" return text def get_text_chunks(text): text_splitter = CharacterTextSplitter( separator=" ", chunk_size=1000, chunk_overlap=200, length_function=len ) chunks = text_splitter.split_text(text) return chunks def get_vectorstore(text_chunks): # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl") # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings) vectorstore = Chroma(persist_directory=dirload, embedding_function=embeddings) return vectorstore def get_conversation_chain(vectorstore): llm = ChatOpenAI() memory = ConversationBufferMemory( memory_key='chat_history', return_messages=True) conversation_chain = ConversationalRetrievalChain.from_llm( llm=llm, retriever=vectorstore.as_retriever(), memory=memory ) return conversation_chain def handle_userinput(user_question): response = st.session_state.conversation({'question': user_question}) st.session_state.chat_history = response['chat_history'] for i, message in enumerate(st.session_state.chat_history): if i % 2 == 0: st.write(user_template.replace( "{{MSG}}", message.content), unsafe_allow_html=True) else: st.write(bot_template.replace( "{{MSG}}", message.content), unsafe_allow_html=True) def main(): load_dotenv() #load vector vectorstore = Chroma(persist_directory=dir, embedding_function=embeddings) # create conversation chain st.session_state.conversation = get_conversation_chain(vectorstore) st.set_page_config(page_title="Selamat Datang Di Indonesian Climate Bot", page_icon=":sun_behind_rain_cloud:") st.write(css, unsafe_allow_html=True) if "conversation" not in st.session_state: st.session_state.conversation = None if "chat_history" not in st.session_state: st.session_state.chat_history = None st.header("Indonesian Climate Chatbot :sun_behind_rain_cloud:") user_question = st.text_input("Tanyakan padaku seputar perubahan iklim:") if user_question: handle_userinput(user_question) with st.sidebar: st.header(":blue[Jumlah Dokumen dan Berita]") banyakDokumen = import_text_file("banyakdokumen.txt") banyakBerita = import_text_file("banyakberita.txt") #showing the regulation docs with open("file_titles.txt", "r") as file: my_list = file.readlines() # Reads all lines into a list # Remove trailing newlines (if necessary) file_titles = [item.strip() for item in my_list] #show pdf files yang dipakai with st.container(height=300): s = '' for i in file_titles: s += "- " + i + "\n" st.markdown(s) st.write("jumlah dokumen regulasi: "+ ":green[{}]".format(banyakDokumen)) st.write("jumlah dokumen berita: "+ ":green[{}]".format(banyakBerita)) # st.subheader("Your documents") # pdf_docs = st.file_uploader( # "Upload your PDFs here and click on 'Process'", accept_multiple_files=True) # if st.button("Process"): # with st.spinner("Processing"): # # get pdf text # raw_text = get_pdf_text(pdf_docs) # # get the text chunks # text_chunks = get_text_chunks(raw_text) # # create vector store # vectorstore = get_vectorstore(text_chunks) # # create conversation chain # st.session_state.conversation = get_conversation_chain( # vectorstore) if st.button("Re-Processing New Data"): with st.spinner("Processing..."): # BERITA # Find a CSV files in the directory sumber = glob.glob("berita/*.csv") df = pd.read_csv(sumber[0]) banyakBerita = len(df) print("sumber berita ditemukan") #update banyak berita txt with open("banyakBerita.txt", "w") as file: file.write(str(banyakBerita)) print("update file text berita berhasil") #combining and converting df["combined"] = "" for row in range(len(df)): kombinasi = "berita ke-" + str(row+1) + " \n " + "judul: " + str(df['title'].loc[row]) + " \n " + "link: "+ str(df['url'].loc[row]) + " \n " + "tanggal rilis: " + str(df['datetime'].loc[row]) + " \n " + "penulis: " + str(df['author'].loc[row]) + " \n " + "isi berita: " + str(df['text'].loc[row]) + " \n " + "sumber: " + str(df['source'].loc[row]) + " \n " df['combined'].loc[row] = kombinasi listberita = df["combined"].tolist() textberita = " ".join(listberita) print("combining and converting berhasil") # directory ke pdf regulasi folder_path = 'pdf/' file_titles = list_pdf_files_and_save_titles(folder_path) banyakDokumen = len(file_titles) #saving the file titles with open("file_titles.txt", "w") as file: for item in file_titles: file.write(item + "\n") #update banyak dokumen txt with open("banyakDokumen.txt", "w") as file: file.write(str(banyakDokumen)) print("update file text dokumen berhasil") #converting ke text untuk pdf dokument textdokumen='' for doc in range(len(file_titles)): judul = " \n " + "AWAL DOKUMEN KE- "+ str(doc+1) + " \n " batas = "==========" akhir = " \n " + "AKHIR DOKUMEN KE- "+ str(doc+1) + " \n " textdokumen = textdokumen + "{}{}{}{}{}".format(judul,batas,extract_text_from_pdf('pdf/'+file_titles[doc]),batas,akhir) print("converting ke text untuk pdf dokumen berhasil") #combine text berita sama dokumen final = textdokumen # + textberita print("combining 2 sumber pelatihan berhasil") #splitting texts = get_text_chunks(final) print("splitting final text berhasil") #save dengan chroma vectorstore = Chroma.from_texts(texts, embeddings, persist_directory=dirsave) # persist the db to disk vectorstore.persist() vectorstore = None print("simpan hasil vektor ke chroma berhasil") st.write(":orange[Pembaharuan Berhasil!]") # Create an empty placeholder at the bottom placeholder = st.sidebar.empty() # Add the label within the placeholder with placeholder: st.markdown("**by Oriza Nurfajri**") if __name__ == '__main__': main()