# Spaces: oriza/climatechat (status: Sleeping)
# File size: 9,526 Bytes

import streamlit as st
from dotenv import load_dotenv
import PyPDF2
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import Chroma
import pandas as pd
import glob
import os
import re
from PyPDF2 import PdfReader

# Locations of the persisted vector databases: `dirload` is the Chroma
# directory the chatbot reads from, `dirsave` is the directory a
# re-processing run writes the rebuilt index to.
dirload = '24feb24-openaiv2'
dirsave = "terbaru"

# Load variables from a local .env file before the embedding client is built.
# The client is created at import time, i.e. before main() gets a chance to
# call load_dotenv(), so a key that lives only in .env would otherwise not be
# visible yet. load_dotenv() does not override variables that are already set.
load_dotenv()

# Embedding function shared by every vector-store call in this module.
embeddings = OpenAIEmbeddings()

def import_text_file(file_path):
    """Return the full contents of a UTF-8 text file, or "" on failure.

    This is a best-effort reader: any problem is reported on stdout and an
    empty string is returned instead of raising.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a string (including any trailing newline), or
        an empty string when the file is missing or cannot be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return ""
    except Exception as e:
        # Deliberate catch-all (decode errors, permissions, directories, ...).
        print(f"Error reading file: {e}")
        return ""

# list all PDF files in the directory
def list_pdf_files_and_save_titles(folder_path):
    """Collect the names of the PDF files found directly in a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of file names (not full paths) whose name ends with ".pdf",
        compared case-insensitively, in the order os.listdir() yields them.
        An empty list is returned, after printing a message, when the folder
        is missing or any other error occurs.
    """
    titles = []
    try:
        titles = [
            entry
            for entry in os.listdir(folder_path)
            if entry.lower().endswith('.pdf')
        ]
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return titles

#read the document
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenation of each page's extracted text, each followed by
        a newline character.
    """
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction must happen while the file is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)


def get_text_chunks(text):
    """Split a text into overlapping chunks for embedding.

    The text is split on single spaces with a chunk size of 1000 and an
    overlap of 200, both measured with len().

    Args:
        text: The full text to split.

    Returns:
        The list of chunks produced by CharacterTextSplitter.split_text().
    """
    splitter_options = {
        "separator": " ",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)


def get_vectorstore(text_chunks):
    """Open the Chroma store persisted in the `dirload` directory.

    Args:
        text_chunks: Not used; the store is opened from disk rather than
            built from these chunks.

    Returns:
        A Chroma instance bound to `dirload` and the module-level
        `embeddings` function.
    """
    return Chroma(embedding_function=embeddings, persist_directory=dirload)


def get_conversation_chain(vectorstore):
    """Build a retrieval chat chain on top of a vector store.

    Args:
        vectorstore: Store whose as_retriever() supplies the retriever.

    Returns:
        A ConversationalRetrievalChain using a default ChatOpenAI model and
        a ConversationBufferMemory stored under the 'chat_history' key.
    """
    chat_model = ChatOpenAI()
    history = ConversationBufferMemory(
        return_messages=True,
        memory_key='chat_history',
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )


def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat.

    The chain stored in st.session_state.conversation is called with the
    question, the returned 'chat_history' is saved to
    st.session_state.chat_history, and every message in it is rendered.

    Args:
        user_question: The text typed by the user.
    """
    # Local import keeps this block self-contained (stdlib only).
    import html

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions are rendered with the user template, odd positions
        # with the bot template.
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered with unsafe_allow_html=True, so the
        # message text (typed by the user or produced by the model) must be
        # escaped or any markup in it would be injected into the page.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content),
                 unsafe_allow_html=True)


# Cache files behind the sidebar statistics. The same names are used for
# reading and for writing, so a re-processing run updates exactly the files
# the sidebar displays (this matters on case-sensitive filesystems).
_DOC_COUNT_FILE = "banyakdokumen.txt"
_NEWS_COUNT_FILE = "banyakberita.txt"
_TITLES_FILE = "file_titles.txt"
_PDF_FOLDER = "pdf/"
# The news text was left out of the indexed corpus (the "+ textberita" term
# was commented out); set to True to index it together with the documents.
_INCLUDE_NEWS_IN_INDEX = False


def _read_title_list(path):
    """Return the stripped lines of `path`, or [] when the file is missing."""
    try:
        with open(path, "r", encoding="utf-8") as file:
            return [line.strip() for line in file]
    except FileNotFoundError:
        return []


def _build_news_text(df):
    """Flatten every row of the news DataFrame into one labelled text blob."""
    columns = ("title", "url", "datetime", "author", "text", "source")
    records = zip(*(df[column].tolist() for column in columns))
    entries = [
        f"berita ke-{number} \n judul: {title} \n link: {url} \n "
        f"tanggal rilis: {released} \n penulis: {author} \n "
        f"isi berita: {body} \n sumber: {source} \n "
        for number, (title, url, released, author, body, source)
        in enumerate(records, start=1)
    ]
    return " ".join(entries)


def _build_document_text(file_titles):
    """Concatenate the text of every PDF, wrapped in numbered start/end marks."""
    batas = "=========="
    parts = []
    for number, title in enumerate(file_titles, start=1):
        content = extract_text_from_pdf(os.path.join(_PDF_FOLDER, title))
        parts.append(
            f" \n AWAL DOKUMEN KE- {number} \n {batas}{content}{batas}"
            f" \n AKHIR DOKUMEN KE- {number} \n "
        )
    return "".join(parts)


def _reprocess_sources():
    """Refresh the cached counts/titles and rebuild the Chroma index.

    Reads the first CSV in "berita/" and every PDF in "pdf/", rewrites the
    count and title cache files, and stores a new index in `dirsave`.

    Returns:
        True when a new index was written, False when the run was aborted
        (no news CSV found, or nothing to index).
    """
    # NEWS: pick the first CSV; sorted so the choice is deterministic.
    sumber = sorted(glob.glob("berita/*.csv"))
    if not sumber:
        st.error("File CSV berita tidak ditemukan di folder 'berita/'.")
        return False
    df = pd.read_csv(sumber[0])
    print("sumber berita ditemukan")

    with open(_NEWS_COUNT_FILE, "w", encoding="utf-8") as file:
        file.write(str(len(df)))
    print("update file text berita berhasil")

    textberita = ""
    if _INCLUDE_NEWS_IN_INDEX:
        textberita = _build_news_text(df)
        print("combining and converting berhasil")

    # REGULATION DOCUMENTS: list the PDFs and cache their titles and count.
    file_titles = list_pdf_files_and_save_titles(_PDF_FOLDER)
    with open(_TITLES_FILE, "w", encoding="utf-8") as file:
        file.writelines(f"{title}\n" for title in file_titles)
    with open(_DOC_COUNT_FILE, "w", encoding="utf-8") as file:
        file.write(str(len(file_titles)))
    print("update file text dokumen berhasil")

    textdokumen = _build_document_text(file_titles)
    print("converting ke text untuk pdf dokumen berhasil")

    final = textdokumen + textberita
    print("combining 2 sumber pelatihan berhasil")

    texts = get_text_chunks(final)
    print("splitting final text berhasil")
    if not texts:
        # Nothing to embed; do not attempt to build an empty index.
        st.warning("Tidak ada teks yang bisa diindeks; "
                   "vector store tidak diperbarui.")
        return False

    # Build the new index and persist it to disk.
    vectorstore = Chroma.from_texts(texts,
                                    embeddings,
                                    persist_directory=dirsave)
    vectorstore.persist()
    print("simpan hasil vektor ke chroma berhasil")
    return True


def main():
    """Render the Streamlit chatbot page and its sidebar.

    The conversation chain is created once per browser session and kept in
    st.session_state, so the chain's memory survives Streamlit reruns. The
    chain reads the index stored in `dirload`; the sidebar's re-processing
    button writes a rebuilt index to `dirsave`, so new data is only served
    once `dirload` points at it.
    """
    load_dotenv()
    st.set_page_config(page_title="Selamat Datang Di Indonesian Climate Bot",
                       page_icon=":sun_behind_rain_cloud:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        # Load the persisted vectors and build the chain only once; doing
        # this on every rerun would discard the conversation memory.
        vectorstore = Chroma(persist_directory=dirload,
                             embedding_function=embeddings)
        st.session_state.conversation = get_conversation_chain(vectorstore)
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Indonesian Climate Chatbot :sun_behind_rain_cloud:")
    user_question = st.text_input("Tanyakan padaku seputar perubahan iklim:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.header(":blue[Jumlah Dokumen dan Berita]")
        banyakDokumen = import_text_file(_DOC_COUNT_FILE)
        banyakBerita = import_text_file(_NEWS_COUNT_FILE)

        # Titles of the regulation PDFs currently in use.
        file_titles = _read_title_list(_TITLES_FILE)
        with st.container(height=300):
            st.markdown("".join(f"- {title}\n" for title in file_titles))

        st.write("jumlah dokumen regulasi: " + ":green[{}]".format(banyakDokumen))
        st.write("jumlah dokumen berita: " + ":green[{}]".format(banyakBerita))

        if st.button("Re-Processing New Data"):
            with st.spinner("Processing..."):
                if _reprocess_sources():
                    st.write(":orange[Pembaharuan Berhasil!]")

        # Create an empty placeholder at the bottom
        placeholder = st.sidebar.empty()

        # Add the label within the placeholder
        with placeholder:
            st.markdown("**by Oriza Nurfajri**")

if __name__ == '__main__':
    main()