# -*- coding: utf-8 -*-
"""Homo Inspectus — a Streamlit question-answering engine over an uploaded PDF.

The app has two tools:
1. An information-retrieval tool: similarity search over chunked PDF text
   stored in a Chroma vector index.
2. A chat tool: a flan-alpaca HuggingFace pipeline answering free-form
   questions via a LangChain "stuff" QA chain.
"""
import os
import random
import re
import shutil
import string
import tempfile

import pandas as pd
import streamlit as st
import transformers
from langchain import HuggingFaceHub, HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader

st.markdown(
    """
# đŸĩ Homo Inspectus (Question Answering Engine)
## Suat ATAN
"""
)

# Shared sentence-transformer embedding model used to vectorize text chunks.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


def process_string(input_string):
    """Return *input_string* stripped to alphanumerics, with two random digits appended.

    Useful for deriving a filesystem-safe, mostly-unique name from free text.
    """
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    # The regex above already removed spaces; this is a belt-and-braces pass.
    no_blank_string = alphanumeric_string.replace(' ', '')
    random_numbers = ''.join(random.choices(string.digits, k=2))
    return no_blank_string + random_numbers


def show_long_repeating_sentences(text, min_words=4):
    """Return sentences of *text* (split on '.') that occur more than once.

    Only sentences with at least *min_words* words are reported; shorter
    repeats (e.g. headers, page numbers) are ignored.
    """
    sentences = text.split('.')
    unique_sentences = set()
    repeating_sentences = []
    for s in sentences:
        stripped_sentence = s.strip()
        if stripped_sentence in unique_sentences and len(stripped_sentence.split()) >= min_words:
            repeating_sentences.append(stripped_sentence)
        else:
            unique_sentences.add(stripped_sentence)
    return repeating_sentences


def create_vdb(fileobj, chunk_size=200, overlap=100):
    """Build and return a Chroma vector index over the text of a PDF.

    Parameters
    ----------
    fileobj : file-like
        The uploaded PDF (as provided by ``st.file_uploader``).
    chunk_size : int
        Target size of each text chunk handed to the splitter.
    overlap : int
        Overlap between consecutive chunks.
    """
    if chunk_size < overlap:
        # A chunk smaller than its overlap is invalid; fall back to known-good values.
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')
    # loader = PyPDFLoader(fileobj)
    pdf_reader = PdfReader(fileobj)
    # extract_text() can return None for image-only pages; coerce to "" so the
    # join never raises TypeError.
    all_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    # Diagnostic: surface boilerplate sentences repeated across pages.
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    chunks = text_splitter.split_text(text=all_text)
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Random 4-digit directory name for this session's persisted index.
    cname = str(random.randint(1000, 9999))
    db = Chroma.from_texts(chunks, embedding_function, persist_directory=cname)
    print(''' Index is created â›šī¸''')
    return db


# Bare strings below are rendered as markdown by Streamlit ("magic" mode).
"""
# Information Retrieval (IR) Tool
This tool is your small local and sweet đŸŦ Google for the document
"""

db = None
uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf")
if uploaded_file is not None:
    db = create_vdb(uploaded_file, 500, 100)
    st.write("filename:", uploaded_file.name)
    st.write(uploaded_file)
else:
    st.info("Please upload a PDF file.")

"""👌 You did. Now write your keyword, or question or anything to search in the document"""

# q = 'leak'  # @param {type:"string"}
q = st.text_input('Write your question or keywords', 'HVAC', key='q1')
st.write(f'Your question is: {q}')

# Explicit None check: vector stores should not be truth-tested directly.
if db is not None:
    # Top-10 most similar chunks, best first.
    answers = [d.page_content for d in db.similarity_search(q, k=10)]
    # pgnos = [d.metadata['pdf_page_number'] for d in ...]  # page numbers not in metadata yet
    results = pd.DataFrame({'Cite': answers})
    st.markdown('## Results')
    st.dataframe(results)
    st.markdown(
        """The order of result is based the most relevant ones to less. Like Google
# Chat Tool (Humble ) 🆓 🚗
This tool is allow your direct question to answer like my rival đŸ”Ē ChatGPT
"""
    )


def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer *question* by running *qa_chain* over the chunks of *vector_store* most similar to it."""
    similar_docs = vector_store.similarity_search(question)
    response = qa_chain.run(input_documents=similar_docs, question=question)
    return response


explicit_question = st.text_input('Write your question', 'Roof problems', key='q2')

if db is not None:
    max_length = 1024
    model_name = "declare-lab/flan-alpaca-large"
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text2text-generation",
        model_kwargs={"max_length": max_length},
    )
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    # explicit_question = 'Is there any problem of the roofs?'  # @param {type:"string"}
    st.markdown("Answer:")
    st.write(ask_to_alpaca(explicit_question, db, qa_chain))
else:
    st.markdown('Please upload a PDF file.')