# -*- coding: utf-8 -*-
"""homo-inspectus.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Syuv1HSvRJe5Lf_Y9PRPjB3lupei-O5b

# 🐵 Homo Inspectus (Question Answering Engine for Home Inspection Reports)

No need to be a data scientist to run my code 😀
Just click the left triangles in the circles a couple of times until you see the 🛑 sign.
- The first run may take 5-10 minutes.
- Once the engine is ready it will quickly answer your questions.
"""

import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
import transformers
import re
import string
import random
import pandas as pd
from google.colab import data_table

# Loaded once at module level; create_vdb() reuses this instance.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


def process_string(input_string):
    """Turn *input_string* into a name usable as a persist directory.

    Every character outside ``[a-zA-Z0-9]`` is dropped (this already removes
    blanks) and two random decimal digits are appended.

    Args:
        input_string: Arbitrary text, e.g. a file name.

    Returns:
        The alphanumeric characters of ``input_string`` followed by two
        random digits.
    """
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    random_numbers = ''.join(random.choices(string.digits, k=2))
    return alphanumeric_string + random_numbers


def show_long_repeating_sentences(text, min_words=4):
    """Return the sentences of *text* that occur more than once.

    ``text`` is split on ``'.'`` and each piece is stripped. A piece is
    reported when it has been seen before and contains at least
    ``min_words`` whitespace-separated words.

    Args:
        text: The text to scan.
        min_words: Minimum word count for a repeated sentence to be reported.

    Returns:
        A list of the repeated sentences, each listed once, in the order in
        which their first repetition was encountered.
    """
    seen = set()
    # A dict keeps insertion order, so it serves as an ordered set: a
    # sentence that occurs three or more times is still reported only once.
    repeated = {}
    for raw_sentence in text.split('.'):
        sentence = raw_sentence.strip()
        if sentence in seen and len(sentence.split()) >= min_words:
            repeated.setdefault(sentence, None)
        else:
            seen.add(sentence)
    return list(repeated)


def create_vdb(filename, chunk_size=200, overlap=100):
    """Build a Chroma vector store from a PDF file.

    The PDF is loaded with ``PyPDFLoader``, long sentences that repeat across
    the document are replaced with the marker ``'#RRC'``, the remaining text
    is split into chunks and the chunks are embedded into a Chroma index that
    is persisted in a directory named by ``process_string(filename)``.

    Args:
        filename: Path of the PDF file to index.
        chunk_size: Maximum chunk size passed to the text splitter.
        overlap: Chunk overlap passed to the text splitter. When
            ``chunk_size < overlap`` both values fall back to 500 and 100.

    Returns:
        The Chroma vector store. Each stored chunk carries the metadata key
        ``pdf_page_number``.
    """
    if chunk_size < overlap:
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')

    path = os.path.abspath(filename)
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()

    # Repeated sentences are detected over the text of the whole document.
    all_text = ' '.join([page.page_content for page in pages])
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    corpus = []
    for index, page in enumerate(pages):
        # load_and_split() returns chunks, so the enumerate index can drift
        # away from the real page; prefer the page number recorded by the
        # loader (presumably 0-based -- confirm against the PyPDFLoader docs)
        # and fall back to the index only when it is missing.
        page_number = page.metadata.get("page", index)
        metadatas = [{"pdf_page_number": page_number}]

        content = page.page_content
        # repeating sentences removal
        for sentence in repeating_sentences:
            content = content.replace(sentence, '#RRC')

        texts = text_splitter.create_documents([content], metadatas=metadatas)
        output_message = f'==================Page {page_number} len(texts) = {len(texts)} ================='
        print(output_message)
        corpus.extend(texts)

    # The index has always been built from the chunks in reverse order
    # (previously via insert(0, ...)); reversing once keeps that order
    # without the quadratic cost.
    corpus.reverse()
    #--------------------------------

    cname = process_string(filename)
    db = Chroma.from_documents(corpus, embedding_function, persist_directory=cname)
    print(''' Index is created ⛹ī¸''')
    return db


"""🛑 Stop. Now do it one more time for the next code block. Then it will show you the upload button.
Click it and upload the **home inspection** file.
Then you will be able to ask any question without running the previous code again.

# Information Retrieval (IR) Tool

This tool is your small, local and sweet 🍬 Google for the document.
"""

db = create_vdb('hir.pdf', 500, 100)

"""👌 You did it. Now write your keyword, question or anything else to search for in the document."""

q = 'leak' # @param {type:"string"}
print(f'Your question is: {q}')

hits = db.similarity_search(q, k=10)
answers = [d.page_content for d in hits]
pgnos = [d.metadata['pdf_page_number'] for d in hits]

results = pd.DataFrame({'Page': pgnos, 'Cite': answers})
data_table.DataTable(results, include_index=False, num_rows_per_page=5)

"""The results are ordered from the most relevant to the least relevant, like Google.

# Chat Tool (Humble ) 🆓 🚗

This tool lets you ask a direct question and get an answer, like my rival 🔪 ChatGPT.
"""

from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFacePipeline

max_length = 1024
model_name = "declare-lab/flan-alpaca-large"
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text2text-generation",
    model_kwargs={"max_length": max_length},
)

qa_chain = load_qa_chain(llm, chain_type="stuff")


def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer *question* from the documents most similar to it.

    Args:
        question: The question text.
        vector_store: Store queried through ``similarity_search(question)``.
        qa_chain: Chain whose ``run`` method receives the retrieved documents
            and the question.

    Returns:
        Whatever ``qa_chain.run`` returns for the retrieved documents.
    """
    similar_docs = vector_store.similarity_search(question)
    response = qa_chain.run(input_documents=similar_docs, question=question)
    return response


explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
ask_to_alpaca(explicit_question, db, qa_chain)