suatatan committed on
Commit
64e8180
Β·
1 Parent(s): d13a68d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """homo-inspectus.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1Syuv1HSvRJe5Lf_Y9PRPjB3lupei-O5b
8
+
9
+ # 🐡 Homo Inspectus (Question Answering Engine for Home Inspection Reports)
10
+
11
+ No need to be a data scientist to run this code 😀 Just click the left triangles in the circles a couple of times until you see the 🛑 sign.
12
+
13
+ - First running may take 5-10 minutes.
14
+ - Once engine is ready it will quickly answer your questions
15
+ """
16
+
17
+
18
+
19
+ import os
20
+ import shutil
21
+ from langchain.document_loaders import PyPDFLoader
22
+ from langchain.vectorstores import Chroma
23
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
24
+ from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
25
+ import transformers
26
+ import re
27
+ import string
28
+ import random
29
+ import pandas as pd
30
+ from google.colab import data_table
31
+
32
+
33
+
34
# Sentence-transformer embedding model shared by the index-building code below
# (downloads "all-MiniLM-L6-v2" on first run).
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
35
+
36
def process_string(input_string):
    """Build a filesystem-safe Chroma collection name from *input_string*.

    Strips every character that is not a-z, A-Z, or 0-9 and appends two
    random digits so repeated uploads of the same file get distinct
    persist directories.

    Args:
        input_string: arbitrary string, typically an uploaded file name.

    Returns:
        str: the alphanumeric characters of the input followed by two digits.
    """
    # One regex pass removes everything outside [a-zA-Z0-9]; that already
    # covers blanks, so the original's separate space-stripping step was
    # redundant and has been dropped (behavior unchanged).
    alphanumeric = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    # NOTE: `random` (not `secrets`) is fine here — the suffix is a
    # collision-avoidance nicety, not a security token.
    suffix = ''.join(random.choices(string.digits, k=2))
    return alphanumeric + suffix
42
+
43
def show_long_repeating_sentences(text, min_words=4):
    """Find sentences of at least *min_words* words that occur more than
    once in *text* (sentences are delimited by '.').

    These are typically per-page boilerplate (headers/footers) in a PDF
    report, and callers mask them out before chunking.

    Args:
        text: the full document text.
        min_words: minimum word count for a sentence to be reported.

    Returns:
        list[str]: each qualifying repeated sentence exactly once, in the
        order its first repetition was detected.
    """
    seen = set()
    reported = set()
    repeating = []
    for raw in text.split('.'):
        sentence = raw.strip()
        if sentence in seen:
            # Bug fix: the original appended once per extra occurrence, so a
            # sentence appearing 3+ times produced duplicate list entries;
            # `reported` guarantees each sentence is listed at most once.
            if len(sentence.split()) >= min_words and sentence not in reported:
                reported.add(sentence)
                repeating.append(sentence)
        else:
            seen.add(sentence)
    return repeating
55
+
56
def create_vdb(filename, chunk_size=200, overlap=100):
    """Build and persist a Chroma vector store from a PDF report.

    Loads the PDF, detects long repeating sentences (boilerplate such as
    headers/footers), masks them out of every page, splits pages into
    overlapping chunks, and indexes the chunks with the module-level
    sentence-transformer embeddings.

    Args:
        filename: path (or uploaded file name) of the PDF to index.
        chunk_size: target chunk length in characters.
        overlap: character overlap between consecutive chunks; if it
            exceeds chunk_size the pair falls back to (500, 100).

    Returns:
        Chroma: the persisted vector store over the cleaned chunks.
    """
    if chunk_size < overlap:
        # An overlap larger than the chunk itself is invalid; fall back to
        # a sane default pair instead of raising.
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')

    # Fix: the original contained a garbled f-string placeholder here;
    # resolve the uploaded file name to an absolute path.
    path = os.path.abspath(filename)
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()

    # Detect boilerplate over the whole document so it can be masked below.
    all_text = ' '.join(page.page_content for page in pages)
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)

    # Fix: the original created a splitter with custom separators and then
    # immediately shadowed it with this one (dead code); only the splitter
    # actually used is kept.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    corpus = []
    for index, page in enumerate(pages):
        metadatas = [{"pdf_page_number": index}]
        content = page.page_content
        # Repeating-sentence removal: replace boilerplate with a marker so
        # it cannot dominate similarity search.
        for sentence in repeating_sentences:
            content = content.replace(sentence, '#RRC')
        texts = text_splitter.create_documents([content], metadatas=metadatas)
        print(f'==================Page {page.metadata["page"]} len(texts) = {len(texts)} =================')
        # extend instead of insert(0, ...): O(1) per chunk rather than O(n),
        # and chunk insertion order does not affect similarity search.
        corpus.extend(texts)

    # Reuse the module-level embedding model instead of re-instantiating it
    # (the original built a second identical SentenceTransformerEmbeddings).
    cname = process_string(filename)
    db = Chroma.from_documents(corpus, embedding_function, persist_directory=cname)
    print(''' Index is created ⛹️''')
    return db
92
+
93
+ """πŸ›‘ Stop. Now do in one more time for next code block. Then it will show you the upload button. Click it and upload the **home inspection** file Then you will be able to ask any question without running previous codes
94
+
95
+ # Information Retrieval (IR) Tool
96
+
97
+ This tool is your small local and sweet 🍬 Google for the document
98
+ """
99
+
100
# Colab upload widget: prompt the user for the home-inspection PDF and
# index the first uploaded file.
from google.colab import files

uploaded = files.upload()
fnames = []
for filename in uploaded:  # only the names are needed, not the file bytes
    # Fix: the original printed a garbled placeholder instead of the
    # interpolated file name.
    print(f"Processing file: {filename}")
    fnames.append(filename)

# Only the first uploaded file is indexed; any extras are ignored.
db = create_vdb(fnames[0], 500, 100)
107
+
108
+ """πŸ‘Œ You did. Now write your keyword, or question or anything to search in the document"""
109
+
110
# Colab form field: the user's search query for the IR tool.
q = 'leak' # @param {type:"string"}
print(f'Your question is: {q}')
answers = []
pgnos = []
# Retrieve the 10 chunks nearest to the query in embedding space; `db` is
# the Chroma store built by the upload cell above.
for d in db.similarity_search(q, k=10):
    answers.append(d.page_content)
    pgnos.append(d.metadata['pdf_page_number'])
# Results arrive ordered most-similar first, so the table reads best-to-worst.
results = pd.DataFrame({'Page': pgnos,'Cite':answers,})
# NOTE(review): DataTable only renders when it is the last expression of a
# notebook cell — confirm this runs as a Colab cell, not a plain script.
data_table.DataTable(results, include_index=False, num_rows_per_page=5)
119
+
120
+ """Results are ordered from most relevant to least — like Google.
121
+
122
+ # Chat Tool (Humble ) πŸ†“ πŸš—
123
+
124
+ This tool lets you ask a direct question and get an answer, like my rival 🔪 ChatGPT
125
+ """
126
+
127
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFacePipeline

# Cap on the generated sequence length passed through to the HF pipeline.
max_length=1024
# Instruction-tuned seq2seq model; presumably chosen to fit free Colab
# memory limits — TODO confirm.
model_name = "declare-lab/flan-alpaca-large"
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text2text-generation",
    model_kwargs={"max_length": max_length},
)
# "stuff" chain: concatenates all retrieved documents into a single prompt.
qa_chain = load_qa_chain(llm, chain_type="stuff")
140
+
141
def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer *question* by retrieving the most similar document chunks
    from *vector_store* and running them, with the question, through
    *qa_chain*. Returns the chain's response."""
    return qa_chain.run(
        input_documents=vector_store.similarity_search(question),
        question=question,
    )
145
+
146
# Colab form field: an explicit natural-language question for the QA chain.
explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
# Notebook cell: the returned answer renders as the cell's output value.
ask_to_alpaca(explicit_question, db, qa_chain)