File size: 4,584 Bytes
64e8180
 
78ec8cc
2631752
64e8180
 
 
 
 
 
 
 
 
 
 
38c9bda
64e8180
cd40ec3
 
 
 
c6515a5
ffd8646
2295808
cd40ec3
15a98a5
 
 
cd40ec3
 
 
15a98a5
 
 
 
 
 
64e8180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe99a
64e8180
 
 
ffd8646
b826878
ffd8646
 
 
 
 
64e8180
 
 
2295808
64e8180
 
0b31cee
64e8180
 
2295808
 
 
64e8180
2295808
0b31cee
64e8180
0b31cee
64e8180
2631752
64e8180
 
 
 
 
 
0e51722
cd40ec3
0e51722
cd40ec3
 
f812c0e
0b31cee
3508d84
0e51722
 
15a98a5
 
64e8180
 
38c9bda
fe8a6df
38c9bda
15a98a5
cd40ec3
 
 
 
 
0b31cee
 
cd40ec3
4359be5
64e8180
15a98a5
64e8180
 
 
 
 
15a98a5
64e8180
4359be5
 
 
 
 
 
0b31cee
 
cd40ec3
 
 
 
 
 
 
 
 
 
 
0b31cee
cd40ec3
 
0b31cee
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# -*- coding: utf-8 -*-



import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
import transformers
import re
import string
import random
import pandas as pd
import streamlit as st 

from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFacePipeline
import tempfile
from PyPDF2 import PdfReader
import random

# App header rendered at the top of the Streamlit page.
st.markdown(
"""

# 🐡 Homo Inspectus 

(Question Answering Engine)

## Suat ATAN


""")

# Sentence-transformer model used to embed text chunks for similarity search.
# NOTE(review): create_vdb() below builds its own identical instance, so this
# module-level object appears unused there — confirm and deduplicate.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

def process_string(input_string):
    """Reduce *input_string* to its alphanumeric characters plus two random digits.

    Useful for deriving a filesystem-safe, mostly-unique name (e.g. for a
    persisted index directory) from arbitrary text such as a file name.

    Args:
        input_string: Arbitrary text.

    Returns:
        The alphanumeric characters of ``input_string`` (in order) followed
        by two random decimal digits.
    """
    # The regex strips every non-alphanumeric character, which includes
    # spaces — the former extra ``.replace(' ', '')`` pass was dead code
    # and has been removed.
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    random_suffix = ''.join(random.choices(string.digits, k=2))
    return alphanumeric_string + random_suffix

def show_long_repeating_sentences(text, min_words=4):
    """Return sentences of at least *min_words* words that occur more than once.

    Sentences are delimited naively on '.', and surrounding whitespace is
    stripped before comparison. Intended to spot boilerplate (headers,
    footers) duplicated by PDF text extraction.

    Args:
        text: Raw document text.
        min_words: Minimum word count for a repeated sentence to be reported.

    Returns:
        A list of repeating sentences in order of first repetition. Each
        sentence is reported once. (The previous version appended a sentence
        once per *extra* occurrence, so text repeated three or more times
        produced duplicate entries — fixed here.)
    """
    seen = set()
    reported = []
    reported_set = set()  # O(1) membership check to avoid duplicate reports
    for raw_sentence in text.split('.'):
        sentence = raw_sentence.strip()
        if (sentence in seen
                and len(sentence.split()) >= min_words
                and sentence not in reported_set):
            reported.append(sentence)
            reported_set.add(sentence)
        else:
            seen.add(sentence)
    return reported

def create_vdb(fileobj, chunk_size = 200, overlap = 100):
    """Build a Chroma vector database from an uploaded PDF.

    Extracts the text of every page, splits it into overlapping chunks, embeds
    them with a sentence-transformer model, and persists the index to a
    randomly named local directory.

    Args:
        fileobj: File-like object with PDF content (e.g. a Streamlit
            UploadedFile).
        chunk_size: Target chunk length in characters.
        overlap: Character overlap between consecutive chunks.

    Returns:
        A ``Chroma`` vector store ready for similarity search.
    """
    # Guard against an invalid configuration. NOTE(review): this resets both
    # values to fixed defaults (500/100) rather than swapping them — confirm
    # that is the intended behavior.
    if chunk_size < overlap:
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')
    #loader = PyPDFLoader(fileobj)
    pdf_reader = PdfReader(fileobj)
        
    # Concatenate every page's extracted text into a single string.
    all_text = ""
    for page in pdf_reader.pages:
        all_text += page.extract_text()
    
    # Diagnostic only: long repeated sentences usually indicate headers or
    # footers duplicated by the PDF extraction; they are printed, not removed.
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)

    # Split on progressively finer separators until chunks fit chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=overlap,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    

    chunks = text_splitter.split_text(text = all_text)
    # NOTE(review): shadows the module-level embedding_function, so the model
    # is loaded twice — consider reusing the global instance.
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Random 4-digit name for the persistence directory of this index.
    cname = str(random.randint(1000, 9999))
    db = Chroma.from_texts(chunks, embedding_function, persist_directory=cname)
    print(''' Index is created ⛹️''')
    return db

# Bare string literals below are rendered as Markdown by Streamlit "magic".
"""

# Information Retrieval (IR) Tool

This tool is your small local and sweet 🍬 Google for the document
"""


# Vector store handle; stays None until a PDF has been uploaded and indexed.
db = None

uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf")
if uploaded_file is not None:
    # Index the uploaded PDF with 500-char chunks and 100-char overlap.
    db = create_vdb(uploaded_file, 500, 100)
    st.write("filename:", uploaded_file.name)
    st.write(uploaded_file)
else:
    st.info("Please upload a PDF file.")


# Fixed mojibake: the original literal began with the bytes of the OK-hand
# emoji mis-decoded as "πŸ‘Œ".
"""👌  You did. Now write your keyword, or question or anything to search in the document"""

# q = 'leak' # @param {type:"string"}
q = st.text_input('Write your question or keywords', 'HVAC', key='q1')
st.write(f'Your question is: {q}')

# Run the similarity search only once an index exists.
if db:
    answers = []
    pgnos = []  # page numbers — currently unused (the metadata line below is commented out)
    # Fetch the 10 most similar chunks, ordered most → least relevant.
    for  d in db.similarity_search(q, k=10):
      answers.append(d.page_content)
      #pgnos.append(d.metadata['pdf_page_number'])
    results = pd.DataFrame({'Cite':answers})
    st.markdown('## Results')
    st.dataframe(results)

# Section header for the generative chat/QA tool. Mojibake emojis from the
# original ("πŸ†“ πŸš—" and "πŸ”ͺ") restored to 🆓 🚗 and 🔪; grammar of the
# user-facing text repaired.
st.markdown(
"""The order of results is based on relevance, from most to least. Like Google

# Chat Tool (Humble) 🆓 🚗

This tool allows you to ask a direct question and get an answer, like my rival 🔪 ChatGPT
""")


def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer *question* from the documents most relevant to it.

    Retrieves the nearest chunks from *vector_store* and runs them, together
    with the question, through *qa_chain*.

    Args:
        question: Natural-language question text.
        vector_store: Object exposing ``similarity_search(question)``.
        qa_chain: LangChain QA chain exposing ``run(input_documents=..., question=...)``.

    Returns:
        The chain's answer.
    """
    context_docs = vector_store.similarity_search(question)
    return qa_chain.run(input_documents=context_docs, question=question)


# Question box for the generative QA tool (rendered before the db check so
# the widget keeps its state across Streamlit reruns).
explicit_question = st.text_input('Write your question', 'Roof problems', key='q2')
if db:
    # Cap on the generated sequence length for the HF pipeline.
    max_length=1024
    model_name = "declare-lab/flan-alpaca-large"
    # NOTE(review): the model is loaded on every Streamlit rerun while a PDF
    # is present — consider caching (e.g. st.cache_resource) to avoid reloads.
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text2text-generation",
        model_kwargs={"max_length": max_length},
    )
    # "stuff" chain: concatenates all retrieved docs into a single prompt.
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    
    # explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
   
    st.markdown("Answer:")
    st.write(ask_to_alpaca(explicit_question, db, qa_chain))
else:
    st.markdown('Please upload a PDF file.')