reyemhorts committed on
Commit
d895362
1 Parent(s): c7b7044

first commit

Browse files
Files changed (3) hide show
  1. app.py +53 -0
  2. load_db.py +51 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from pypdf import PdfReader
4
+ from typing import Optional
5
+ import json
6
+
7
+ from load_db import load_vectorestore_from_pdf
8
+
9
+
10
# Scratch file that receives the bytes of the most recently uploaded PDF.
TEMP_PDF_PATH = "temp.pdf"

# Module-level state. `retriever` and `db` are rebound by load_vectore_store();
# `documents` is not assigned anywhere else in the visible code.
retriever = db = documents = None
14
+
15
def pdf_to_text(file_path: str, page_num: Optional[int] = None):
    """Extract the text of a PDF file.

    Args:
        file_path: Path of the PDF to read.
        page_num: 1-based number of a single page to extract. When ``None``
            (the default), the text of all pages is concatenated.

    Returns:
        The extracted text as a single string.

    Raises:
        IndexError: If ``page_num`` is given but does not lie within
            ``1..number of pages``.
    """
    reader = PdfReader(file_path)
    pages = reader.pages

    if page_num is not None:
        # Validate explicitly: a plain truthiness test would treat 0 like
        # "no page given", and negative values would silently wrap around
        # to pages counted from the end of the document.
        if not 1 <= page_num <= len(pages):
            raise IndexError(
                f"page_num must be between 1 and {len(pages)}, got {page_num}"
            )
        return pages[page_num - 1].extract_text()

    # Join once instead of growing a string with repeated "+=".
    return "".join(page.extract_text() for page in pages)
24
+
25
def load_vectore_store():
    """Rebuild the module-level vector store and retriever from the temp PDF.

    Calls ``load_vectorestore_from_pdf`` on ``TEMP_PDF_PATH`` with
    ``persist=False`` and rebinds the globals ``db`` (the returned store)
    and ``retriever`` (the store's retriever, configured with ``k=4``).
    """
    global retriever, db
    store = load_vectorestore_from_pdf(TEMP_PDF_PATH, persist=False)
    db = store
    retriever = store.as_retriever(search_kwargs={"k": 4})
29
+
30
def load_pdf(inp):
    """Store an uploaded PDF, index it, and return its text.

    Args:
        inp: Raw bytes of the uploaded PDF, or ``None`` when nothing has
            been uploaded.

    Returns:
        The text extracted from the PDF, or an empty string when there is
        no upload to process.
    """
    if not inp:
        # Nothing to load. Opening the temp file first would truncate the
        # previously stored PDF and then fail on write(None).
        return ""

    # Write the uploaded bytes back to disk so the PDF tooling can read them.
    with open(TEMP_PDF_PATH, "wb") as pdf_file:
        pdf_file.write(inp)

    # Extract the text first, then rebuild the vector store from the same file.
    text = pdf_to_text(TEMP_PDF_PATH)
    load_vectore_store()
    return text
39
+
40
+
41
# Gradio UI: upload a PDF, press "Load", and show the extracted text inside
# a collapsed accordion.
with gr.Blocks() as app:
    file = gr.File(type="binary")
    load_file_button = gr.Button("Load")
    with gr.Accordion("Modulhandbuch anzeigen", open=False):
        handbook = gr.TextArea(label="Modulhandbuch")

    # The uploaded bytes go to load_pdf; its return value fills the text area.
    load_file_button.click(fn=load_pdf, inputs=file, outputs=handbook)


if __name__ == "__main__":
    app.launch(debug=True)
load_db.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ #from langchain.embeddings import HuggingFaceEmbeddings
3
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
4
+
5
+ from langchain.vectorstores import Chroma
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.llms import OpenAI
8
+ from langchain.chains import ConversationalRetrievalChain, RetrievalQA
9
+ from langchain.chat_models import ChatOpenAI
10
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
11
+ from langchain.document_loaders import TextLoader, PyPDFLoader
12
+ from typing import Optional
13
+ import os
14
+
15
+
16
# Load environment variables (e.g. OPENAI_API_KEY) via python-dotenv before
# they are read below.
load_dotenv()

# Configuration for the embedding model and the vector store.
embeddings_model_name = "multi-qa-MiniLM-L6-cos-v1"
persist_directory = "db"
target_source_chunks = 4
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared embedding function; also the default for load_vectorestore_from_pdf.
embeddings = SentenceTransformerEmbeddings(model_name=embeddings_model_name)
26
+
27
+
28
def load_vectorestore_from_pdf(path:str, embeddings=embeddings, persist:Optional[bool]=True):
    """Build a Chroma vector store from the contents of a PDF.

    The PDF is loaded with ``PyPDFLoader``, split with a
    ``CharacterTextSplitter`` (chunk size 1000, no overlap) and embedded
    into a Chroma store.

    Args:
        path: Path of the PDF to index.
        embeddings: Embedding function handed to Chroma; defaults to the
            module-level ``embeddings`` object.
        persist: When truthy, the store is created in ``persist_directory``
            and persisted. Otherwise it is created with
            ``persist_directory=None``.

    Returns:
        The Chroma store when ``persist`` is falsy, otherwise ``None``.
    """
    pages = PyPDFLoader(path).load()
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    if persist:
        # Persisting variant: write the store to disk and hand nothing back.
        store = Chroma.from_documents(
            chunks, embeddings, persist_directory=persist_directory
        )
        store.persist()
        return None

    return Chroma.from_documents(chunks, embeddings, persist_directory=None)
48
+
49
+
50
if __name__ == "__main__":
    # Script entry point: build and persist a vector store for the PDF whose
    # path is given on the command line. load_vectorestore_from_pdf requires
    # a path, so calling it without one can never succeed.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: python load_db.py <path-to-pdf>")
    load_vectorestore_from_pdf(sys.argv[1])
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
pypdf
sentence-transformers
openai
gradio
langchain
# Imported by load_db.py: the Chroma vector store backend and `dotenv`.
chromadb
python-dotenv