dragonSwing committed on
Commit
c3629c7
1 Parent(s): 43c222e

Add application file

Browse files
Files changed (4) hide show
  1. .gitignore +195 -0
  2. app.py +127 -0
  3. requirements.txt +13 -0
  4. style.css +25 -0
.gitignore ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ ### VisualStudioCode ###
177
+ .vscode/*
178
+ !.vscode/settings.json
179
+ !.vscode/tasks.json
180
+ !.vscode/launch.json
181
+ !.vscode/extensions.json
182
+ !.vscode/*.code-snippets
183
+
184
+ # Local History for Visual Studio Code
185
+ .history/
186
+
187
+ # Built Visual Studio Code Extensions
188
+ *.vsix
189
+
190
+ ### VisualStudioCode Patch ###
191
+ # Ignore all local history of files
192
+ .history
193
+ .ionide
194
+
195
+ # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain.chains.question_answering import load_qa_chain
3
+ from langchain.document_loaders import UnstructuredFileLoader
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.llms import OpenAI
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.vectorstores import FAISS
8
+ from pypdf import PdfReader
9
+ import mimetypes
10
+ import validators
11
+ import requests
12
+ import tempfile
13
+ import gradio as gr
14
+
15
+
16
def get_empty_state():
    """Return a fresh state dict in which no knowledge base is loaded yet."""
    initial_state = dict(knowledge_base=None)
    return initial_state
18
+
19
+
20
def on_token_change(user_token):
    """Store the OpenAI API key entered in the UI in the process environment.

    Surrounding whitespace (for example a trailing newline picked up while
    pasting) is stripped before the key is stored. An empty or missing value
    removes ``OPENAI_API_KEY`` instead of leaving an empty string behind.

    Args:
        user_token: Current content of the API-key textbox; may be empty.

    NOTE(review): ``os.environ`` is process-wide, so the most recently entered
    key is used by every request this process serves -- confirm that this is
    acceptable for the deployment.
    """
    token = (user_token or "").strip()
    if token:
        os.environ["OPENAI_API_KEY"] = token
    else:
        # Nothing usable was entered: drop any previously stored key.
        os.environ.pop("OPENAI_API_KEY", None)
22
+
23
+
24
def create_knowledge_base(docs):
    """Split documents into overlapping chunks and index them in a FAISS store.

    Args:
        docs: Documents as returned by a loader's ``load()`` call.

    Returns:
        The FAISS vector store built from the embedded chunks.

    Raises:
        ValueError: If splitting produces no chunks, i.e. there is nothing
            to index.
    """
    # Split on newlines into chunks of at most 1000 characters, with a
    # 200-character overlap between neighbouring chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
    )
    chunks = text_splitter.split_documents(docs)
    if not chunks:
        # Fail early with a readable message; presumably an empty chunk list
        # would otherwise fail later while the index is being built.
        raise ValueError("No text could be extracted from the file")

    # Embed the chunks and build the similarity-search index.
    embeddings = OpenAIEmbeddings()
    return FAISS.from_documents(chunks, embeddings)
35
+
36
+
37
def upload_file(file_obj):
    """Index an uploaded file and return its path together with the new state.

    Args:
        file_obj: Upload handle; only its ``name`` attribute (used as the
            file path) is read.

    Returns:
        A ``(path, state)`` tuple, where ``state`` maps ``"knowledge_base"``
        to the knowledge base built from the file.
    """
    path = file_obj.name
    documents = UnstructuredFileLoader(path, strategy="fast").load()
    return path, {"knowledge_base": create_knowledge_base(documents)}
47
+
48
+
49
def upload_via_url(url):
    """Download a file from ``url``, index it and return its local path and state.

    Args:
        url: Address of the file to fetch.

    Returns:
        A ``(path, state)`` tuple: the path of the temporary copy of the
        download and a dict mapping ``"knowledge_base"`` to the knowledge base
        built from it.

    Raises:
        ValueError: If ``url`` is not a valid URL or the server does not
            answer with status code 200.
    """
    if not validators.url(url):
        raise ValueError("Please enter a valid URL")

    # NOTE(review): the URL is user-supplied and fetched from the server side;
    # consider restricting the allowed hosts/schemes if this is exposed publicly.
    # The timeout keeps an unresponsive host from blocking the request forever.
    r = requests.get(url, timeout=30)

    if r.status_code != 200:
        raise ValueError(
            "Check the url of your file; returned status code %s" % r.status_code
        )

    # The header may be absent or carry parameters ("text/plain; charset=utf-8");
    # only the bare MIME type can be mapped to a file extension.
    content_type = r.headers.get("content-type") or ""
    mime_type = content_type.split(";", 1)[0].strip()
    file_extension = mimetypes.guess_extension(mime_type)

    # Close the file before loading so the content is fully written to disk.
    # delete=False keeps it around because its path is returned to the caller.
    with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
        temp_file.write(r.content)
        file_path = temp_file.name

    loader = UnstructuredFileLoader(file_path, strategy="fast")
    docs = loader.load()
    knowledge_base = create_knowledge_base(docs)
    return file_path, {"knowledge_base": knowledge_base}
71
+
72
+
73
def answer_question(question, state):
    """Answer ``question`` from the knowledge base held in ``state``.

    Args:
        question: The user's question text.
        state: Dict whose ``"knowledge_base"`` entry is the vector store, or a
            falsy value when no file has been indexed yet.

    Returns:
        The answer produced by the QA chain, or a prompt asking the user to
        upload a file when no knowledge base is available.
    """
    store = state["knowledge_base"]
    if not store:
        return "Please upload a file first"

    # Fetch the chunks most similar to the question, then pass them to the
    # LLM as context via a "stuff" QA chain.
    matches = store.similarity_search(question)
    qa_chain = load_qa_chain(OpenAI(temperature=0.4), chain_type="stuff")
    return qa_chain.run(input_documents=matches, question=question)
84
+
85
+
86
# Build the UI: API-key box, file source (URL or upload), question and answer.
# NOTE(review): the widget nesting below was reconstructed from a listing that
# had lost its indentation -- confirm it matches the intended layout.
with gr.Blocks(css="style.css") as demo:
    # State passed from the upload handlers to the question handler.
    state = gr.State(get_empty_state())

    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """
            # Ask your PDF 💬
            """
        )
        user_token = gr.Textbox(
            value="",
            label="OpenAI API Key",
            placeholder="OpenAI API Key",
            type="password",
            show_label=True,
        )

        gr.Markdown("**Upload your file**")
        with gr.Row(elem_id="row-flex"):
            # Wide column: fetch the file from a URL.
            with gr.Column(scale=3):
                file_url = gr.Textbox(
                    value="",
                    label="Upload your file",
                    placeholder="Enter a url",
                    show_label=False,
                )
            # Narrow column: pick a local file instead.
            with gr.Column(scale=1, min_width=160):
                upload_button = gr.UploadButton(
                    "Browse File", file_types=[".txt", ".pdf", ".doc", ".docx"]
                )
        file_output = gr.File()

        user_question = gr.Textbox(value="", label="Ask a question about your file:")
        answer = gr.Textbox(value="", label="Answer:")
        gr.Examples(
            examples=[
                "What is the main topic of the file?",
                "Who is the author of the file?",
            ],
            inputs=user_question,
        )

    # Event wiring: both upload paths fill the file preview and the state.
    file_url.submit(fn=upload_via_url, inputs=file_url, outputs=[file_output, state])
    upload_button.upload(
        fn=upload_file, inputs=upload_button, outputs=[file_output, state]
    )
    user_token.change(fn=on_token_change, inputs=[user_token], outputs=[])
    user_question.submit(
        fn=answer_question, inputs=[user_question, state], outputs=[answer]
    )

demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ tiktoken
3
+ langchain
4
+ gradio
5
+ pypdf
6
+ requests
7
+ unstructured
8
+ validators
9
+ pytesseract
10
+ pdf2image
11
+ tabulate
12
+ nltk
13
+ python-dotenv
style.css ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Main column: capped width, centered horizontally. */
#col-container {
  margin-right: auto;
  margin-left: auto;
  max-width: 600px;
}

/* Row holding the URL box and the upload button: center its children. */
#row-flex {
  display: flex;
  justify-content: center;
  align-items: center;
}

/* Links: underlined, semi-bold, dark grey in every state. */
a,
a:hover,
a:visited {
  color: #1f2937 !important;
  font-weight: 600;
  text-decoration-line: underline;
}

/* Links under the .dark class: light grey instead. */
.dark a,
.dark a:hover,
.dark a:visited {
  color: #f3f4f6 !important;
}