Spaces:
Build error
Build error
ankur-bohra
committed on
Commit
•
0d99179
1
Parent(s):
901322a
Add basic structure
Browse files- .gitattributes +2 -0
- .gitignore +3 -0
- Dockerfile +26 -0
- app.py +366 -0
- categories/__init__.py +192 -0
- categories/accomodation/__init__.py +41 -0
- categories/accomodation/model.py +29 -0
- categories/random_/__init__.py +134 -0
- categories/random_/model.py +82 -0
- categories/travel_cab/__init__.py +37 -0
- categories/travel_cab/model.py +19 -0
- categories/travel_flight/__init__.py +23 -0
- categories/travel_flight/model.py +30 -0
- categories/vendor/__init__.py +38 -0
- categories/vendor/model.py +46 -0
- environment.yml +181 -0
- examples/example1.pdf +3 -0
- examples/rotated.jpeg +3 -0
- examples/rotated.pdf +3 -0
- examples/upright.jpeg +3 -0
- examples/upright.pdf +3 -0
- extract.py +67 -0
- main.py +66 -0
- packages.txt +1 -0
- processing.py +171 -0
- requirements.txt +124 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.conda
|
2 |
+
temp*
|
3 |
+
__pycache__/
|
Dockerfile
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM continuumio/miniconda3

WORKDIR /code

# Create the environment:
COPY ./environment.yml /code/environment.yml

RUN conda config --set channel_priority strict
RUN conda config --add channels conda-forge
RUN conda env create -f environment.yml

# Make RUN commands use the new environment:
# NOTE(review): assumes environment.yml declares `name: env` -- confirm.
SHELL ["conda", "run", "-n", "env", "/bin/bash", "-c"]

# requirements.txt must be inside the image before pip can read it; up to this
# point only environment.yml has been copied, so the install step would fail
# with "Could not open requirements file".
COPY ./requirements.txt /code/requirements.txt
RUN pip install -r requirements.txt

# Demonstrate the environment is activated:
RUN echo "Making sure installation worked:"
RUN python -c "import gradio, pypdf, pdf2image, langchain, openai, datasets"

# Copy the application code last so that code changes do not invalidate the
# dependency layers above.
COPY . .

# The code to run when container is started:

ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "env", "python", "app.py"]
|
26 |
+
|
app.py
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import os
|
3 |
+
from io import BytesIO
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
from langchain.schema.output_parser import OutputParserException
|
7 |
+
import gradio as gr
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
import categories
|
11 |
+
from categories import Category
|
12 |
+
from main import process_image, process_pdf
|
13 |
+
|
14 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
15 |
+
PDF_IFRAME = """
|
16 |
+
<div style="border-radius: 10px; width: 100%; overflow: hidden;">
|
17 |
+
<iframe
|
18 |
+
src="data:application/pdf;base64,{0}"
|
19 |
+
width="100%"
|
20 |
+
height="400"
|
21 |
+
type="application/pdf">
|
22 |
+
</iframe>
|
23 |
+
</div>"""
|
24 |
+
|
25 |
+
hf_writer_normal = gr.HuggingFaceDatasetSaver(
|
26 |
+
HF_TOKEN, "automatic-reimbursement-tool-demo", separate_dirs=False
|
27 |
+
)
|
28 |
+
hf_writer_incorrect = gr.HuggingFaceDatasetSaver(
|
29 |
+
HF_TOKEN, "automatic-reimbursement-tool-demo-incorrect", separate_dirs=False
|
30 |
+
)
|
31 |
+
# with open("examples/example1.pdf", "rb") as pdf_file:
|
32 |
+
# base64_pdf = base64.b64encode(pdf_file.read())
|
33 |
+
|
34 |
+
|
35 |
+
# example_paths = []
|
36 |
+
# current_file_path = None
|
37 |
+
|
38 |
+
# def ignore_examples(function):
|
39 |
+
# def new_function(*args, **kwargs):
|
40 |
+
# global example_paths, current_file_path
|
41 |
+
# if current_file_path not in example_paths:
|
42 |
+
# return function(*args, **kwargs)
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
def display_file(input_file):
    """Show a preview of the uploaded receipt in the matching preview widget.

    Args:
        input_file: The value of the file input. It is expected to expose the
            path of the uploaded file as ``.name``; a falsy value means the
            input is empty.

    Returns:
        A pair of component updates ``(pdf_preview, image_preview)``. Both
        previews are hidden when there is no file; otherwise exactly one of
        them is made visible, depending on whether the file is a PDF.
    """
    global current_file_path
    current_file_path = input_file.name if input_file else None
    if not input_file:
        return gr.HTML.update(visible=False), gr.Image.update(visible=False)

    file_path = input_file.name
    # Compare the extension case-insensitively so "RECEIPT.PDF" is previewed as
    # a PDF instead of being handed to the image component.
    if file_path.lower().endswith(".pdf"):
        # Use a dedicated name for the handle rather than rebinding the
        # ``input_file`` parameter.
        with open(file_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode()
        return (
            gr.HTML.update(PDF_IFRAME.format(pdf_base64), visible=True),
            gr.Image.update(visible=False),
        )

    return (
        gr.HTML.update(visible=False),
        gr.Image.update(file_path, visible=True),
    )
|
62 |
+
|
63 |
+
|
64 |
+
def show_intermediate_outputs(show_intermediate):
    """Return an accordion update whose visibility follows the checkbox state.

    Args:
        show_intermediate: Truthy to reveal the intermediate-outputs
            accordion, falsy to hide it.

    Returns:
        A ``gr.Accordion`` update with ``visible`` set to ``True`` or ``False``.
    """
    is_visible = bool(show_intermediate)
    return gr.Accordion.update(visible=is_visible)
|
69 |
+
|
70 |
+
|
71 |
+
def show_share_contact(share_result):
    """Return a textbox update that shows the contact field only when sharing.

    Args:
        share_result: State of the "Share results" checkbox; passed through
            unchanged as the ``visible`` flag.
    """
    contact_update = gr.Textbox.update(visible=share_result)
    return contact_update
|
73 |
+
|
74 |
+
|
75 |
+
def clear_inputs():
    """Return a file-input update that resets its value to ``None``."""
    cleared_file = gr.File.update(value=None)
    return cleared_file
|
77 |
+
|
78 |
+
|
79 |
+
def submit(input_file, old_text):
    """Extract the text of the uploaded receipt.

    Args:
        input_file: The value of the file input. It is expected to expose the
            path of the uploaded file as ``.name``; a falsy value means
            nothing was uploaded.
        old_text: Current content of the extracted-text box. Unused; accepted
            because the click event passes it as an input.

    Returns:
        Whatever ``process_pdf`` (for ``.pdf`` files) or ``process_image``
        (for everything else) returns when called with ``extract_only=True``.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if not input_file:
        # gr.Error is an exception class: the message only reaches the user
        # when it is raised. Merely constructing it has no visible effect.
        raise gr.Error("Please upload a file to continue!")

    source_path = Path(input_file.name)
    # Send change to preprocessed image or to extracted text.
    # The extension is compared case-insensitively so "*.PDF" uploads are not
    # routed to the image pipeline.
    if input_file.name.lower().endswith(".pdf"):
        return process_pdf(source_path, extract_only=True)
    return process_image(source_path, extract_only=True)
|
90 |
+
|
91 |
+
|
92 |
+
def categorize_extracted_text(extracted_text):
    """Return the category that ``categories.categorize_text`` assigns to the text.

    Args:
        extracted_text: The text to categorize.
    """
    return categories.categorize_text(extracted_text)
|
96 |
+
|
97 |
+
|
98 |
+
def extract_from_category(category, extracted_text):
    """Run the extraction chain of ``category`` on the extracted text.

    Args:
        category: Name of a ``Category`` member, or a falsy value when no
            category is selected.
        extracted_text: The text handed to the chain's prompt as ``text``.

    Returns:
        A 4-tuple of component updates for the chatbot, the extracted
        information JSON, and the two flag buttons. Without a category the
        chatbot and JSON are cleared and both buttons are disabled. Otherwise
        the chatbot shows the prompt and the raw model answer, the JSON shows
        the parsed information (or an error description if parsing failed),
        and both buttons are enabled.

    Raises:
        KeyError: If ``category`` is not the name of a ``Category`` member.
    """
    if not category:
        return (
            gr.Chatbot.update(None),
            gr.JSON.update(None),
            gr.Button.update(interactive=False),
            gr.Button.update(interactive=False),
        )

    chain = categories.category_modules[Category[category]].chain
    # The same instructions feed both the displayed prompt and the actual
    # request, so compute them once.
    format_instructions = chain.output_parser.get_format_instructions()
    formatted_prompt = chain.prompt.format_prompt(
        text=extracted_text,
        format_instructions=format_instructions,
    )
    result = chain.generate(
        input_list=[
            {
                "text": extracted_text,
                "format_instructions": format_instructions,
            }
        ]
    )

    # Label each message by its own role instead of by its position: the
    # category prompts are built as [system, human], so labelling index 1 as
    # "System" and index 0 as "Human" showed the two swapped.
    sections = []
    for message in formatted_prompt.messages:
        speaker = "System" if message.type == "system" else "Human"
        sections.append(f"**{speaker}:**\n{message.content}")
    question = "\n\n".join(sections)

    answer = result.generations[0][0].text
    try:
        parsed = chain.output_parser.parse_with_prompt(answer, formatted_prompt)
        information = parsed.json() if parsed else {}
    except OutputParserException as e:
        # Surface the parsing failure in the JSON panel rather than crashing
        # the event handler.
        information = {
            "error": "Unable to parse chatbot output",
            "details": str(e),
            "output": e.llm_output,
        }
    return (
        gr.Chatbot.update([[question, answer]]),
        gr.JSON.update(information),
        gr.Button.update(interactive=True),
        gr.Button.update(interactive=True),
    )
|
141 |
+
|
142 |
+
|
143 |
+
def dynamic_auto_flag(flag_method):
    """Wrap ``flag_method`` so that it only runs when sharing is enabled.

    Args:
        flag_method: Callable to invoke with the remaining arguments.

    Returns:
        A function taking ``share_result`` followed by the arguments for
        ``flag_method``. It calls ``flag_method`` only when ``share_result``
        is truthy and always returns ``None``.
    """

    def modified_flag_method(share_result, *args, **kwargs):
        # Guard clause: nothing is flagged unless the user opted in.
        if not share_result:
            return None
        flag_method(*args, **kwargs)
        return None

    return modified_flag_method
|
149 |
+
|
150 |
+
|
151 |
+
# def save_example_and_submit(input_file):
|
152 |
+
# example_paths.append(input_file.name)
|
153 |
+
# submit(input_file, "")
|
154 |
+
|
155 |
+
|
156 |
+
with gr.Blocks(title="Automatic Reimbursement Tool Demo") as page:
|
157 |
+
gr.Markdown("<center><h1>Automatic Reimbursement Tool Demo</h1></center>")
|
158 |
+
gr.Markdown("<h2>Description</h2>")
|
159 |
+
gr.Markdown(
|
160 |
+
"The reimbursement filing process can be time-consuming and cumbersome, causing "
|
161 |
+
"frustration for faculty members and finance departments. Our project aims to "
|
162 |
+
"automate the information extraction involved in the process by feeding "
|
163 |
+
"extracted text to language models such as ChatGPT. This demo showcases the "
|
164 |
+
"categorization and extraction parts of the pipeline. Categorization is done "
|
165 |
+
"to identify the relevant details associated with the text, after which "
|
166 |
+
"extraction is done for those details using a language model."
|
167 |
+
)
|
168 |
+
gr.Markdown("<h2>Try it out!</h2>")
|
169 |
+
with gr.Box() as demo:
|
170 |
+
with gr.Row():
|
171 |
+
with gr.Column(variant="panel"):
|
172 |
+
gr.HTML(
|
173 |
+
'<div><center style="color:rgb(200, 200, 200);">Input</center></div>'
|
174 |
+
)
|
175 |
+
pdf_preview = gr.HTML(label="Preview", show_label=True, visible=False)
|
176 |
+
image_preview = gr.Image(
|
177 |
+
label="Preview", show_label=True, visible=False, height=350
|
178 |
+
)
|
179 |
+
input_file = gr.File(
|
180 |
+
label="Input receipt",
|
181 |
+
show_label=True,
|
182 |
+
type="file",
|
183 |
+
file_count="single",
|
184 |
+
file_types=["image", ".pdf"],
|
185 |
+
)
|
186 |
+
input_file.change(
|
187 |
+
display_file, input_file, [pdf_preview, image_preview]
|
188 |
+
)
|
189 |
+
|
190 |
+
with gr.Row():
|
191 |
+
clear = gr.Button("Clear", variant="secondary")
|
192 |
+
submit_button = gr.Button("Submit", variant="primary")
|
193 |
+
|
194 |
+
show_intermediate = gr.Checkbox(
|
195 |
+
False,
|
196 |
+
label="Show intermediate outputs",
|
197 |
+
info="There are several intermediate steps in the process such as preprocessing, OCR, chatbot interaction. You can choose to show their results here.",
|
198 |
+
)
|
199 |
+
share_result = gr.Checkbox(
|
200 |
+
True,
|
201 |
+
label="Share results",
|
202 |
+
info="Sharing your result with us will help us immensely in improving this tool.",
|
203 |
+
interactive=True,
|
204 |
+
)
|
205 |
+
contact = gr.Textbox(
|
206 |
+
type="email",
|
207 |
+
label="Contact",
|
208 |
+
interactive=True,
|
209 |
+
placeholder="Enter your email address",
|
210 |
+
info="Optionally, enter your email address to allow us to contact you regarding your result.",
|
211 |
+
visible=True,
|
212 |
+
)
|
213 |
+
share_result.change(show_share_contact, share_result, [contact])
|
214 |
+
|
215 |
+
with gr.Column(variant="panel"):
|
216 |
+
gr.HTML(
|
217 |
+
'<div><center style="color:rgb(200, 200, 200);">Output</center></div>'
|
218 |
+
)
|
219 |
+
category = gr.Dropdown(
|
220 |
+
value=None,
|
221 |
+
choices=Category.__members__.keys(),
|
222 |
+
label=f"Recognized category ({', '.join(Category.__members__.keys())})",
|
223 |
+
show_label=True,
|
224 |
+
interactive=False,
|
225 |
+
)
|
226 |
+
intermediate_outputs = gr.Accordion(
|
227 |
+
"Intermediate outputs", open=True, visible=False
|
228 |
+
)
|
229 |
+
with intermediate_outputs:
|
230 |
+
extracted_text = gr.Textbox(
|
231 |
+
label="Extracted text",
|
232 |
+
show_label=True,
|
233 |
+
max_lines=5,
|
234 |
+
show_copy_button=True,
|
235 |
+
lines=5,
|
236 |
+
interactive=False,
|
237 |
+
)
|
238 |
+
chatbot = gr.Chatbot(
|
239 |
+
None,
|
240 |
+
label="Chatbot interaction",
|
241 |
+
show_label=True,
|
242 |
+
interactive=False,
|
243 |
+
height=240,
|
244 |
+
)
|
245 |
+
information = gr.JSON(label="Extracted information")
|
246 |
+
with gr.Row():
|
247 |
+
flag_incorrect_button = gr.Button(
|
248 |
+
"Flag as incorrect", variant="stop", interactive=True
|
249 |
+
)
|
250 |
+
flag_irrelevant_button = gr.Button(
|
251 |
+
"Flag as irrelevant", variant="stop", interactive=True
|
252 |
+
)
|
253 |
+
|
254 |
+
show_intermediate.change(
|
255 |
+
show_intermediate_outputs, show_intermediate, [intermediate_outputs]
|
256 |
+
)
|
257 |
+
|
258 |
+
clear.click(clear_inputs, None, [input_file])
|
259 |
+
submit_button.click(
|
260 |
+
submit,
|
261 |
+
[input_file, extracted_text],
|
262 |
+
[extracted_text],
|
263 |
+
)
|
264 |
+
# On submit, clear the previous category, chatbot and extracted information
# when a file is present; otherwise leave all three unchanged.
submit_button.click(
    lambda input_file, category, chatbot, information: (
        gr.Dropdown.update(None),
        gr.Chatbot.update(None),
        # ``information`` is a gr.JSON component, so reset it with the
        # matching update type (as extract_from_category does).
        gr.JSON.update(None),
    ) if input_file else (category, chatbot, information),
    [input_file, category, chatbot, information],
    [category, chatbot, information],
)
|
273 |
+
extracted_text.change(
|
274 |
+
categorize_extracted_text,
|
275 |
+
[extracted_text],
|
276 |
+
[category],
|
277 |
+
)
|
278 |
+
category.change(
|
279 |
+
extract_from_category,
|
280 |
+
[category, extracted_text],
|
281 |
+
[chatbot, information, flag_incorrect_button, flag_irrelevant_button],
|
282 |
+
)
|
283 |
+
|
284 |
+
hf_writer_normal.setup(
|
285 |
+
[input_file, extracted_text, category, chatbot, information, contact],
|
286 |
+
flagging_dir="flagged",
|
287 |
+
)
|
288 |
+
flag_method = gr.flagging.FlagMethod(
|
289 |
+
hf_writer_normal, "", "", visual_feedback=True
|
290 |
+
)
|
291 |
+
information.change(
|
292 |
+
dynamic_auto_flag(flag_method),
|
293 |
+
inputs=[
|
294 |
+
share_result,
|
295 |
+
input_file,
|
296 |
+
extracted_text,
|
297 |
+
category,
|
298 |
+
chatbot,
|
299 |
+
information,
|
300 |
+
contact,
|
301 |
+
],
|
302 |
+
outputs=None,
|
303 |
+
preprocess=False,
|
304 |
+
queue=False,
|
305 |
+
)
|
306 |
+
|
307 |
+
hf_writer_incorrect.setup(
|
308 |
+
[input_file, extracted_text, category, chatbot, information, contact],
|
309 |
+
flagging_dir="flagged_incorrect",
|
310 |
+
)
|
311 |
+
flag_incorrect_method = gr.flagging.FlagMethod(
|
312 |
+
hf_writer_incorrect,
|
313 |
+
"Flag as incorrect",
|
314 |
+
"Incorrect",
|
315 |
+
visual_feedback=True,
|
316 |
+
)
|
317 |
+
flag_incorrect_button.click(
|
318 |
+
lambda: gr.Button.update(value="Saving...", interactive=False),
|
319 |
+
None,
|
320 |
+
flag_incorrect_button,
|
321 |
+
queue=False,
|
322 |
+
)
|
323 |
+
flag_incorrect_button.click(
|
324 |
+
flag_incorrect_method,
|
325 |
+
inputs=[
|
326 |
+
input_file,
|
327 |
+
extracted_text,
|
328 |
+
category,
|
329 |
+
chatbot,
|
330 |
+
information,
|
331 |
+
contact,
|
332 |
+
],
|
333 |
+
outputs=[flag_incorrect_button],
|
334 |
+
preprocess=False,
|
335 |
+
queue=False,
|
336 |
+
)
|
337 |
+
|
338 |
+
flag_irrelevant_method = gr.flagging.FlagMethod(
|
339 |
+
hf_writer_incorrect,
|
340 |
+
"Flag as irrelevant",
|
341 |
+
"Irrelevant",
|
342 |
+
visual_feedback=True,
|
343 |
+
)
|
344 |
+
flag_irrelevant_button.click(
|
345 |
+
lambda: gr.Button.update(value="Saving...", interactive=False),
|
346 |
+
None,
|
347 |
+
flag_irrelevant_button,
|
348 |
+
queue=False,
|
349 |
+
)
|
350 |
+
flag_irrelevant_button.click(
|
351 |
+
flag_irrelevant_method,
|
352 |
+
inputs=[
|
353 |
+
input_file,
|
354 |
+
extracted_text,
|
355 |
+
category,
|
356 |
+
chatbot,
|
357 |
+
information,
|
358 |
+
contact,
|
359 |
+
],
|
360 |
+
outputs=[flag_irrelevant_button],
|
361 |
+
preprocess=False,
|
362 |
+
queue=False,
|
363 |
+
)
|
364 |
+
|
365 |
+
|
366 |
+
page.launch(show_api=True, show_error=True, debug=True)
|
categories/__init__.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from enum import Enum
|
2 |
+
from typing import Union
|
3 |
+
|
4 |
+
# from . import vendor
|
5 |
+
from langchain.chains import LLMChain
|
6 |
+
from langchain.chat_models import ChatOpenAI
|
7 |
+
from langchain.output_parsers import PydanticOutputParser
|
8 |
+
from langchain.output_parsers.enum import EnumOutputParser
|
9 |
+
from langchain.prompts import (ChatPromptTemplate, HumanMessagePromptTemplate,
|
10 |
+
SystemMessagePromptTemplate)
|
11 |
+
from pydantic import BaseModel
|
12 |
+
|
13 |
+
from . import accomodation, random_, travel_cab, travel_flight
|
14 |
+
|
15 |
+
|
16 |
+
class Category(Enum):
    """Bill categories known to the tool; each member's value equals its name.

    The members are the keys of ``category_modules`` and the enum handed to
    the ``EnumOutputParser`` of the categorizing chain.
    """

    ACCOMODATION = "ACCOMODATION"
    TRAVEL_FLIGHT = "TRAVEL_FLIGHT"
    TRAVEL_CAB = "TRAVEL_CAB"
    # VENDOR = "VENDOR"
    RANDOM = "RANDOM"
|
22 |
+
|
23 |
+
|
24 |
+
category_modules = {
|
25 |
+
Category.ACCOMODATION: accomodation,
|
26 |
+
Category.TRAVEL_FLIGHT: travel_flight,
|
27 |
+
Category.TRAVEL_CAB: travel_cab,
|
28 |
+
# Category.VENDOR: vendor,
|
29 |
+
Category.RANDOM: random_,
|
30 |
+
}
|
31 |
+
|
32 |
+
model = ChatOpenAI(
|
33 |
+
temperature=0,
|
34 |
+
n=1,
|
35 |
+
# max_tokens=300,
|
36 |
+
model_kwargs={
|
37 |
+
"stop": None,
|
38 |
+
"top_p": 1,
|
39 |
+
"frequency_penalty": 0,
|
40 |
+
"presence_penalty": 0,
|
41 |
+
},
|
42 |
+
)
|
43 |
+
|
44 |
+
# Build categorizing chain
|
45 |
+
system_message_prompt = SystemMessagePromptTemplate.from_template(
|
46 |
+
"You are a classifier that, given a bill's text, states what type of bill "
|
47 |
+
"category it belongs to: accomodation (bills regarding stays), travel (bills "
|
48 |
+
"concerning cab or other land rides), travel (bills concerning flights), random "
|
49 |
+
"(bills concerning deliveries from e-commerce websites like amazon etc) bills.\n"
|
50 |
+
"You may want to see if there are Room Details, Check-in/Check-out Date for "
|
51 |
+
"Accomodation stay; Flight Details, Train Details, Bus Details Cab details for "
|
52 |
+
"Travel; Conference Details for Conference organizers; anything else comes under "
|
53 |
+
"random category. Your answers must be only the appropriate choice e.g. 'option' and "
|
54 |
+
"not 'The given bill belongs to the option category.'\n"
|
55 |
+
"{format_instructions}"
|
56 |
+
)
|
57 |
+
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
|
58 |
+
chat_prompt = ChatPromptTemplate.from_messages(
|
59 |
+
[system_message_prompt, human_message_prompt]
|
60 |
+
)
|
61 |
+
category_parser = EnumOutputParser(enum=Category)
|
62 |
+
categorize_chain = LLMChain(
|
63 |
+
llm=model, prompt=chat_prompt, output_parser=category_parser
|
64 |
+
)
|
65 |
+
|
66 |
+
|
67 |
+
def categorize_text(text: str) -> Category:
    """Categorize the text into one of the categories defined in Category.

    The text is run through ``categorize_chain`` together with the format
    instructions produced by ``category_parser``.

    Args:
        text(str): The text to categorize.

    Returns: The category of the text.
    """
    format_instructions = category_parser.get_format_instructions()
    return categorize_chain.run(text=text, format_instructions=format_instructions)
|
79 |
+
|
80 |
+
|
81 |
+
def run_category_chain(category: Category, text: str) -> Union[BaseModel, None]:
    """Run the extraction chain registered for ``category`` on ``text``.

    Args:
        category(Category): The category whose chain is to be run.
        text(str): The text on which the chain is to be run.

    Returns: The output of the chain, or None when running the chain raised
        an exception (the error is printed instead of propagated).
    """
    module = category_modules[category]
    output_parser = module.output_parser
    try:
        return module.chain.run(
            text=text, format_instructions=output_parser.get_format_instructions()
        )
    except Exception as e:
        # Best effort: report the failure and fall through to the None return.
        print("Error in running chain for category", category, ":", e)
    return None
|
97 |
+
|
98 |
+
|
99 |
+
if __name__ == "__main__":
|
100 |
+
text = """amazonin
|
101 |
+
we)
|
102 |
+
|
103 |
+
Sold By :
|
104 |
+
|
105 |
+
Spigen India Pvt. Ltd.
|
106 |
+
|
107 |
+
* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
|
108 |
+
37//15/1, 15/2,, Adjacent to Starex School, Village
|
109 |
+
- Binola, National Highway -8, Tehsil - Manesar
|
110 |
+
Gurgaon, Haryana, 122413
|
111 |
+
|
112 |
+
IN
|
113 |
+
|
114 |
+
PAN No: ABACS5056L
|
115 |
+
GST Registration No: O6ABACS5056L12Z5
|
116 |
+
|
117 |
+
Order Number: 407-5335982-7837125
|
118 |
+
Order Date: 30.05.2023
|
119 |
+
|
120 |
+
Tax Invoice/Bill of Supply/Cash Memo
|
121 |
+
(Original for Recipient)
|
122 |
+
|
123 |
+
Billing Address :
|
124 |
+
|
125 |
+
Praveen Bohra
|
126 |
+
|
127 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
128 |
+
GURGAON, HARYANA, 122018
|
129 |
+
|
130 |
+
IN
|
131 |
+
|
132 |
+
State/UT Code: 06
|
133 |
+
|
134 |
+
Shipping Address :
|
135 |
+
|
136 |
+
Praveen Bohra
|
137 |
+
|
138 |
+
Praveen Bohra
|
139 |
+
|
140 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
141 |
+
GURGAON, HARYANA, 122018
|
142 |
+
|
143 |
+
IN
|
144 |
+
|
145 |
+
State/UT Code: 06
|
146 |
+
|
147 |
+
Place of supply: HARYANA
|
148 |
+
|
149 |
+
Place of delivery: HARYANA
|
150 |
+
|
151 |
+
Invoice Number : DEL5-21033
|
152 |
+
Invoice Details : HR-DEL5-918080915-2324
|
153 |
+
Invoice Date : 30.05.2023
|
154 |
+
|
155 |
+
Description at Tax |Tax /|Tax Total
|
156 |
+
p y Rate |Type |Amount|Amount
|
157 |
+
|
158 |
+
Black) | BO8BHLZHBH ( ACS01744INP )
|
159 |
+
HSN:39269099
|
160 |
+
|
161 |
+
1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
|
162 |
+
1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
|
163 |
+
9% |SGST| %76.19
|
164 |
+
|
165 |
+
TOTAL:
|
166 |
+
|
167 |
+
Amount in Words:
|
168 |
+
Nine Hundred Ninety-nine only
|
169 |
+
|
170 |
+
Whether tax is payable under reverse charge - No
|
171 |
+
|
172 |
+
For Spigen India Pvt. Ltd.:
|
173 |
+
sSoigenrn
|
174 |
+
|
175 |
+
Authorized Signatory
|
176 |
+
|
177 |
+
Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
|
178 |
+
2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
|
179 |
+
|
180 |
+
*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
|
181 |
+
|
182 |
+
Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
|
183 |
+
|
184 |
+
Please note that this invoice is not a demand for payment
|
185 |
+
|
186 |
+
Page 1 of 1"""
|
187 |
+
category = categorize_text(text)
|
188 |
+
print("Category:", category)
|
189 |
+
|
190 |
+
print("\n\n")
|
191 |
+
result = run_category_chain(category, text)
|
192 |
+
print(result)
|
categories/accomodation/__init__.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
2 |
+
|
3 |
+
from langchain.chains import LLMChain
|
4 |
+
from langchain.chat_models import ChatOpenAI
|
5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
6 |
+
from langchain.prompts import (
|
7 |
+
ChatPromptTemplate,
|
8 |
+
HumanMessagePromptTemplate,
|
9 |
+
SystemMessagePromptTemplate,
|
10 |
+
)
|
11 |
+
|
12 |
+
model = ChatOpenAI(
|
13 |
+
temperature=0.6,
|
14 |
+
max_tokens=300,
|
15 |
+
n=1,
|
16 |
+
request_timeout=None,
|
17 |
+
model_kwargs={
|
18 |
+
'stop': None,
|
19 |
+
'top_p': 1,
|
20 |
+
}
|
21 |
+
)
|
22 |
+
|
23 |
+
# Build category chain
|
24 |
+
system_message_prompt = SystemMessagePromptTemplate.from_template(
|
25 |
+
"You are tasked with developing an OCR data extraction system for hotel bills in PDF "
|
26 |
+
"format given as text. The system should extract important information necessary for "
|
27 |
+
"the reimbursement process from a college. Your prompt should fetch the following "
|
28 |
+
"essential details from the hotel bill: hotel name, address, bill number/invoice "
|
29 |
+
"number, booking ID / confirmation ID / booking number, check-in date and time, "
|
30 |
+
"check-out date and time, total amount, booking platform, bill date.\n"
|
31 |
+
"Ensure that the system accurately extracts the above information from the OCR text "
|
32 |
+
"of the hotel bill.\n"
|
33 |
+
"{format_instructions}"
|
34 |
+
)
|
35 |
+
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
|
36 |
+
chat_prompt = ChatPromptTemplate.from_messages(
|
37 |
+
[system_message_prompt, human_message_prompt]
|
38 |
+
)
|
39 |
+
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
|
40 |
+
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
|
41 |
+
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/accomodation/model.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
from pydantic import BaseModel, Field
|
6 |
+
|
7 |
+
|
8 |
+
# Schema of the details extracted from a hotel (accommodation) bill. Every
# field is required (declared with ``...``).
# NOTE(review): pydantic presumably exposes the class docstring below as the
# schema description, which would make it part of the format instructions sent
# to the model -- confirm before rewording it.
class InformationExtractedFromABillReceipt(BaseModel):
    """
    1. Hotel Name: [Hotel Name]
    2. Address: [Hotel Address]
    3. Bill number/Invoice number: [Bill Number]
    4. booking ID / Confirmation ID / Booking #: [Booking ID]
    5. Check-in Date and Time: [Check-in Date Time]
    6. Check-out Date and Time: [Check-out Date Time]
    7. Total Amount: [Total Amount Charged]
    8. Booking platform: [Booking Platform]
    9. Bill date: [Bill Date]
    """

    # NOTE(review): the field is named "hostel_name" while its title and the
    # docstring say "hotel" -- looks like a typo, but renaming it changes the
    # output key, so confirm with consumers first.
    hostel_name: str = Field(..., title="The name of the hotel")
    address: str = Field(..., title="The address of the hotel")
    bill_number: str = Field(..., title="The bill number/invoice number")
    booking_id: str = Field(..., title="The booking ID/confirmation ID/booking number")
    # Dates and times are parsed into datetime objects.
    check_in_date_time: datetime = Field(..., title="The check-in date and time")
    check_out_date_time: datetime = Field(..., title="The check-out date and time")
    total_amount_charged: float = Field(..., title="The total amount charged")
    booking_platform: str = Field(..., title="The booking platform")
    bill_date: datetime = Field(..., title="The bill date")
|
categories/random_/__init__.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
2 |
+
|
3 |
+
from langchain.chains import LLMChain
|
4 |
+
from langchain.chat_models import ChatOpenAI
|
5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
6 |
+
from langchain.prompts import (
|
7 |
+
ChatPromptTemplate,
|
8 |
+
HumanMessagePromptTemplate,
|
9 |
+
SystemMessagePromptTemplate,
|
10 |
+
)
|
11 |
+
|
12 |
+
model = ChatOpenAI(
|
13 |
+
temperature=0,
|
14 |
+
n=1,
|
15 |
+
model_kwargs={
|
16 |
+
'stop': None,
|
17 |
+
'top_p': 1,
|
18 |
+
'frequency_penalty': 0,
|
19 |
+
'presence_penalty': 0,
|
20 |
+
}
|
21 |
+
)
|
22 |
+
|
23 |
+
# Build category chain
|
24 |
+
system_message_prompt = SystemMessagePromptTemplate.from_template(
|
25 |
+
"You are an information extraction engine that outputs details from OCR processed "
|
26 |
+
"documents like uids, total, tax, name, currency, date, seller details, summary. You "
|
27 |
+
"may use context to make an educated guess about the currency. Use null if you are "
|
28 |
+
"unable to find certain details\n"
|
29 |
+
"{format_instructions}"
|
30 |
+
)
|
31 |
+
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
|
32 |
+
chat_prompt = ChatPromptTemplate.from_messages(
|
33 |
+
[system_message_prompt, human_message_prompt]
|
34 |
+
)
|
35 |
+
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
|
36 |
+
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
|
37 |
+
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
38 |
+
|
39 |
+
if __name__ == "__main__":
|
40 |
+
text = """amazonin
|
41 |
+
we)
|
42 |
+
|
43 |
+
Sold By :
|
44 |
+
|
45 |
+
Spigen India Pvt. Ltd.
|
46 |
+
|
47 |
+
* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
|
48 |
+
37//15/1, 15/2,, Adjacent to Starex School, Village
|
49 |
+
- Binola, National Highway -8, Tehsil - Manesar
|
50 |
+
Gurgaon, Haryana, 122413
|
51 |
+
|
52 |
+
IN
|
53 |
+
|
54 |
+
PAN No: ABACS5056L
|
55 |
+
GST Registration No: O6ABACS5056L12Z5
|
56 |
+
|
57 |
+
Order Number: 407-5335982-7837125
|
58 |
+
Order Date: 30.05.2023
|
59 |
+
|
60 |
+
Tax Invoice/Bill of Supply/Cash Memo
|
61 |
+
(Original for Recipient)
|
62 |
+
|
63 |
+
Billing Address :
|
64 |
+
|
65 |
+
Praveen Bohra
|
66 |
+
|
67 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
68 |
+
GURGAON, HARYANA, 122018
|
69 |
+
|
70 |
+
IN
|
71 |
+
|
72 |
+
State/UT Code: 06
|
73 |
+
|
74 |
+
Shipping Address :
|
75 |
+
|
76 |
+
Praveen Bohra
|
77 |
+
|
78 |
+
Praveen Bohra
|
79 |
+
|
80 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
81 |
+
GURGAON, HARYANA, 122018
|
82 |
+
|
83 |
+
IN
|
84 |
+
|
85 |
+
State/UT Code: 06
|
86 |
+
|
87 |
+
Place of supply: HARYANA
|
88 |
+
|
89 |
+
Place of delivery: HARYANA
|
90 |
+
|
91 |
+
Invoice Number : DEL5-21033
|
92 |
+
Invoice Details : HR-DEL5-918080915-2324
|
93 |
+
Invoice Date : 30.05.2023
|
94 |
+
|
95 |
+
Description at Tax |Tax /|Tax Total
|
96 |
+
p y Rate |Type |Amount|Amount
|
97 |
+
|
98 |
+
Black) | BO8BHLZHBH ( ACS01744INP )
|
99 |
+
HSN:39269099
|
100 |
+
|
101 |
+
1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
|
102 |
+
1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
|
103 |
+
9% |SGST| %76.19
|
104 |
+
|
105 |
+
TOTAL:
|
106 |
+
|
107 |
+
Amount in Words:
|
108 |
+
Nine Hundred Ninety-nine only
|
109 |
+
|
110 |
+
Whether tax is payable under reverse charge - No
|
111 |
+
|
112 |
+
For Spigen India Pvt. Ltd.:
|
113 |
+
sSoigenrn
|
114 |
+
|
115 |
+
Authorized Signatory
|
116 |
+
|
117 |
+
Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
|
118 |
+
2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
|
119 |
+
|
120 |
+
*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
|
121 |
+
|
122 |
+
Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
|
123 |
+
|
124 |
+
Please note that this invoice is not a demand for payment
|
125 |
+
|
126 |
+
Page 1 of 1"""
|
127 |
+
# result = chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions())
|
128 |
+
# print(result.json(indent=4))
|
129 |
+
result = chain.generate(input_list=[{"text": text, "format_instructions": fixing_parser.get_format_instructions()}])
|
130 |
+
print(result)
|
131 |
+
result = fixing_parser.parse_with_prompt(result.generations[0][0].text, chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions()))
|
132 |
+
print(result)
|
133 |
+
# result = chain.run(text=text, format_instructions=output_parser.get_format_instructions(), verbose=True)
|
134 |
+
# print(result)
|
categories/random_/model.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# generated by datamodel-codegen:
|
2 |
+
# filename: schema.json
|
3 |
+
# timestamp: 2023-07-28T11:36:16+00:00
|
4 |
+
|
5 |
+
from __future__ import annotations
|
6 |
+
|
7 |
+
from datetime import date
|
8 |
+
from typing import Dict, Optional, Union
|
9 |
+
|
10 |
+
import iso4217
|
11 |
+
from pydantic import BaseModel, Field, constr, validator, ValidationError
|
12 |
+
|
13 |
+
|
14 |
+
class TaxItem(BaseModel):
    # GST flavour of the invoice tax; one of the two alternatives accepted by the
    # ``tax`` field of InformationExtractedFromABillReceipt.
    # NOTE(review): documented with comments rather than a docstring because
    # pydantic appears to copy model docstrings into the generated schema - confirm
    # before converting.
    gst: float = Field(
        default=...,
        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
    )
|
19 |
+
|
20 |
+
|
21 |
+
class TaxItem1(BaseModel):
    # VAT flavour of the invoice tax; the second alternative accepted by the
    # ``tax`` field of InformationExtractedFromABillReceipt.
    vat: float = Field(
        default=...,
        title="The total VAT present in the invoice",
    )
|
23 |
+
|
24 |
+
|
25 |
+
class TaxNumberItem(BaseModel):
    # GST registration number of the seller; must be at least 15 characters long.
    gst_number: constr(min_length=15) = Field(
        default=...,
        title="The alphanumeric GSTIN/GST number code",
    )
|
29 |
+
|
30 |
+
|
31 |
+
class TaxNumberItem1(BaseModel):
    # VAT/TIN number alternative for the seller's tax identifier.
    vat_number: str = Field(
        default=...,
        title="The VAT/TIN number present in older invoices",
    )
|
33 |
+
|
34 |
+
|
35 |
+
class TaxNumberItem2(BaseModel):
    # UIN alternative for the seller's tax identifier.
    ui_number: str = Field(
        default=...,
        title="The tax UIN issued to foreign entities",
    )
|
37 |
+
|
38 |
+
|
39 |
+
class SellerDetails(BaseModel):
    # Seller section of an invoice. ``name``, ``address`` and ``contact`` may be
    # missing (default None); ``tax_number`` and ``pan_number`` are required.

    name: Optional[str] = None
    address: Optional[str] = None
    contact: Optional[str] = None

    # Exactly one of the GST / VAT / UIN identifier models.
    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
        default=...,
        title="Tax information",
    )

    # Length is constrained to exactly 10 characters.
    pan_number: constr(min_length=10, max_length=10) = Field(
        default=...,
        title="The 10-character alphanumeric PAN code",
    )
|
49 |
+
|
50 |
+
|
51 |
+
class UIDDict(BaseModel):
    # Identifiers found on the document: the invoice number itself plus a free-form
    # mapping of every other identifier (name -> value).

    invoice_number: str = Field(default=..., title="The invoice number")
    other_uids: Dict[str, str] = Field(
        default=...,
        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
    )
|
57 |
+
|
58 |
+
|
59 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    # Top-level schema for a generic bill/receipt: identifiers, totals, tax,
    # recipient, currency, issue date, seller details and a short summary.
    # NOTE(review): documented with comments rather than a docstring because
    # pydantic appears to copy model docstrings into the generated schema (and
    # therefore into the LLM format instructions) - confirm before converting.

    uids: UIDDict = Field(..., title="Invoice number and other UIDs")
    total: float = Field(..., title="Total amount or price")
    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
    name: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    currency: str = Field(
        default="INR",
        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
    )
    issue_date: date = Field(
        ..., title="The date the invoice was issued"
    )
    seller_details: SellerDetails = Field(..., title="Information about the seller")
    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")

    @validator("currency")
    @classmethod
    def check_currency(cls, v: str) -> str:
        """Reject unknown currency codes and normalise valid ones to upper case.

        Args:
            v(str): The currency code supplied for the ``currency`` field.

        Returns: ``v`` in upper case.

        Raises:
            ValueError: If the lower-cased code is not a member of
                        ``iso4217.Currency``. pydantic reports this as a
                        validation error on the ``currency`` field.
        """
        # The lookup uses the lower-cased code as the enum member name.
        if not iso4217.Currency.__members__.get(v.lower()):
            # Validators must signal failure with ValueError/TypeError/AssertionError;
            # pydantic's own ValidationError cannot be built from a bare message.
            raise ValueError(f"{v} is not a valid ISO 4217 currency code")
        return v.upper()
|
categories/travel_cab/__init__.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
2 |
+
|
3 |
+
from langchain.chains import LLMChain
|
4 |
+
from langchain.chat_models import ChatOpenAI
|
5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
6 |
+
from langchain.prompts import (
|
7 |
+
ChatPromptTemplate,
|
8 |
+
HumanMessagePromptTemplate,
|
9 |
+
SystemMessagePromptTemplate,
|
10 |
+
)
|
11 |
+
|
12 |
+
# Chat model used both for extraction and for repairing malformed output:
# temperature 0, one completion per request.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs=dict(stop=None, top_p=1, frequency_penalty=0, presence_penalty=0),
)

# Prompt: a system message carrying the task description plus the parser's format
# instructions, followed by the document text as the human message.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR "
    "processed documents such as date/time/place of departure and arrival.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# Parse the reply into the category's pydantic model; the fixing parser wraps the
# plain parser and is given the same model to work with.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)

chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/travel_cab/model.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from datetime import date, time
|
4 |
+
|
5 |
+
from pydantic import BaseModel, Field
|
6 |
+
|
7 |
+
|
8 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    """"""

    # Schema for a cab journey receipt: start and end place/date/time plus the fare.
    # The docstring above is intentionally left empty, as in the original.

    # Journey start
    place_from: str = Field(default=..., title="place where journey starts")
    date_from: date = Field(
        default=...,
        title="date on which journey starts (DD/MM/YYYY)",
    )
    time_from: time = Field(default=..., title="time at which journey starts")

    # Journey end
    place_to: str = Field(default=..., title="place where journey end")
    date_to: date = Field(default=..., title="date on which journey end (DD/MM/YYYY)")
    time_to: time = Field(default=..., title="time at which journey end")

    # Fare
    amount: float = Field(default=..., title="cost of journey ticket")
|
categories/travel_flight/__init__.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
2 |
+
|
3 |
+
from langchain.chains import LLMChain
|
4 |
+
from langchain.chat_models import ChatOpenAI
|
5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
6 |
+
from langchain.prompts import (
|
7 |
+
ChatPromptTemplate,
|
8 |
+
HumanMessagePromptTemplate,
|
9 |
+
)
|
10 |
+
|
11 |
+
# Chat model (temperature 0) shared by the chain and the output-fixing parser.
model = ChatOpenAI(temperature=0)

# Single human message: task description, the parser's format instructions, then
# the document text.
human_message_prompt = HumanMessagePromptTemplate.from_template(
    "Parse through and find the following details from the text extracted from a "
    "travel bill\n"
    "{format_instructions}\n"
    "{text}"
)
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])

# Parse the reply into the category's pydantic model, wrapped by a fixing parser.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)

chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/travel_flight/model.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from datetime import date, time
|
4 |
+
|
5 |
+
from pydantic import BaseModel, Field
|
6 |
+
|
7 |
+
|
8 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    """
    response_schemas = [
        ResponseSchema(name="place (from)", description="place where flight starts/takes-off"),
        ResponseSchema(name="date (from)", description="date on which flight starts/takes-off (DD/MM/YYYY)"),
        ResponseSchema(name="time (from)", description="time at which flight starts/takes-off"),
        ResponseSchema(name="place (to)", description="place where flight end/lands"),
        ResponseSchema(name="date (to)", description="date on which flight end/lands (DD/MM/YYYY)"),
        ResponseSchema(name="time (to)", description="time at which flight end/lands"),
        ResponseSchema(name="PNR Number", description ="PNR Number of flight"),
        ResponseSchema(name="amount", description="cost of flight ticket")
    ]"""

    # Schema for a flight ticket/bill: departure and arrival place/date/time, the
    # PNR number and the ticket cost.
    # NOTE(review): the docstring above holds an older ResponseSchema listing; it is
    # kept verbatim because pydantic appears to expose model docstrings in the
    # generated schema - confirm before rewording it.

    # Departure
    place_from: str = Field(default=..., title="place where flight starts/takes-off")
    date_from: date = Field(
        default=...,
        title="date on which flight starts/takes-off (DD/MM/YYYY)",
    )
    time_from: time = Field(default=..., title="time at which flight starts/takes-off")

    # Arrival
    place_to: str = Field(default=..., title="place where flight end/lands")
    date_to: date = Field(default=..., title="date on which flight end/lands (DD/MM/YYYY)")
    time_to: time = Field(default=..., title="time at which flight end/lands")

    # Booking reference and price
    pnr_number: str = Field(default=..., title="PNR Number of flight")
    amount: float = Field(default=..., title="cost of flight ticket")
|
categories/vendor/__init__.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
2 |
+
|
3 |
+
from langchain.chains import LLMChain
|
4 |
+
from langchain.chat_models import ChatOpenAI
|
5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
6 |
+
from langchain.prompts import (
|
7 |
+
ChatPromptTemplate,
|
8 |
+
HumanMessagePromptTemplate,
|
9 |
+
SystemMessagePromptTemplate,
|
10 |
+
)
|
11 |
+
|
12 |
+
# Chat model used for extraction and for repairing malformed output:
# temperature 0, one completion per request.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build the vendor extraction chain.
# The task description ends with a newline so the parser's format instructions
# start on their own line instead of running into the last sentence.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, addresses, bank details, invoice details, "
    "participant registration details.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# Parse the reply into the category's pydantic model, wrapped by a fixing parser.
# (No output is printed at import time; call
# output_parser.get_format_instructions() explicitly when debugging.)
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/vendor/model.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# generated by datamodel-codegen:
|
2 |
+
# filename: schema.json
|
3 |
+
# timestamp: 2023-07-28T11:36:16+00:00
|
4 |
+
|
5 |
+
from __future__ import annotations
|
6 |
+
|
7 |
+
from datetime import datetime
|
8 |
+
|
9 |
+
from pydantic import BaseModel, Field, constr, validator, ValidationError
|
10 |
+
|
11 |
+
|
12 |
+
class BankDetails(BaseModel):
    """account holder name, bank name, account number, branch, ifs code, swift code"""

    # All six fields are required strings.

    # Account
    account_holder_name: str = Field(default=..., title="The name of the account holder")
    account_number: str = Field(default=..., title="The account number")

    # Bank
    bank_name: str = Field(default=..., title="The name of the bank")
    branch: str = Field(default=..., title="The branch of the bank")

    # Routing codes
    ifs_code: str = Field(default=..., title="The IFS code of the bank")
    swift_code: str = Field(default=..., title="The SWIFT code of the bank")
|
21 |
+
|
22 |
+
|
23 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    """
    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
    bank details: (account holder name, bank name, account number, branch, ifs code, swift
    code), recipient, registration id, registration fee, registration date/time
    """

    # Every field below is required.

    # Tax identifier (at least 15 characters) and billing address
    gstin: constr(min_length=15) = Field(
        default=...,
        title="The alphanumeric GSTIN/GST number code",
    )
    billing_address: str = Field(default=..., title="The billing address")

    # Invoice identification and dates
    invoice_number: str = Field(default=..., title="The invoice number")
    invoice_date: datetime = Field(default=..., title="The date-time the invoice was issued")
    due_date: datetime = Field(default=..., title="The date-time the invoice is due")

    # Amounts
    total: float = Field(default=..., title="Total amount or price")
    balance_due: float = Field(default=..., title="The amount due")

    # Payment details and recipient
    bank_details: BankDetails = Field(default=..., title="Bank details")
    recipient: str = Field(
        default=...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )

    # Registration details
    registration_id: str = Field(default=..., title="The registration ID")
    registration_fee: float = Field(default=..., title="The registration fee")
    registration_date_time: datetime = Field(default=..., title="The registration date-time")
|
environment.yml
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: env
|
2 |
+
channels:
|
3 |
+
- conda-forge
|
4 |
+
- defaults
|
5 |
+
dependencies:
|
6 |
+
- asttokens=2.2.1=pyhd8ed1ab_0
|
7 |
+
- backcall=0.2.0=pyh9f0ad1d_0
|
8 |
+
- backports=1.0=pyhd8ed1ab_3
|
9 |
+
- backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0
|
10 |
+
- boost-cpp=1.78.0=h9f4b32c_3
|
11 |
+
- bzip2=1.0.8=h8ffe710_4
|
12 |
+
- ca-certificates=2023.7.22=h56e8100_0
|
13 |
+
- cairo=1.16.0=hdecc03f_1016
|
14 |
+
- colorama=0.4.6=pyhd8ed1ab_0
|
15 |
+
- comm=0.1.3=pyhd8ed1ab_0
|
16 |
+
- debugpy=1.6.7=py39h99910a6_0
|
17 |
+
- decorator=5.1.1=pyhd8ed1ab_0
|
18 |
+
- executing=1.2.0=pyhd8ed1ab_0
|
19 |
+
- expat=2.5.0=h63175ca_1
|
20 |
+
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
|
21 |
+
- font-ttf-inconsolata=3.000=h77eed37_0
|
22 |
+
- font-ttf-source-code-pro=2.038=h77eed37_0
|
23 |
+
- font-ttf-ubuntu=0.83=hab24e00_0
|
24 |
+
- fontconfig=2.14.2=hbde0cde_0
|
25 |
+
- fonts-conda-ecosystem=1=0
|
26 |
+
- fonts-conda-forge=1=0
|
27 |
+
- freetype=2.12.1=h546665d_1
|
28 |
+
- gettext=0.21.1=h5728263_0
|
29 |
+
- icu=72.1=h63175ca_0
|
30 |
+
- importlib-metadata=6.8.0=pyha770c72_0
|
31 |
+
- importlib_metadata=6.8.0=hd8ed1ab_0
|
32 |
+
- ipykernel=6.25.0=pyh6817e22_0
|
33 |
+
- ipython=8.14.0=pyh08f2357_0
|
34 |
+
- jedi=0.18.2=pyhd8ed1ab_0
|
35 |
+
- jupyter_client=8.3.0=pyhd8ed1ab_0
|
36 |
+
- jupyter_core=5.3.1=py39hcbf5309_0
|
37 |
+
- krb5=1.21.1=heb0366b_0
|
38 |
+
- lcms2=2.15=h3e3b177_1
|
39 |
+
- lerc=4.0.0=h63175ca_0
|
40 |
+
- libcurl=8.2.1=hd5e4a3a_0
|
41 |
+
- libdeflate=1.18=hcfcfb64_0
|
42 |
+
- libexpat=2.5.0=h63175ca_1
|
43 |
+
- libffi=3.4.2=h8ffe710_5
|
44 |
+
- libglib=2.76.4=he8f3873_0
|
45 |
+
- libiconv=1.17=h8ffe710_0
|
46 |
+
- libjpeg-turbo=2.1.5.1=hcfcfb64_0
|
47 |
+
- libpng=1.6.39=h19919ed_0
|
48 |
+
- libsodium=1.0.18=h8d14728_1
|
49 |
+
- libssh2=1.11.0=h7dfc565_0
|
50 |
+
- libtiff=4.5.1=h6c8260b_0
|
51 |
+
- libzlib=1.2.13=hcfcfb64_5
|
52 |
+
- matplotlib-inline=0.1.6=pyhd8ed1ab_0
|
53 |
+
- nest-asyncio=1.5.6=pyhd8ed1ab_0
|
54 |
+
- openjpeg=2.5.0=ha2aaf27_2
|
55 |
+
- openssl=3.1.1=hcfcfb64_1
|
56 |
+
- packaging=23.1=pyhd8ed1ab_0
|
57 |
+
- parso=0.8.3=pyhd8ed1ab_0
|
58 |
+
- pcre2=10.40=h17e33f8_0
|
59 |
+
- pickleshare=0.7.5=py_1003
|
60 |
+
- pip=23.2.1=py39haa95532_0
|
61 |
+
- pixman=0.40.0=h8ffe710_0
|
62 |
+
- platformdirs=3.9.1=pyhd8ed1ab_0
|
63 |
+
- poppler=23.07.0=h45d20d0_0
|
64 |
+
- poppler-data=0.4.12=hd8ed1ab_0
|
65 |
+
- prompt-toolkit=3.0.39=pyha770c72_0
|
66 |
+
- prompt_toolkit=3.0.39=hd8ed1ab_0
|
67 |
+
- psutil=5.9.5=py39ha55989b_0
|
68 |
+
- pure_eval=0.2.2=pyhd8ed1ab_0
|
69 |
+
- pygments=2.15.1=pyhd8ed1ab_0
|
70 |
+
- python=3.9.17=h1aa4202_0
|
71 |
+
- python-dateutil=2.8.2=pyhd8ed1ab_0
|
72 |
+
- python_abi=3.9=2_cp39
|
73 |
+
- pywin32=304=py39h99910a6_2
|
74 |
+
- pyzmq=25.1.0=py39hea35a22_0
|
75 |
+
- setuptools=68.0.0=py39haa95532_0
|
76 |
+
- six=1.16.0=pyh6c4a22f_0
|
77 |
+
- sqlite=3.41.2=h2bbff1b_0
|
78 |
+
- stack_data=0.6.2=pyhd8ed1ab_0
|
79 |
+
- tornado=6.3.2=py39ha55989b_0
|
80 |
+
- traitlets=5.9.0=pyhd8ed1ab_0
|
81 |
+
- typing-extensions=4.7.1=hd8ed1ab_0
|
82 |
+
- typing_extensions=4.7.1=pyha770c72_0
|
83 |
+
- ucrt=10.0.22621.0=h57928b3_0
|
84 |
+
- vc=14.3=h64f974e_17
|
85 |
+
- vc14_runtime=14.36.32532=hfdfe4a8_17
|
86 |
+
- vs2015_runtime=14.36.32532=h05e6639_17
|
87 |
+
- wcwidth=0.2.6=pyhd8ed1ab_0
|
88 |
+
- wheel=0.38.4=py39haa95532_0
|
89 |
+
- xz=5.2.6=h8d14728_0
|
90 |
+
- zeromq=4.3.4=h0e60522_1
|
91 |
+
- zipp=3.16.2=pyhd8ed1ab_0
|
92 |
+
- zlib=1.2.13=hcfcfb64_5
|
93 |
+
- zstd=1.5.2=h12be248_7
|
94 |
+
- pip:
|
95 |
+
- aiofiles==23.1.0
|
96 |
+
- aiohttp==3.8.5
|
97 |
+
- aiosignal==1.3.1
|
98 |
+
- altair==5.0.1
|
99 |
+
- annotated-types==0.5.0
|
100 |
+
- anyio==3.7.1
|
101 |
+
- async-timeout==4.0.2
|
102 |
+
- attrs==23.1.0
|
103 |
+
- certifi==2023.7.22
|
104 |
+
- charset-normalizer==3.2.0
|
105 |
+
- click==8.1.6
|
106 |
+
- contourpy==1.1.0
|
107 |
+
- cycler==0.11.0
|
108 |
+
- dataclasses-json==0.5.13
|
109 |
+
- datasets==2.14.1
|
110 |
+
- dill==0.3.7
|
111 |
+
- exceptiongroup==1.1.2
|
112 |
+
- fastapi==0.100.1
|
113 |
+
- ffmpy==0.3.1
|
114 |
+
- filelock==3.12.2
|
115 |
+
- fonttools==4.41.1
|
116 |
+
- frozenlist==1.4.0
|
117 |
+
- fsspec==2023.6.0
|
118 |
+
- gradio==3.39.0
|
119 |
+
- gradio-client==0.3.0
|
120 |
+
- greenlet==2.0.2
|
121 |
+
- h11==0.14.0
|
122 |
+
- httpcore==0.17.3
|
123 |
+
- httpx==0.24.1
|
124 |
+
- huggingface-hub==0.16.4
|
125 |
+
- idna==3.4
|
126 |
+
- importlib-resources==6.0.0
|
127 |
+
- iso4217==1.11.20220401
|
128 |
+
- jinja2==3.1.2
|
129 |
+
- jsonschema==4.18.4
|
130 |
+
- jsonschema-specifications==2023.7.1
|
131 |
+
- kiwisolver==1.4.4
|
132 |
+
- langchain==0.0.247
|
133 |
+
- langsmith==0.0.15
|
134 |
+
- linkify-it-py==2.0.2
|
135 |
+
- markdown-it-py==2.2.0
|
136 |
+
- markupsafe==2.1.3
|
137 |
+
- marshmallow==3.20.1
|
138 |
+
- matplotlib==3.7.2
|
139 |
+
- mdit-py-plugins==0.3.3
|
140 |
+
- mdurl==0.1.2
|
141 |
+
- multidict==6.0.4
|
142 |
+
- multiprocess==0.70.15
|
143 |
+
- mypy-extensions==1.0.0
|
144 |
+
- numexpr==2.8.4
|
145 |
+
- numpy==1.25.1
|
146 |
+
- openai==0.27.8
|
147 |
+
- openapi-schema-pydantic==1.2.4
|
148 |
+
- opencv-python-headless==4.8.0.74
|
149 |
+
- orjson==3.9.2
|
150 |
+
- pandas==2.0.3
|
151 |
+
- pdf2image==1.16.3
|
152 |
+
- pillow==10.0.0
|
153 |
+
- pyarrow==12.0.1
|
154 |
+
- pydantic==1.10.12
|
155 |
+
- pydantic-core==2.4.0
|
156 |
+
- pydub==0.25.1
|
157 |
+
- pyocr==0.8.3
|
158 |
+
- pyparsing==3.0.9
|
159 |
+
- pypdf==3.13.0
|
160 |
+
- pypiwin32==223
|
161 |
+
- python-multipart==0.0.6
|
162 |
+
- pytz==2023.3
|
163 |
+
- pyyaml==6.0.1
|
164 |
+
- referencing==0.30.0
|
165 |
+
- requests==2.31.0
|
166 |
+
- rpds-py==0.9.2
|
167 |
+
- semantic-version==2.10.0
|
168 |
+
- sniffio==1.3.0
|
169 |
+
- sqlalchemy==2.0.19
|
170 |
+
- starlette==0.27.0
|
171 |
+
- tenacity==8.2.2
|
172 |
+
- toolz==0.12.0
|
173 |
+
- tqdm==4.65.0
|
174 |
+
- typing-inspect==0.9.0
|
175 |
+
- tzdata==2023.3
|
176 |
+
- uc-micro-py==1.0.2
|
177 |
+
- urllib3==2.0.4
|
178 |
+
- uvicorn==0.23.1
|
179 |
+
- websockets==11.0.3
|
180 |
+
- xxhash==3.3.0
|
181 |
+
- yarl==1.9.2
|
examples/example1.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
|
3 |
+
size 45782
|
examples/rotated.jpeg
ADDED
Git LFS Details
|
examples/rotated.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
|
3 |
+
size 333463
|
examples/upright.jpeg
ADDED
Git LFS Details
|
examples/upright.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
|
3 |
+
size 325064
|
extract.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
|
2 |
+
"""
|
3 |
+
from io import BytesIO
|
4 |
+
from typing import List
|
5 |
+
|
6 |
+
import pyocr.tesseract
|
7 |
+
import pypdf
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Reads the text layer of a PDF with pypdf.

    Args:
        bytes_stream (BytesIO): In-memory PDF contents.

    Returns: The text of every page in order, each page followed by a blank line
             ("\\n\\n").
    """
    reader = pypdf.PdfReader(bytes_stream)
    return "".join(page.extract_text() + "\n\n" for page in reader.pages)
|
25 |
+
|
26 |
+
|
27 |
+
def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
    """Runs tesseract (through pyocr) on a single image.

    Args:
        image(PIL.Image.Image): The image to read. It is not closed here.

    Returns: The recognised text (English language data is used).

    Raises:
        Exception: If pyocr reports that tesseract is not available.
    """
    tesseract = pyocr.tesseract
    if tesseract.is_available():
        return tesseract.image_to_string(image, lang="eng")
    raise Exception("Tesseract is not available.")
|
39 |
+
|
40 |
+
|
41 |
+
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Runs tesseract (through pyocr) on several images and concatenates the text.

    Args:
        images(List[PIL.Image.Image]): The images to read, in order. Each image is
                                       closed once its text has been extracted.

    Returns: The text of all images, each followed by a blank line ("\\n\\n").
    """
    chunks = []
    for page_image in images:
        chunks.append(extract_text_from_image_pyocr_tesseract(page_image))
        chunks.append("\n\n")
        page_image.close()
    return "".join(chunks)
|
55 |
+
|
56 |
+
if __name__ == '__main__':
    # Manual smoke test: text layer of a sample PDF, then OCR of a sample image.
    with open('examples/upright.pdf', 'rb') as pdf_file:
        pdf_stream = BytesIO(pdf_file.read())
    print(extract_text_from_pdf_pypdf(pdf_stream))
    print("-"*25)
    sample_image = Image.open('examples/upright.jpeg')
    print(extract_text_from_image_pyocr_tesseract(sample_image))
    sample_image.close()
|
main.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from io import BytesIO
from pathlib import Path
from typing import Union

from PIL import Image
from pydantic import BaseModel

import categories
import extract
import processing
|
9 |
+
|
10 |
+
def categorize_and_parse_text(text: str) -> BaseModel:
    """Determines the category of the text, then runs that category's chain on it.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The result of ``categories.run_category_chain`` for the detected
             category.
    """
    detected_category = categories.categorize_text(text)
    return categories.run_category_chain(detected_category, text)
|
23 |
+
|
24 |
+
def process_pdf(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given PDF file and extracts information from it.

    The PDF's embedded text is read first. If it holds almost no text, the pages
    are rendered to images and read with OCR instead.

    Args:
        filename(Path): The PDF file to process.
        extract_only(bool): If True, stop after text extraction and return the
                            text without categorizing or parsing it.

    Returns: The extracted text (str) when extract_only is True, otherwise the
             parsed information.
    """
    with open(filename, "rb") as f:
        pdf_bytes = f.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
    # If the encoded text is too short, a pdf scanner probably added a watermark.
    # Whitespace is ignored for this check: the extractor appends "\n\n" after
    # every page, so a long scan with no text at all would otherwise look
    # "long enough" and never reach OCR.
    if len("".join(text.split())) < 20:
        # Try to extract text from images
        images = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)
    if extract_only:
        return text
    return categorize_and_parse_text(text)
|
45 |
+
|
46 |
+
def process_image(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.
        extract_only(bool): If True, stop after OCR and return the text without
                            categorizing or parsing it.

    Returns: The extracted text (str) when extract_only is True, otherwise the
             parsed information.
    """
    # The context manager guarantees the file handle is released even if
    # preprocessing fails.
    with Image.open(filename) as source_image:
        image = processing.preprocess_image(source_image)
    try:
        text = extract.extract_text_from_image_pyocr_tesseract(image)
    finally:
        # Release the preprocessed image whether or not OCR succeeded.
        image.close()
    if extract_only:
        return text
    return categorize_and_parse_text(text)
|
62 |
+
|
63 |
+
if __name__ == "__main__":
    # Manual run against the bundled sample invoice.
    sample_pdf = Path("examples/example1.pdf")
    parsed = process_pdf(sample_pdf)
    print(parsed.json(indent=4))
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
poppler-utils
|
processing.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Responsible for (pre)processing images and PDFs before they are passed to the OCR
|
2 |
+
engine and other miscellaneous actions concerning processing.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import List
|
7 |
+
|
8 |
+
# import cv2
|
9 |
+
# import numpy as np
|
10 |
+
import pyocr
|
11 |
+
from pdf2image import pdf2image
|
12 |
+
from PIL import Image #, ImageOps
|
13 |
+
|
14 |
+
PDF_CONVERSION_DPI = 300
|
15 |
+
ROTATION_CONFIDENCE_THRESHOLD = 2.0
|
16 |
+
|
17 |
+
# def rotate_image(image: Image, angle: float):
|
18 |
+
# """Rotates the given image by the given angle.
|
19 |
+
|
20 |
+
# Args:
|
21 |
+
# image(PIL.Image.Image): The image to be rotated.
|
22 |
+
# angle(float): The angle to rotate the image by.
|
23 |
+
|
24 |
+
# Returns: The rotated image.
|
25 |
+
# """
|
26 |
+
# image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
27 |
+
# height, width, _ = image.shape # Get the image height, width, and channels
|
28 |
+
# # Compute the rotation matrix
|
29 |
+
# rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
|
30 |
+
# # Apply the rotation to the image
|
31 |
+
# rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
|
32 |
+
# rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
|
33 |
+
# return rotated_image
|
34 |
+
|
35 |
+
|
36 |
+
# class PDF_CONVERTER(enum.Enum):
|
37 |
+
# PDF2IMAGE = 1
|
38 |
+
# IMAGEMAGICK = 2
|
39 |
+
|
40 |
+
|
41 |
+
def correct_orientation(image: Image.Image) -> Image.Image:
    """Returns an upright version of the image based on tesseract's orientation guess.

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: A new pillow image. It is rotated by the detected angle when the
             detection confidence exceeds ROTATION_CONFIDENCE_THRESHOLD, otherwise
             it is a plain copy. The original image is not closed.

    Raises:
        Exception: If pyocr reports that tesseract is not available.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    # EXIF rotation is deliberately not applied here (it is apparent, not actual).
    try:
        detected = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as error:
        print(f"Orientation detection failed: {error}")
        # Fall back to the defaults below (angle 0, confidence 100).
        detected = {}

    rotation = detected.get("angle", 0)
    certainty = detected.get("confidence", 100)

    if certainty > ROTATION_CONFIDENCE_THRESHOLD:
        return image.rotate(rotation, expand=True)
    return image.copy()
|
71 |
+
|
72 |
+
|
73 |
+
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Renders every page of a PDF to a pillow image with pdf2image.

    Args:
        pdf_bytes(bytes): The raw contents of the PDF.

    Returns: One pillow image per page, rendered at PDF_CONVERSION_DPI.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
|
83 |
+
|
84 |
+
|
85 |
+
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to an image using ImageMagick.

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages. Pages
                                   are saved in the folder as page.jpg or as page-01.jpg,
                                   page-02.jpg, etc.

    Returns: dest_folder

    Raises:
        FileNotFoundError: If the ``magick`` executable cannot be found.
        subprocess.CalledProcessError: If ImageMagick exits with a non-zero status.
    """
    import subprocess  # local import: only this ImageMagick path needs it

    # Each argument is passed separately. This keeps the options from running
    # into each other and means paths containing spaces or shell metacharacters
    # are never interpreted by a shell.
    command = [
        "magick",
        "convert",
        "-density",
        str(PDF_CONVERSION_DPI),
        str(filename),
        "-quality",
        "100",
        str(Path(dest_folder) / "page.jpg"),
    ]
    subprocess.run(command, check=True)
    return dest_folder
|
102 |
+
|
103 |
+
|
104 |
+
def preprocess_image(image: Image.Image) -> Image.Image:
    """Prepares an image for OCR.

    Currently the only step is orientation correction.

    Args:
        image(PIL.Image.Image): The image to be preprocessed. It is closed before
                                this function returns.

    Returns: The preprocessed pillow image (a new image object).
    """
    upright = correct_orientation(image)
    image.close()
    return upright
|
118 |
+
|
119 |
+
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Prepare an in-memory PDF for later OCR.

    Steps applied, in order:
        1. PDF to image conversion (pdf2image)
        2. Orientation correction of every page

    Args:
        pdf_bytes(bytes): The raw contents of the PDF to be preprocessed.

    Returns: A list of preprocessed pillow images, one for each page of the PDF.
    """
    pages = convert_pdf_to_image_pdf2image(pdf_bytes)
    processed_pages = []
    for page in pages:
        processed = preprocess_image(page)
        # Release the raw page; only the preprocessed version is kept.
        page.close()
        processed_pages.append(processed)
    return processed_pages
|
137 |
+
|
138 |
+
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
        1. PDF to image conversion (ImageMagick, into a temporary folder)
        2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF,
             ordered by the page number in the generated file names.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import tempfile

    def page_number(page_path):
        # "page.jpg" (no numeric suffix) sorts first; "page-<n>.jpg" sorts by
        # the integer n so that page-10 comes after page-2.
        suffix = page_path.stem.rpartition("-")[2]
        digits = "".join(ch for ch in suffix if ch.isdigit())
        return int(digits) if digits else 0

    result = []
    # The previous code passed an unassigned `dest_folder` to the converter;
    # a temporary directory gives the pages a real, self-cleaning destination.
    with tempfile.TemporaryDirectory() as temp_dir:
        dest_folder = convert_pdf_to_image_ImageMagick(filename, Path(temp_dir))
        for page_path in sorted(dest_folder.glob("*.jpg"), key=page_number):
            # glob() yields paths, so each page has to be opened before it
            # can be handed to preprocess_image.
            # NOTE(review): assumes the preprocessed image no longer reads
            # from the page file once returned (correct_orientation returns
            # image.rotate(...) or image.copy()) - confirm before relying on
            # the temporary folder being deleted here.
            with Image.open(page_path) as image:
                result.append(preprocess_image(image))
    return result
|
156 |
+
|
157 |
+
if __name__ == '__main__':
    # Manual smoke test 1: preprocess a single image and display the result.
    sample_image_path = 'examples/upright.jpeg'
    source_image = Image.open(sample_image_path)
    processed_image = preprocess_image(source_image)
    source_image.close()
    processed_image.show()
    processed_image.close()

    # Manual smoke test 2: preprocess a PDF through the pdf2image route and
    # display every page.
    sample_pdf_path = 'examples/rotated.pdf'
    with open(sample_pdf_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    for page in preprocess_pdf_pdf2image(pdf_bytes):
        page.show()
        page.close()
|
requirements.txt
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.1.0
|
2 |
+
aiohttp==3.8.5
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==5.0.1
|
5 |
+
annotated-types==0.5.0
|
6 |
+
anyio==3.7.1
|
7 |
+
asttokens==2.2.1
|
8 |
+
async-timeout==4.0.2
|
9 |
+
attrs==23.1.0
|
10 |
+
backcall==0.2.0
|
11 |
+
backports.functools-lru-cache==1.6.5
|
12 |
+
certifi==2023.7.22
|
13 |
+
charset-normalizer==3.2.0
|
14 |
+
click==8.1.6
|
15 |
+
colorama==0.4.6
|
16 |
+
comm==0.1.3
|
17 |
+
contourpy==1.1.0
|
18 |
+
cycler==0.11.0
|
19 |
+
dataclasses-json==0.5.13
|
20 |
+
datasets==2.14.1
|
21 |
+
debugpy==1.6.7
|
22 |
+
decorator==5.1.1
|
23 |
+
dill==0.3.7
|
24 |
+
exceptiongroup==1.1.2
|
25 |
+
executing==1.2.0
|
26 |
+
fastapi==0.100.1
|
27 |
+
ffmpy==0.3.1
|
28 |
+
filelock==3.12.2
|
29 |
+
fonttools==4.41.1
|
30 |
+
frozenlist==1.4.0
|
31 |
+
fsspec==2023.6.0
|
32 |
+
gradio==3.39.0
|
33 |
+
gradio_client==0.3.0
|
34 |
+
greenlet==2.0.2
|
35 |
+
h11==0.14.0
|
36 |
+
httpcore==0.17.3
|
37 |
+
httpx==0.24.1
|
38 |
+
huggingface-hub==0.16.4
|
39 |
+
idna==3.4
|
40 |
+
importlib-metadata==6.8.0
|
41 |
+
importlib-resources==6.0.0
|
42 |
+
ipykernel==6.25.0
|
43 |
+
ipython==8.14.0
|
44 |
+
iso4217==1.11.20220401
|
45 |
+
jedi==0.18.2
|
46 |
+
Jinja2==3.1.2
|
47 |
+
jsonschema==4.18.4
|
48 |
+
jsonschema-specifications==2023.7.1
|
49 |
+
jupyter_client==8.3.0
|
50 |
+
jupyter_core==5.3.1
|
51 |
+
kiwisolver==1.4.4
|
52 |
+
langchain==0.0.247
|
53 |
+
langsmith==0.0.15
|
54 |
+
linkify-it-py==2.0.2
|
55 |
+
markdown-it-py==2.2.0
|
56 |
+
MarkupSafe==2.1.3
|
57 |
+
marshmallow==3.20.1
|
58 |
+
matplotlib==3.7.2
|
59 |
+
matplotlib-inline==0.1.6
|
60 |
+
mdit-py-plugins==0.3.3
|
61 |
+
mdurl==0.1.2
|
62 |
+
multidict==6.0.4
|
63 |
+
multiprocess==0.70.15
|
64 |
+
mypy-extensions==1.0.0
|
65 |
+
nest-asyncio==1.5.6
|
66 |
+
numexpr==2.8.4
|
67 |
+
numpy==1.25.1
|
68 |
+
openai==0.27.8
|
69 |
+
openapi-schema-pydantic==1.2.4
|
70 |
+
opencv-python-headless==4.8.0.74
|
71 |
+
orjson==3.9.2
|
72 |
+
packaging==23.1
|
73 |
+
pandas==2.0.3
|
74 |
+
parso==0.8.3
|
75 |
+
pdf2image==1.16.3
|
76 |
+
pickleshare==0.7.5
|
77 |
+
Pillow==10.0.0
|
78 |
+
pip==23.2.1
|
79 |
+
platformdirs==3.9.1
|
80 |
+
prompt-toolkit==3.0.39
|
81 |
+
psutil==5.9.5
|
82 |
+
pure-eval==0.2.2
|
83 |
+
pyarrow==12.0.1
|
84 |
+
pydantic==1.10.12
|
85 |
+
pydantic_core==2.4.0
|
86 |
+
pydub==0.25.1
|
87 |
+
Pygments==2.15.1
|
88 |
+
pyocr==0.8.3
|
89 |
+
pyparsing==3.0.9
|
90 |
+
pypdf==3.13.0
|
91 |
+
pypiwin32==223; sys_platform == "win32"
|
92 |
+
python-dateutil==2.8.2
|
93 |
+
python-multipart==0.0.6
|
94 |
+
pytz==2023.3
|
95 |
+
pywin32==304; sys_platform == "win32"
|
96 |
+
PyYAML==6.0.1
|
97 |
+
pyzmq==25.1.0
|
98 |
+
referencing==0.30.0
|
99 |
+
requests==2.31.0
|
100 |
+
rpds-py==0.9.2
|
101 |
+
semantic-version==2.10.0
|
102 |
+
setuptools==68.0.0
|
103 |
+
six==1.16.0
|
104 |
+
sniffio==1.3.0
|
105 |
+
SQLAlchemy==2.0.19
|
106 |
+
stack-data==0.6.2
|
107 |
+
starlette==0.27.0
|
108 |
+
tenacity==8.2.2
|
109 |
+
toolz==0.12.0
|
110 |
+
tornado==6.3.2
|
111 |
+
tqdm==4.65.0
|
112 |
+
traitlets==5.9.0
|
113 |
+
typing_extensions==4.7.1
|
114 |
+
typing-inspect==0.9.0
|
115 |
+
tzdata==2023.3
|
116 |
+
uc-micro-py==1.0.2
|
117 |
+
urllib3==2.0.4
|
118 |
+
uvicorn==0.23.1
|
119 |
+
wcwidth==0.2.6
|
120 |
+
websockets==11.0.3
|
121 |
+
wheel==0.38.4
|
122 |
+
xxhash==3.3.0
|
123 |
+
yarl==1.9.2
|
124 |
+
zipp==3.16.2
|