ankur-bohra committed on
Commit
0d99179
1 Parent(s): 901322a

Add basic structure

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
37
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .conda
2
+ temp*
3
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM continuumio/miniconda3
2
+
3
+ WORKDIR /code
4
+
5
+ # Create the environment:
6
+ COPY ./environment.yml /code/environment.yml
7
+
8
+ RUN conda config --set channel_priority strict
9
+ RUN conda config --add channels conda-forge
10
+ RUN conda env create -f environment.yml
11
+
12
+ # Make RUN commands use the new environment:
13
+ SHELL ["conda", "run", "-n", "env", "/bin/bash", "-c"]
14
+
15
+ RUN pip install -r requirements.txt
16
+
17
+ # Demonstrate the environment is activated:
18
+ RUN echo "Making sure installation worked:"
19
+ RUN python -c "import gradio, pypdf, pdf2image, langchain, openai, datasets"
20
+
21
+ COPY . .
22
+
23
+ # The code to run when container is started:
24
+
25
+ ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "env", "python", "app.py"]
26
+
app.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+
6
+ from langchain.schema.output_parser import OutputParserException
7
+ import gradio as gr
8
+ from PIL import Image
9
+
10
+ import categories
11
+ from categories import Category
12
+ from main import process_image, process_pdf
13
+
14
# Hugging Face token used by the dataset savers below (configured externally,
# e.g. as a Space secret).
HF_TOKEN = os.getenv("HF_TOKEN")
# HTML template that embeds a base64-encoded PDF ({0}) in an iframe preview.
PDF_IFRAME = """
<div style="border-radius: 10px; width: 100%; overflow: hidden;">
<iframe
src="data:application/pdf;base64,{0}"
width="100%"
height="400"
type="application/pdf">
</iframe>
</div>"""

# Dataset savers: one for ordinary shared results, one for results the user
# explicitly flagged as incorrect/irrelevant.
hf_writer_normal = gr.HuggingFaceDatasetSaver(
    HF_TOKEN, "automatic-reimbursement-tool-demo", separate_dirs=False
)
hf_writer_incorrect = gr.HuggingFaceDatasetSaver(
    HF_TOKEN, "automatic-reimbursement-tool-demo-incorrect", separate_dirs=False
)
31
+ # with open("examples/example1.pdf", "rb") as pdf_file:
32
+ # base64_pdf = base64.b64encode(pdf_file.read())
33
+
34
+
35
+ # example_paths = []
36
+ # current_file_path = None
37
+
38
+ # def ignore_examples(function):
39
+ # def new_function(*args, **kwargs):
40
+ # global example_paths, current_file_path
41
+ # if current_file_path not in example_paths:
42
+ # return function(*args, **kwargs)
43
+
44
+
45
+
46
def display_file(input_file):
    """Preview the uploaded receipt: a base64 iframe for PDFs, an image otherwise."""
    global current_file_path
    current_file_path = input_file.name if input_file else None
    if not input_file:
        return gr.HTML.update(visible=False), gr.Image.update(visible=False)
    if input_file.name.endswith(".pdf"):
        # Embed the PDF as a base64 data URI inside the iframe template.
        with open(input_file.name, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode()
        return (
            gr.HTML.update(PDF_IFRAME.format(encoded_pdf), visible=True),
            gr.Image.update(visible=False),
        )
    # Any non-PDF upload is shown through the image component.
    return (
        gr.HTML.update(visible=False),
        gr.Image.update(input_file.name, visible=True),
    )
62
+
63
+
64
def show_intermediate_outputs(show_intermediate):
    """Toggle the intermediate-outputs accordion to match the checkbox state."""
    return gr.Accordion.update(visible=bool(show_intermediate))
69
+
70
+
71
def show_share_contact(share_result):
    """Show the contact textbox only when result sharing is enabled."""
    contact_visible = share_result
    return gr.Textbox.update(visible=contact_visible)
73
+
74
+
75
def clear_inputs():
    """Reset the file-upload component to empty."""
    return gr.File.update(value=None)
77
+
78
+
79
def submit(input_file, old_text):
    """Extract raw text from the uploaded receipt.

    Args:
        input_file: The uploaded gradio file object (or None).
        old_text: Current contents of the extracted-text box; unused, kept for
            interface compatibility with the event wiring.

    Returns:
        The text extracted from the PDF or image.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if not input_file:
        # BUG FIX: gr.Error must be *raised*, not merely constructed, for
        # gradio to display the error message to the user.
        raise gr.Error("Please upload a file to continue!")

    # Dispatch on file type; extract_only stops before the LLM stages.
    if input_file.name.endswith(".pdf"):
        text = process_pdf(Path(input_file.name), extract_only=True)
    else:
        text = process_image(Path(input_file.name), extract_only=True)
    return text
90
+
91
+
92
def categorize_extracted_text(extracted_text):
    """Classify the extracted receipt text into one of the known categories."""
    return categories.categorize_text(extracted_text)
96
+
97
+
98
def extract_from_category(category, extracted_text):
    """Run the category-specific extraction chain and build the output updates.

    Args:
        category: Name of a Category member, or falsy to reset the outputs.
        extracted_text: OCR text of the receipt.

    Returns:
        Updates for (chatbot transcript, information JSON, flag-incorrect
        button, flag-irrelevant button).
    """
    if not category:
        # No category selected: clear outputs and disable the flag buttons.
        return (
            gr.Chatbot.update(None),
            gr.JSON.update(None),
            gr.Button.update(interactive=False),
            gr.Button.update(interactive=False),
        )
    category = Category[category]
    chain = categories.category_modules[category].chain
    formatted_prompt = chain.prompt.format_prompt(
        text=extracted_text,
        format_instructions=chain.output_parser.get_format_instructions(),
    )
    result = chain.generate(
        input_list=[
            {
                "text": extracted_text,
                "format_instructions": chain.output_parser.get_format_instructions(),
            }
        ]
    )
    # Render the prompt transcript for display. Prompts are built as
    # [system, human] (or [human] alone), so the system message — when
    # present — is messages[0] and the human message is last.
    # BUG FIX: the labels were swapped (messages[1] shown as System and
    # messages[0] as Human).
    question = ""
    if len(formatted_prompt.messages) > 1:
        question += f"**System:**\n{formatted_prompt.messages[0].content}"
    question += f"\n\n**Human:**\n{formatted_prompt.messages[-1].content}"
    answer = result.generations[0][0].text
    try:
        information = chain.output_parser.parse_with_prompt(answer, formatted_prompt)
        information = information.json() if information else {}
    except OutputParserException as e:
        # Surface parse failures in the JSON panel instead of crashing the app.
        information = {
            "error": "Unable to parse chatbot output",
            "details": str(e),
            "output": e.llm_output,
        }
    return (
        gr.Chatbot.update([[question, answer]]),
        gr.JSON.update(information),
        gr.Button.update(interactive=True),
        gr.Button.update(interactive=True),
    )
141
+
142
+
143
def dynamic_auto_flag(flag_method):
    """Wrap a flagging callback so it fires only when sharing is enabled.

    The wrapper consumes the share checkbox value as its first argument and
    forwards the remaining arguments to ``flag_method`` only when it is truthy.
    """

    def modified_flag_method(share_result, *args, **kwargs):
        if not share_result:
            return
        flag_method(*args, **kwargs)

    return modified_flag_method
149
+
150
+
151
+ # def save_example_and_submit(input_file):
152
+ # example_paths.append(input_file.name)
153
+ # submit(input_file, "")
154
+
155
+
156
# Top-level UI definition. Component construction order and the event wiring
# below are order-dependent; comments only, no code changes.
with gr.Blocks(title="Automatic Reimbursement Tool Demo") as page:
    gr.Markdown("<center><h1>Automatic Reimbursement Tool Demo</h1></center>")
    gr.Markdown("<h2>Description</h2>")
    gr.Markdown(
        "The reimbursement filing process can be time-consuming and cumbersome, causing "
        "frustration for faculty members and finance departments. Our project aims to "
        "automate the information extraction involved in the process by feeding "
        "extracted text to language models such as ChatGPT. This demo showcases the "
        "categorization and extraction parts of the pipeline. Categorization is done "
        "to identify the relevant details associated with the text, after which "
        "extraction is done for those details using a language model."
    )
    gr.Markdown("<h2>Try it out!</h2>")
    with gr.Box() as demo:
        with gr.Row():
            # Left column: receipt upload, previews, and sharing options.
            with gr.Column(variant="panel"):
                gr.HTML(
                    '<div><center style="color:rgb(200, 200, 200);">Input</center></div>'
                )
                pdf_preview = gr.HTML(label="Preview", show_label=True, visible=False)
                image_preview = gr.Image(
                    label="Preview", show_label=True, visible=False, height=350
                )
                input_file = gr.File(
                    label="Input receipt",
                    show_label=True,
                    type="file",
                    file_count="single",
                    file_types=["image", ".pdf"],
                )
                # Refresh the PDF/image preview whenever the upload changes.
                input_file.change(
                    display_file, input_file, [pdf_preview, image_preview]
                )

                with gr.Row():
                    clear = gr.Button("Clear", variant="secondary")
                    submit_button = gr.Button("Submit", variant="primary")

                show_intermediate = gr.Checkbox(
                    False,
                    label="Show intermediate outputs",
                    info="There are several intermediate steps in the process such as preprocessing, OCR, chatbot interaction. You can choose to show their results here.",
                )
                share_result = gr.Checkbox(
                    True,
                    label="Share results",
                    info="Sharing your result with us will help us immensely in improving this tool.",
                    interactive=True,
                )
                contact = gr.Textbox(
                    type="email",
                    label="Contact",
                    interactive=True,
                    placeholder="Enter your email address",
                    info="Optionally, enter your email address to allow us to contact you regarding your result.",
                    visible=True,
                )
                # The contact box is only relevant when results are shared.
                share_result.change(show_share_contact, share_result, [contact])

            # Right column: recognized category, intermediate outputs, and
            # the final extracted information with flagging buttons.
            with gr.Column(variant="panel"):
                gr.HTML(
                    '<div><center style="color:rgb(200, 200, 200);">Output</center></div>'
                )
                category = gr.Dropdown(
                    value=None,
                    choices=Category.__members__.keys(),
                    label=f"Recognized category ({', '.join(Category.__members__.keys())})",
                    show_label=True,
                    interactive=False,
                )
                intermediate_outputs = gr.Accordion(
                    "Intermediate outputs", open=True, visible=False
                )
                with intermediate_outputs:
                    extracted_text = gr.Textbox(
                        label="Extracted text",
                        show_label=True,
                        max_lines=5,
                        show_copy_button=True,
                        lines=5,
                        interactive=False,
                    )
                    chatbot = gr.Chatbot(
                        None,
                        label="Chatbot interaction",
                        show_label=True,
                        interactive=False,
                        height=240,
                    )
                information = gr.JSON(label="Extracted information")
                with gr.Row():
                    flag_incorrect_button = gr.Button(
                        "Flag as incorrect", variant="stop", interactive=True
                    )
                    flag_irrelevant_button = gr.Button(
                        "Flag as irrelevant", variant="stop", interactive=True
                    )

    show_intermediate.change(
        show_intermediate_outputs, show_intermediate, [intermediate_outputs]
    )

    clear.click(clear_inputs, None, [input_file])
    # Submitting triggers two handlers: text extraction, and a reset of the
    # downstream outputs (category/chatbot/JSON) when a file is present.
    submit_button.click(
        submit,
        [input_file, extracted_text],
        [extracted_text],
    )
    submit_button.click(
        lambda input_file, category, chatbot, information: (
            gr.Dropdown.update(None),
            gr.Chatbot.update(None),
            gr.Textbox.update(None),
        ) if input_file else (category, chatbot, information),
        [input_file, category, chatbot, information],
        [category, chatbot, information],
    )
    # Pipeline chaining: new extracted text -> categorize; new category ->
    # run the category-specific extraction chain.
    extracted_text.change(
        categorize_extracted_text,
        [extracted_text],
        [category],
    )
    category.change(
        extract_from_category,
        [category, extracted_text],
        [chatbot, information, flag_incorrect_button, flag_irrelevant_button],
    )

    # Auto-share every completed result (gated on the share checkbox by
    # dynamic_auto_flag) to the "normal" dataset.
    hf_writer_normal.setup(
        [input_file, extracted_text, category, chatbot, information, contact],
        flagging_dir="flagged",
    )
    flag_method = gr.flagging.FlagMethod(
        hf_writer_normal, "", "", visual_feedback=True
    )
    information.change(
        dynamic_auto_flag(flag_method),
        inputs=[
            share_result,
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=None,
        preprocess=False,
        queue=False,
    )

    # Manual flagging: both buttons write to the "incorrect" dataset with
    # different labels, showing a "Saving..." state while the write runs.
    hf_writer_incorrect.setup(
        [input_file, extracted_text, category, chatbot, information, contact],
        flagging_dir="flagged_incorrect",
    )
    flag_incorrect_method = gr.flagging.FlagMethod(
        hf_writer_incorrect,
        "Flag as incorrect",
        "Incorrect",
        visual_feedback=True,
    )
    flag_incorrect_button.click(
        lambda: gr.Button.update(value="Saving...", interactive=False),
        None,
        flag_incorrect_button,
        queue=False,
    )
    flag_incorrect_button.click(
        flag_incorrect_method,
        inputs=[
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=[flag_incorrect_button],
        preprocess=False,
        queue=False,
    )

    flag_irrelevant_method = gr.flagging.FlagMethod(
        hf_writer_incorrect,
        "Flag as irrelevant",
        "Irrelevant",
        visual_feedback=True,
    )
    flag_irrelevant_button.click(
        lambda: gr.Button.update(value="Saving...", interactive=False),
        None,
        flag_irrelevant_button,
        queue=False,
    )
    flag_irrelevant_button.click(
        flag_irrelevant_method,
        inputs=[
            input_file,
            extracted_text,
            category,
            chatbot,
            information,
            contact,
        ],
        outputs=[flag_irrelevant_button],
        preprocess=False,
        queue=False,
    )


page.launch(show_api=True, show_error=True, debug=True)
categories/__init__.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import Union
3
+
4
+ # from . import vendor
5
+ from langchain.chains import LLMChain
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.output_parsers import PydanticOutputParser
8
+ from langchain.output_parsers.enum import EnumOutputParser
9
+ from langchain.prompts import (ChatPromptTemplate, HumanMessagePromptTemplate,
10
+ SystemMessagePromptTemplate)
11
+ from pydantic import BaseModel
12
+
13
+ from . import accomodation, random_, travel_cab, travel_flight
14
+
15
+
16
class Category(Enum):
    """Receipt categories the classifier can assign.

    Each member maps to an extraction module via ``category_modules`` below.
    NOTE(review): "ACCOMODATION" misspells "accommodation"; kept as-is because
    the value is part of the prompt/parse contract.
    """

    ACCOMODATION = "ACCOMODATION"
    TRAVEL_FLIGHT = "TRAVEL_FLIGHT"
    TRAVEL_CAB = "TRAVEL_CAB"
    # VENDOR = "VENDOR"
    RANDOM = "RANDOM"
22
+
23
+
24
# Maps each Category member to the module implementing its extraction chain.
category_modules = {
    Category.ACCOMODATION: accomodation,
    Category.TRAVEL_FLIGHT: travel_flight,
    Category.TRAVEL_CAB: travel_cab,
    # Category.VENDOR: vendor,
    Category.RANDOM: random_,
}

# Deterministic (temperature=0) chat model used only for categorization.
model = ChatOpenAI(
    temperature=0,
    n=1,
    # max_tokens=300,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build categorizing chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are a classifier that, given a bill's text, states what type of bill "
    "category it belongs to: accomodation (bills regarding stays), travel (bills "
    "concerning cab or other land rides), travel (bills concerning flights), random "
    "(bills concerning deliveries from e-commerce websites like amazon etc) bills.\n"
    "You may want to see if there are Room Details, Check-in/Check-out Date for "
    "Accomodation stay; Flight Details, Train Details, Bus Details Cab details for "
    "Travel; Conference Details for Conference organizers; anything else comes under "
    "random category. Your answers must be only the appropriate choice e.g. 'option' and "
    "not 'The given bill belongs to the option category.'\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Parses the model's raw reply directly into a Category member.
category_parser = EnumOutputParser(enum=Category)
categorize_chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=category_parser
)
65
+
66
+
67
def categorize_text(text: str) -> Category:
    """Categorize the text into one of the categories defined in ``Category``
    by querying ChatGPT.

    The enum output parser attached to ``categorize_chain`` converts the
    model's reply into a ``Category`` member.

    Args:
        text (str): The text to categorize.

    Returns:
        The category of the text.
    """
    return categorize_chain.run(
        text=text, format_instructions=category_parser.get_format_instructions()
    )
79
+
80
+
81
def run_category_chain(category: Category, text: str) -> Union[BaseModel, None]:
    """Runs the chain for the given category on the given text.

    Args:
        category (Category): The category for which the chain is to be run.
        text (str): The text on which the chain is to be run.

    Returns:
        The parsed output of the chain, or None if the chain raised.
    """
    output_parser = category_modules[category].output_parser
    try:
        return category_modules[category].chain.run(
            text=text, format_instructions=output_parser.get_format_instructions()
        )
    except Exception as e:
        # Deliberate best-effort: LLM/parsing failures are printed and
        # swallowed (returning None) so one bad receipt can't crash callers.
        print("Error in running chain for category", category, ":", e)
97
+
98
+
99
+ if __name__ == "__main__":
100
+ text = """amazonin
101
+ we)
102
+
103
+ Sold By :
104
+
105
+ Spigen India Pvt. Ltd.
106
+
107
+ * Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
108
+ 37//15/1, 15/2,, Adjacent to Starex School, Village
109
+ - Binola, National Highway -8, Tehsil - Manesar
110
+ Gurgaon, Haryana, 122413
111
+
112
+ IN
113
+
114
+ PAN No: ABACS5056L
115
+ GST Registration No: O6ABACS5056L12Z5
116
+
117
+ Order Number: 407-5335982-7837125
118
+ Order Date: 30.05.2023
119
+
120
+ Tax Invoice/Bill of Supply/Cash Memo
121
+ (Original for Recipient)
122
+
123
+ Billing Address :
124
+
125
+ Praveen Bohra
126
+
127
+ E-303, ParkView City 2, Sector 49, Sohna Road
128
+ GURGAON, HARYANA, 122018
129
+
130
+ IN
131
+
132
+ State/UT Code: 06
133
+
134
+ Shipping Address :
135
+
136
+ Praveen Bohra
137
+
138
+ Praveen Bohra
139
+
140
+ E-303, ParkView City 2, Sector 49, Sohna Road
141
+ GURGAON, HARYANA, 122018
142
+
143
+ IN
144
+
145
+ State/UT Code: 06
146
+
147
+ Place of supply: HARYANA
148
+
149
+ Place of delivery: HARYANA
150
+
151
+ Invoice Number : DEL5-21033
152
+ Invoice Details : HR-DEL5-918080915-2324
153
+ Invoice Date : 30.05.2023
154
+
155
+ Description at Tax |Tax /|Tax Total
156
+ p y Rate |Type |Amount|Amount
157
+
158
+ Black) | BO8BHLZHBH ( ACS01744INP )
159
+ HSN:39269099
160
+
161
+ 1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
162
+ 1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
163
+ 9% |SGST| %76.19
164
+
165
+ TOTAL:
166
+
167
+ Amount in Words:
168
+ Nine Hundred Ninety-nine only
169
+
170
+ Whether tax is payable under reverse charge - No
171
+
172
+ For Spigen India Pvt. Ltd.:
173
+ sSoigenrn
174
+
175
+ Authorized Signatory
176
+
177
+ Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
178
+ 2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
179
+
180
+ *ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
181
+
182
+ Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
183
+
184
+ Please note that this invoice is not a demand for payment
185
+
186
+ Page 1 of 1"""
187
+ category = categorize_text(text)
188
+ print("Category:", category)
189
+
190
+ print("\n\n")
191
+ result = run_category_chain(category, text)
192
+ print(result)
categories/accomodation/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Chat model for accommodation extraction.
# NOTE(review): temperature is 0.6 here while sibling category modules use 0 —
# confirm the nondeterminism is intended.
model = ChatOpenAI(
    temperature=0.6,
    max_tokens=300,
    n=1,
    request_timeout=None,
    model_kwargs={
        'stop': None,
        'top_p': 1,
    }
)

# Build category chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are tasked with developing an OCR data extraction system for hotel bills in PDF "
    "format given as text. The system should extract important information necessary for "
    "the reimbursement process from a college. Your prompt should fetch the following "
    "essential details from the hotel bill: hotel name, address, bill number/invoice "
    "number, booking ID / confirmation ID / booking number, check-in date and time, "
    "check-out date and time, total amount, booking platform, bill date.\n"
    "Ensure that the system accurately extracts the above information from the OCR text "
    "of the hotel bill.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Pydantic parser wrapped in an OutputFixingParser so malformed LLM output is
# sent back to the model for correction instead of failing outright.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/accomodation/model.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class InformationExtractedFromABillReceipt(BaseModel):
    """Schema for details extracted from a hotel/accommodation bill.

    1. Hotel Name: [Hotel Name]
    2. Address: [Hotel Address]
    3. Bill number/Invoice number: [Bill Number]
    4. booking ID / Confirmation ID / Booking #: [Booking ID]
    5. Check-in Date and Time: [Check-in Date Time]
    6. Check-out Date and Time: [Check-out Date Time]
    7. Total Amount: [Total Amount Charged]
    8. Booking platform: [Booking Platform]
    9. Bill date: [Bill Date]
    """

    # NOTE(review): "hostel_name" looks like a typo for "hotel_name" (its title
    # says hotel). Renaming would change the JSON schema sent to the LLM and
    # the parsed output keys, so it is left unchanged here.
    hostel_name: str = Field(..., title="The name of the hotel")
    address: str = Field(..., title="The address of the hotel")
    bill_number: str = Field(..., title="The bill number/invoice number")
    booking_id: str = Field(..., title="The booking ID/confirmation ID/booking number")
    check_in_date_time: datetime = Field(..., title="The check-in date and time")
    check_out_date_time: datetime = Field(..., title="The check-out date and time")
    total_amount_charged: float = Field(..., title="The total amount charged")
    booking_platform: str = Field(..., title="The booking platform")
    bill_date: datetime = Field(..., title="The bill date")
categories/random_/__init__.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Deterministic (temperature=0) chat model for generic receipt extraction.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        'stop': None,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
)

# Build category chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, name, currency, date, seller details, summary. You "
    "may use context to make an educated guess about the currency. Use null if you are "
    "unable to find certain details\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Wrap the pydantic parser so malformed LLM output is sent back for fixing.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
38
+
39
+ if __name__ == "__main__":
40
+ text = """amazonin
41
+ we)
42
+
43
+ Sold By :
44
+
45
+ Spigen India Pvt. Ltd.
46
+
47
+ * Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
48
+ 37//15/1, 15/2,, Adjacent to Starex School, Village
49
+ - Binola, National Highway -8, Tehsil - Manesar
50
+ Gurgaon, Haryana, 122413
51
+
52
+ IN
53
+
54
+ PAN No: ABACS5056L
55
+ GST Registration No: O6ABACS5056L12Z5
56
+
57
+ Order Number: 407-5335982-7837125
58
+ Order Date: 30.05.2023
59
+
60
+ Tax Invoice/Bill of Supply/Cash Memo
61
+ (Original for Recipient)
62
+
63
+ Billing Address :
64
+
65
+ Praveen Bohra
66
+
67
+ E-303, ParkView City 2, Sector 49, Sohna Road
68
+ GURGAON, HARYANA, 122018
69
+
70
+ IN
71
+
72
+ State/UT Code: 06
73
+
74
+ Shipping Address :
75
+
76
+ Praveen Bohra
77
+
78
+ Praveen Bohra
79
+
80
+ E-303, ParkView City 2, Sector 49, Sohna Road
81
+ GURGAON, HARYANA, 122018
82
+
83
+ IN
84
+
85
+ State/UT Code: 06
86
+
87
+ Place of supply: HARYANA
88
+
89
+ Place of delivery: HARYANA
90
+
91
+ Invoice Number : DEL5-21033
92
+ Invoice Details : HR-DEL5-918080915-2324
93
+ Invoice Date : 30.05.2023
94
+
95
+ Description at Tax |Tax /|Tax Total
96
+ p y Rate |Type |Amount|Amount
97
+
98
+ Black) | BO8BHLZHBH ( ACS01744INP )
99
+ HSN:39269099
100
+
101
+ 1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
102
+ 1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
103
+ 9% |SGST| %76.19
104
+
105
+ TOTAL:
106
+
107
+ Amount in Words:
108
+ Nine Hundred Ninety-nine only
109
+
110
+ Whether tax is payable under reverse charge - No
111
+
112
+ For Spigen India Pvt. Ltd.:
113
+ sSoigenrn
114
+
115
+ Authorized Signatory
116
+
117
+ Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
118
+ 2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
119
+
120
+ *ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
121
+
122
+ Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
123
+
124
+ Please note that this invoice is not a demand for payment
125
+
126
+ Page 1 of 1"""
127
+ # result = chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions())
128
+ # print(result.json(indent=4))
129
+ result = chain.generate(input_list=[{"text": text, "format_instructions": fixing_parser.get_format_instructions()}])
130
+ print(result)
131
+ result = fixing_parser.parse_with_prompt(result.generations[0][0].text, chain.prompt.format_prompt(text=text, format_instructions=fixing_parser.get_format_instructions()))
132
+ print(result)
133
+ # result = chain.run(text=text, format_instructions=output_parser.get_format_instructions(), verbose=True)
134
+ # print(result)
categories/random_/model.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generated by datamodel-codegen:
2
+ # filename: schema.json
3
+ # timestamp: 2023-07-28T11:36:16+00:00
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import date
8
+ from typing import Dict, Optional, Union
9
+
10
+ import iso4217
11
+ from pydantic import BaseModel, Field, constr, validator, ValidationError
12
+
13
+
14
class TaxItem(BaseModel):
    """GST-style tax total (Indian invoices)."""

    gst: float = Field(
        ...,
        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
    )


class TaxItem1(BaseModel):
    """VAT-style tax total (older invoices)."""

    vat: float = Field(..., title="The total VAT present in the invoice")


class TaxNumberItem(BaseModel):
    """GSTIN form of the seller's tax number (15-char alphanumeric)."""

    gst_number: constr(min_length=15) = Field(
        ..., title="The alphanumeric GSTIN/GST number code"
    )


class TaxNumberItem1(BaseModel):
    """VAT/TIN form of the seller's tax number."""

    vat_number: str = Field(..., title="The VAT/TIN number present in older invoices")


class TaxNumberItem2(BaseModel):
    """UIN form of the seller's tax number (foreign entities)."""

    ui_number: str = Field(..., title="The tax UIN issued to foreign entities")


class SellerDetails(BaseModel):
    """Seller block of the invoice; tax_number accepts GST, VAT or UIN forms."""

    name: Optional[str] = None
    address: Optional[str] = None
    contact: Optional[str] = None
    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
        ..., title="Tax information"
    )
    pan_number: constr(min_length=10, max_length=10) = Field(
        ..., title="The 10-character alphanumeric PAN code"
    )


class UIDDict(BaseModel):
    """Invoice number plus any other unique identifiers found on the bill."""

    invoice_number: str = Field(..., title="The invoice number")
    other_uids: Dict[str, str] = Field(
        ...,
        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
    )
57
+
58
+
59
class InformationExtractedFromABillReceipt(BaseModel):
    """Top-level schema for details extracted from a generic bill/receipt."""

    uids: UIDDict = Field(..., title="Invoice number and other UIDs")
    total: float = Field(..., title="Total amount or price")
    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
    name: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    currency: str = Field(
        default="INR",
        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
    )
    issue_date: date = Field(
        ..., title="The date the invoice was issued"
    )
    seller_details: SellerDetails = Field(..., title="Information about the seller")
    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")

    @validator("currency")
    @classmethod
    def check_currency(cls, v: str) -> str:
        """Validate the currency code against ISO 4217 and normalize to upper case.

        BUG FIX: pydantic validators must raise ValueError/TypeError/
        AssertionError; raising ValidationError directly fails at runtime
        because ValidationError requires (errors, model) constructor args.
        """
        # iso4217.Currency members are lowercase codes (e.g. "inr").
        if not iso4217.Currency.__members__.get(v.lower()):
            raise ValueError(f"{v} is not a valid ISO 4217 currency code")
        return v.upper()
categories/travel_cab/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Deterministic (temperature=0) chat model for cab-journey extraction.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs= {
        'stop': None,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
)

# Build categorizing chain
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents such as date/time/place of departure and arrival.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Pydantic parser wrapped in a fixing parser to recover from malformed output.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=fixing_parser
)
categories/travel_cab/model.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from datetime import date, time

from pydantic import BaseModel, Field


class InformationExtractedFromABillReceipt(BaseModel):
    """Details of a cab/taxi journey parsed from a bill or receipt:
    place/date/time of departure and arrival, and the ticket cost.
    """

    # NOTE: the Field titles below are fed verbatim to the LLM as the JSON
    # schema, so their exact wording is part of the extraction prompt.
    place_from: str = Field(..., title="place where journey starts")
    date_from: date = Field(
        ..., title="date on which journey starts (DD/MM/YYYY)"
    )
    time_from: time = Field(..., title="time at which journey starts")
    place_to: str = Field(..., title="place where journey end")
    date_to: date = Field(..., title="date on which journey end (DD/MM/YYYY)")
    time_to: time = Field(..., title="time at which journey end")
    amount: float = Field(..., title="cost of journey ticket")
categories/travel_flight/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Chain setup for extracting flight travel details from OCR text."""
from .model import InformationExtractedFromABillReceipt as PydanticModel

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

# Deterministic output (greedy decoding).
model = ChatOpenAI(temperature=0)

# Build extraction chain: a single human message carries the task
# description, the parser's {format_instructions} and the OCR text.
human_message_prompt = HumanMessagePromptTemplate.from_template(
    "Parse through and find the following details from the text extracted from a travel "
    "bill\n"
    "{format_instructions}\n"
    "{text}"
)
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
# Parse LLM output into PydanticModel; on a parse failure, OutputFixingParser
# re-prompts the same model to repair the malformed output.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/travel_flight/model.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from datetime import date, time

from pydantic import BaseModel, Field


class InformationExtractedFromABillReceipt(BaseModel):
    """Details of a flight parsed from a travel bill: place/date/time of
    departure and arrival, PNR number, and the ticket cost.

    (The previous docstring was a commented-out ResponseSchema list left over
    from an earlier implementation; since the class docstring becomes the
    JSON-schema description fed to the LLM, it has been replaced by a
    human/LLM-readable summary.)
    """

    # NOTE: the Field titles below are fed verbatim to the LLM as the JSON
    # schema, so their exact wording is part of the extraction prompt.
    place_from: str = Field(..., title="place where flight starts/takes-off")
    date_from: date = Field(
        ..., title="date on which flight starts/takes-off (DD/MM/YYYY)"
    )
    time_from: time = Field(..., title="time at which flight starts/takes-off")
    place_to: str = Field(..., title="place where flight end/lands")
    date_to: date = Field(..., title="date on which flight end/lands (DD/MM/YYYY)")
    time_to: time = Field(..., title="time at which flight end/lands")
    pnr_number: str = Field(..., title="PNR Number of flight")
    amount: float = Field(..., title="cost of flight ticket")
categories/vendor/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Chain setup for extracting vendor invoice/registration details from OCR text."""
from .model import InformationExtractedFromABillReceipt as PydanticModel

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# Deterministic single completion; the sampling knobs are pinned to the
# OpenAI API defaults so results are reproducible.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build extraction chain: system prompt describes the task and carries the
# parser's {format_instructions}; the OCR text arrives as the human turn.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, addresses, bank details, invoice details, "
    "participant registration details."
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
# (Removed a leftover debug `print(output_parser.get_format_instructions())`
# and a commented-out `exit()` that ran on every import of this package.)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/vendor/model.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# generated by datamodel-codegen:
#   filename:  schema.json
#   timestamp: 2023-07-28T11:36:16+00:00

from __future__ import annotations

from datetime import datetime

# NOTE(review): `validator` and `ValidationError` are imported but unused in
# this module -- possibly left over from a removed validator; confirm before
# pruning (other category models do define validators).
from pydantic import BaseModel, Field, constr, validator, ValidationError


class BankDetails(BaseModel):
    """account holder name, bank name, account number, branch, ifs code, swift code"""

    account_holder_name: str = Field(..., title="The name of the account holder")
    bank_name: str = Field(..., title="The name of the bank")
    account_number: str = Field(..., title="The account number")
    branch: str = Field(..., title="The branch of the bank")
    ifs_code: str = Field(..., title="The IFS code of the bank")
    swift_code: str = Field(..., title="The SWIFT code of the bank")


class InformationExtractedFromABillReceipt(BaseModel):
    """
    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
    bank details: (account holder name, bank name, account number, branch, ifs code, swift
    code), recipient, registration id, registration fee, registration date/time
    """

    # NOTE: the Field titles below are fed verbatim to the LLM as the JSON
    # schema, so their exact wording is part of the extraction prompt.
    # A GSTIN is a 15-character code; only a minimum length is enforced here.
    gstin: constr(min_length=15) = Field(
        ..., title="The alphanumeric GSTIN/GST number code"
    )
    billing_address: str = Field(..., title="The billing address")
    invoice_number: str = Field(..., title="The invoice number")
    invoice_date: datetime = Field(..., title="The date-time the invoice was issued")
    due_date: datetime = Field(..., title="The date-time the invoice is due")
    total: float = Field(..., title="Total amount or price")
    balance_due: float = Field(..., title="The amount due")
    bank_details: BankDetails = Field(..., title="Bank details")
    recipient: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    registration_id: str = Field(..., title="The registration ID")
    registration_fee: float = Field(..., title="The registration fee")
    registration_date_time: datetime = Field(..., title="The registration date-time")
environment.yml ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: env
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - asttokens=2.2.1=pyhd8ed1ab_0
7
+ - backcall=0.2.0=pyh9f0ad1d_0
8
+ - backports=1.0=pyhd8ed1ab_3
9
+ - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0
10
+ - boost-cpp=1.78.0=h9f4b32c_3
11
+ - bzip2=1.0.8=h8ffe710_4
12
+ - ca-certificates=2023.7.22=h56e8100_0
13
+ - cairo=1.16.0=hdecc03f_1016
14
+ - colorama=0.4.6=pyhd8ed1ab_0
15
+ - comm=0.1.3=pyhd8ed1ab_0
16
+ - debugpy=1.6.7=py39h99910a6_0
17
+ - decorator=5.1.1=pyhd8ed1ab_0
18
+ - executing=1.2.0=pyhd8ed1ab_0
19
+ - expat=2.5.0=h63175ca_1
20
+ - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
21
+ - font-ttf-inconsolata=3.000=h77eed37_0
22
+ - font-ttf-source-code-pro=2.038=h77eed37_0
23
+ - font-ttf-ubuntu=0.83=hab24e00_0
24
+ - fontconfig=2.14.2=hbde0cde_0
25
+ - fonts-conda-ecosystem=1=0
26
+ - fonts-conda-forge=1=0
27
+ - freetype=2.12.1=h546665d_1
28
+ - gettext=0.21.1=h5728263_0
29
+ - icu=72.1=h63175ca_0
30
+ - importlib-metadata=6.8.0=pyha770c72_0
31
+ - importlib_metadata=6.8.0=hd8ed1ab_0
32
+ - ipykernel=6.25.0=pyh6817e22_0
33
+ - ipython=8.14.0=pyh08f2357_0
34
+ - jedi=0.18.2=pyhd8ed1ab_0
35
+ - jupyter_client=8.3.0=pyhd8ed1ab_0
36
+ - jupyter_core=5.3.1=py39hcbf5309_0
37
+ - krb5=1.21.1=heb0366b_0
38
+ - lcms2=2.15=h3e3b177_1
39
+ - lerc=4.0.0=h63175ca_0
40
+ - libcurl=8.2.1=hd5e4a3a_0
41
+ - libdeflate=1.18=hcfcfb64_0
42
+ - libexpat=2.5.0=h63175ca_1
43
+ - libffi=3.4.2=h8ffe710_5
44
+ - libglib=2.76.4=he8f3873_0
45
+ - libiconv=1.17=h8ffe710_0
46
+ - libjpeg-turbo=2.1.5.1=hcfcfb64_0
47
+ - libpng=1.6.39=h19919ed_0
48
+ - libsodium=1.0.18=h8d14728_1
49
+ - libssh2=1.11.0=h7dfc565_0
50
+ - libtiff=4.5.1=h6c8260b_0
51
+ - libzlib=1.2.13=hcfcfb64_5
52
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
53
+ - nest-asyncio=1.5.6=pyhd8ed1ab_0
54
+ - openjpeg=2.5.0=ha2aaf27_2
55
+ - openssl=3.1.1=hcfcfb64_1
56
+ - packaging=23.1=pyhd8ed1ab_0
57
+ - parso=0.8.3=pyhd8ed1ab_0
58
+ - pcre2=10.40=h17e33f8_0
59
+ - pickleshare=0.7.5=py_1003
60
+ - pip=23.2.1=py39haa95532_0
61
+ - pixman=0.40.0=h8ffe710_0
62
+ - platformdirs=3.9.1=pyhd8ed1ab_0
63
+ - poppler=23.07.0=h45d20d0_0
64
+ - poppler-data=0.4.12=hd8ed1ab_0
65
+ - prompt-toolkit=3.0.39=pyha770c72_0
66
+ - prompt_toolkit=3.0.39=hd8ed1ab_0
67
+ - psutil=5.9.5=py39ha55989b_0
68
+ - pure_eval=0.2.2=pyhd8ed1ab_0
69
+ - pygments=2.15.1=pyhd8ed1ab_0
70
+ - python=3.9.17=h1aa4202_0
71
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
72
+ - python_abi=3.9=2_cp39
73
+ - pywin32=304=py39h99910a6_2
74
+ - pyzmq=25.1.0=py39hea35a22_0
75
+ - setuptools=68.0.0=py39haa95532_0
76
+ - six=1.16.0=pyh6c4a22f_0
77
+ - sqlite=3.41.2=h2bbff1b_0
78
+ - stack_data=0.6.2=pyhd8ed1ab_0
79
+ - tornado=6.3.2=py39ha55989b_0
80
+ - traitlets=5.9.0=pyhd8ed1ab_0
81
+ - typing-extensions=4.7.1=hd8ed1ab_0
82
+ - typing_extensions=4.7.1=pyha770c72_0
83
+ - ucrt=10.0.22621.0=h57928b3_0
84
+ - vc=14.3=h64f974e_17
85
+ - vc14_runtime=14.36.32532=hfdfe4a8_17
86
+ - vs2015_runtime=14.36.32532=h05e6639_17
87
+ - wcwidth=0.2.6=pyhd8ed1ab_0
88
+ - wheel=0.38.4=py39haa95532_0
89
+ - xz=5.2.6=h8d14728_0
90
+ - zeromq=4.3.4=h0e60522_1
91
+ - zipp=3.16.2=pyhd8ed1ab_0
92
+ - zlib=1.2.13=hcfcfb64_5
93
+ - zstd=1.5.2=h12be248_7
94
+ - pip:
95
+ - aiofiles==23.1.0
96
+ - aiohttp==3.8.5
97
+ - aiosignal==1.3.1
98
+ - altair==5.0.1
99
+ - annotated-types==0.5.0
100
+ - anyio==3.7.1
101
+ - async-timeout==4.0.2
102
+ - attrs==23.1.0
103
+ - certifi==2023.7.22
104
+ - charset-normalizer==3.2.0
105
+ - click==8.1.6
106
+ - contourpy==1.1.0
107
+ - cycler==0.11.0
108
+ - dataclasses-json==0.5.13
109
+ - datasets==2.14.1
110
+ - dill==0.3.7
111
+ - exceptiongroup==1.1.2
112
+ - fastapi==0.100.1
113
+ - ffmpy==0.3.1
114
+ - filelock==3.12.2
115
+ - fonttools==4.41.1
116
+ - frozenlist==1.4.0
117
+ - fsspec==2023.6.0
118
+ - gradio==3.39.0
119
+ - gradio-client==0.3.0
120
+ - greenlet==2.0.2
121
+ - h11==0.14.0
122
+ - httpcore==0.17.3
123
+ - httpx==0.24.1
124
+ - huggingface-hub==0.16.4
125
+ - idna==3.4
126
+ - importlib-resources==6.0.0
127
+ - iso4217==1.11.20220401
128
+ - jinja2==3.1.2
129
+ - jsonschema==4.18.4
130
+ - jsonschema-specifications==2023.7.1
131
+ - kiwisolver==1.4.4
132
+ - langchain==0.0.247
133
+ - langsmith==0.0.15
134
+ - linkify-it-py==2.0.2
135
+ - markdown-it-py==2.2.0
136
+ - markupsafe==2.1.3
137
+ - marshmallow==3.20.1
138
+ - matplotlib==3.7.2
139
+ - mdit-py-plugins==0.3.3
140
+ - mdurl==0.1.2
141
+ - multidict==6.0.4
142
+ - multiprocess==0.70.15
143
+ - mypy-extensions==1.0.0
144
+ - numexpr==2.8.4
145
+ - numpy==1.25.1
146
+ - openai==0.27.8
147
+ - openapi-schema-pydantic==1.2.4
148
+ - opencv-python-headless==4.8.0.74
149
+ - orjson==3.9.2
150
+ - pandas==2.0.3
151
+ - pdf2image==1.16.3
152
+ - pillow==10.0.0
153
+ - pyarrow==12.0.1
154
+ - pydantic==1.10.12
155
+ - pydantic-core==2.4.0
156
+ - pydub==0.25.1
157
+ - pyocr==0.8.3
158
+ - pyparsing==3.0.9
159
+ - pypdf==3.13.0
160
+ - pypiwin32==223
161
+ - python-multipart==0.0.6
162
+ - pytz==2023.3
163
+ - pyyaml==6.0.1
164
+ - referencing==0.30.0
165
+ - requests==2.31.0
166
+ - rpds-py==0.9.2
167
+ - semantic-version==2.10.0
168
+ - sniffio==1.3.0
169
+ - sqlalchemy==2.0.19
170
+ - starlette==0.27.0
171
+ - tenacity==8.2.2
172
+ - toolz==0.12.0
173
+ - tqdm==4.65.0
174
+ - typing-inspect==0.9.0
175
+ - tzdata==2023.3
176
+ - uc-micro-py==1.0.2
177
+ - urllib3==2.0.4
178
+ - uvicorn==0.23.1
179
+ - websockets==11.0.3
180
+ - xxhash==3.3.0
181
+ - yarl==1.9.2
examples/example1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
3
+ size 45782
examples/rotated.jpeg ADDED

Git LFS Details

  • SHA256: e98aa24e25b2c3f277c237664cba4616fbe5d80fe3099459fb81e2ef3720d23c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
examples/rotated.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
3
+ size 333463
examples/upright.jpeg ADDED

Git LFS Details

  • SHA256: 728be2c94b4af573145e5e89ffe5c3dfddb12a3055b85e60a23bd7697cff83f7
  • Pointer size: 132 Bytes
  • Size of remote file: 2.93 MB
examples/upright.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
3
+ size 325064
extract.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responsible for extracting text from images and PDFs using OCR engines or other modules.
2
+ """
3
+ from io import BytesIO
4
+ from typing import List
5
+
6
+ import pyocr.tesseract
7
+ import pypdf
8
+ from PIL import Image
9
+
10
+
11
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Extracts text from the given PDF file using pypdf.

    Args:
        bytes_stream (BytesIO): The PDF file to extract text from.

    Returns: The extracted text, with each page followed by a blank-line
             separator.
    """
    reader = pypdf.PdfReader(bytes_stream)
    return "".join(page.extract_text() + "\n\n" for page in reader.pages)
25
+
26
+
27
def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
    """Extracts text from the given image using tesseract via pyocr.

    Args:
        image(PIL.Image.Image): The image to extract text from.

    Returns: The extracted text.

    Raises:
        RuntimeError: If the tesseract backend is not available. (Previously a
            bare ``Exception``; ``RuntimeError`` is a subclass of ``Exception``
            so existing handlers still catch it.)
    """
    if not pyocr.tesseract.is_available():
        raise RuntimeError("Tesseract is not available.")
    # English is hard-coded; parameterize `lang` if other locales are needed.
    text = pyocr.tesseract.image_to_string(image, lang="eng")
    return text
39
+
40
+
41
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extracts text from the given images using tesseract via pyocr.

    Note: every image in *images* is closed after its text has been read.

    Args:
        images(List[PIL.Image.Image]): The images to extract text from.

    Returns: The extracted text, one blank-line separator after each image.
    """
    chunks = []
    for img in images:
        chunks.append(extract_text_from_image_pyocr_tesseract(img))
        chunks.append("\n\n")
        img.close()
    return "".join(chunks)
55
+
56
if __name__ == '__main__':
    # Manual smoke test: run both extractors against the bundled examples.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)
    print("-"*25)
    filename = 'examples/upright.jpeg'
    image = Image.open(filename)
    text = extract_text_from_image_pyocr_tesseract(image)
    print(text)
    image.close()
main.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import categories
4
+ import processing
5
+ import extract
6
+ from PIL import Image
7
+ from pydantic import BaseModel
8
+ from io import BytesIO
9
+
10
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorizes the text and parses the information from it.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The information parsed by the chain of the detected category.
    """
    detected_category = categories.categorize_text(text)
    return categories.run_category_chain(detected_category, text)
23
+
24
def process_pdf(filename: Path, extract_only=False) -> BaseModel:
    """Processes the given PDF file and extracts information from it.

    Args:
        filename(Path): The PDF file to process.
        extract_only: When True, return the raw extracted text instead of
            the parsed information.

    Returns: The extracted information (or the raw text if extract_only).
    """
    with open(filename, "rb") as f:
        pdf_bytes = f.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
    if len(text) < 20:
        # The embedded text layer is suspiciously short -- the PDF is likely
        # a scan (a scanner may only have added a watermark as text), so
        # rasterize the pages and OCR them instead.
        pages = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(pages)

    if extract_only:
        return text
    return categorize_and_parse_text(text)
45
+
46
def process_image(filename: Path, extract_only=False) -> BaseModel:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.
        extract_only: When True, return the raw extracted text instead of
            the parsed information.

    Returns: The extracted information (or the raw text if extract_only).
    """
    # preprocess_image closes the freshly opened image and returns an
    # orientation-corrected copy.
    preprocessed = processing.preprocess_image(Image.open(filename))
    text = extract.extract_text_from_image_pyocr_tesseract(preprocessed)
    preprocessed.close()

    if extract_only:
        return text
    return categorize_and_parse_text(text)
62
+
63
if __name__ == "__main__":
    # Manual smoke test: parse the bundled example invoice end to end and
    # pretty-print the resulting pydantic model as JSON.
    filename = Path("examples/example1.pdf")
    result = process_pdf(filename)
    print(result.json(indent=4))
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ poppler-utils
processing.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responsible for (pre)processing images and PDFs before they are passed to the OCR
2
+ engine and other miscellaneous actions concerning processing.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ # import cv2
9
+ # import numpy as np
10
+ import pyocr
11
+ from pdf2image import pdf2image
12
+ from PIL import Image #, ImageOps
13
+
14
+ PDF_CONVERSION_DPI = 300
15
+ ROTATION_CONFIDENCE_THRESHOLD = 2.0
16
+
17
+ # def rotate_image(image: Image, angle: float):
18
+ # """Rotates the given image by the given angle.
19
+
20
+ # Args:
21
+ # image(PIL.Image.Image): The image to be rotated.
22
+ # angle(float): The angle to rotate the image by.
23
+
24
+ # Returns: The rotated image.
25
+ # """
26
+ # image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
27
+ # height, width, _ = image.shape # Get the image height, width, and channels
28
+ # # Compute the rotation matrix
29
+ # rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
30
+ # # Apply the rotation to the image
31
+ # rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
32
+ # rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
33
+ # return rotated_image
34
+
35
+
36
+ # class PDF_CONVERTER(enum.Enum):
37
+ # PDF2IMAGE = 1
38
+ # IMAGEMAGICK = 2
39
+
40
+
41
def correct_orientation(image: Image.Image) -> Image.Image:
    """Corrects the orientation of an image if it is not upright.

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: The corrected pillow image as a copy. The original image is not closed.

    Raises:
        Exception: if the tesseract backend is not available.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    # image = ImageOps.exif_transpose(image) # EXIF rotation is apparent, not actual
    orientation_info = {}
    try:
        # Tesseract orientation-and-script detection (OSD).
        orientation_info = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as e:
        # Best-effort: on OSD failure, fall through to the defaults below.
        print("Orientation detection failed: {}".format(e))
    # output = pytesseract.image_to_osd(
    #     image, config=" --psm 0", output_type=pytesseract.Output.DICT
    # )
    # Defaults when detection failed: angle 0 at confidence 100, i.e. the
    # branch below rotates by 0 degrees -- effectively a plain copy.
    angle = orientation_info.get("angle", 0)
    confidence = orientation_info.get("confidence", 100)
    # rotate = output["rotate"]
    # confidence = output["orientation_conf"]

    # NOTE(review): assumes pyocr's "angle" is the amount to rotate *by* to
    # make the page upright (not the skew to undo) -- confirm with pyocr docs.
    if confidence > ROTATION_CONFIDENCE_THRESHOLD:
        new_image = image.rotate(angle, expand=True)
    else:
        new_image = image.copy()
    return new_image
71
+
72
+
73
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Converts a PDF to an image using pdf2image.

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be converted.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
83
+
84
+
85
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to an image using ImageMagick.

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages. Pages
                                   are saved in the folder as page.jpg or as page-01.jpg,
                                   page-02.jpg, etc.

    Returns: dest_folder

    Raises:
        subprocess.CalledProcessError: if the ImageMagick invocation fails.
    """
    import subprocess  # local import; keeps the module's top-level imports untouched

    # Bug fix: the original os.system() call joined adjacent f-string pieces
    # with no separating spaces, producing a single mangled word such as
    # "magick convert-density 300...-quality 100...", so the conversion could
    # never run. Using an argument list (shell=False) also avoids shell
    # quoting problems with paths containing spaces.
    subprocess.run(
        [
            "magick", "convert",
            "-density", str(PDF_CONVERSION_DPI),
            str(filename),
            "-quality", "100",
            str(dest_folder / "page.jpg"),
        ],
        check=True,
    )
    return dest_folder
102
+
103
+
104
def preprocess_image(image: Image.Image) -> Image.Image:
    """Preprocesses an image for future use with OCR.
    The following operations are performed:
      1. Orientation correction

    Note: the input image is closed; work continues on the returned copy.

    Args:
        image(PIL.Image.Image): The image to be preprocessed.

    Returns: The preprocessed pillow image.
    """
    upright = correct_orientation(image)
    image.close()
    return upright
118
+
119
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
      1. PDF to image conversion
      2. Orientation correction

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    processed = []
    for page in convert_pdf_to_image_pdf2image(pdf_bytes):
        corrected = preprocess_image(page)
        # preprocess_image already closes its input; this second close is a
        # harmless no-op kept for symmetry with the rest of the module.
        page.close()
        processed.append(corrected)
    return processed
137
+
138
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
      1. PDF to image conversion (via ImageMagick, into a temp directory)
      2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    import tempfile  # local import; keeps the module's top-level imports untouched

    # Bug fixes:
    #   * ``dest_folder`` was read before ever being assigned (NameError);
    #     a temporary directory is now created for the converted pages.
    #   * the glob yields Path objects, which were passed straight to
    #     preprocess_image as if they were PIL images; they are now opened
    #     first. preprocess_image closes each opened image itself.
    dest_folder = convert_pdf_to_image_ImageMagick(
        filename, Path(tempfile.mkdtemp())
    )
    result = []
    for page_path in sorted(dest_folder.glob("*.jpg")):
        result.append(preprocess_image(Image.open(page_path)))
    return result
156
+
157
if __name__ == '__main__':
    # Manual smoke test: preprocess a single image, then every page of a
    # rotated PDF, displaying each result in the system image viewer.
    filename = 'examples/upright.jpeg'
    image = Image.open(filename)
    new_image = preprocess_image(image)
    # preprocess_image already closes its input; this close is a no-op.
    image.close()
    new_image.show()
    new_image.close()

    filename = 'examples/rotated.pdf'
    with open(filename, 'rb') as file:
        bytes_ = bytes(file.read())
    images = preprocess_pdf_pdf2image(bytes_)
    for image in images:
        image.show()
        image.close()
requirements.txt ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.5
3
+ aiosignal==1.3.1
4
+ altair==5.0.1
5
+ annotated-types==0.5.0
6
+ anyio==3.7.1
7
+ asttokens==2.2.1
8
+ async-timeout==4.0.2
9
+ attrs==23.1.0
10
+ backcall==0.2.0
11
+ backports.functools-lru-cache==1.6.5
12
+ certifi==2023.7.22
13
+ charset-normalizer==3.2.0
14
+ click==8.1.6
15
+ colorama==0.4.6
16
+ comm==0.1.3
17
+ contourpy==1.1.0
18
+ cycler==0.11.0
19
+ dataclasses-json==0.5.13
20
+ datasets==2.14.1
21
+ debugpy==1.6.7
22
+ decorator==5.1.1
23
+ dill==0.3.7
24
+ exceptiongroup==1.1.2
25
+ executing==1.2.0
26
+ fastapi==0.100.1
27
+ ffmpy==0.3.1
28
+ filelock==3.12.2
29
+ fonttools==4.41.1
30
+ frozenlist==1.4.0
31
+ fsspec==2023.6.0
32
+ gradio==3.39.0
33
+ gradio_client==0.3.0
34
+ greenlet==2.0.2
35
+ h11==0.14.0
36
+ httpcore==0.17.3
37
+ httpx==0.24.1
38
+ huggingface-hub==0.16.4
39
+ idna==3.4
40
+ importlib-metadata==6.8.0
41
+ importlib-resources==6.0.0
42
+ ipykernel==6.25.0
43
+ ipython==8.14.0
44
+ iso4217==1.11.20220401
45
+ jedi==0.18.2
46
+ Jinja2==3.1.2
47
+ jsonschema==4.18.4
48
+ jsonschema-specifications==2023.7.1
49
+ jupyter_client==8.3.0
50
+ jupyter_core==5.3.1
51
+ kiwisolver==1.4.4
52
+ langchain==0.0.247
53
+ langsmith==0.0.15
54
+ linkify-it-py==2.0.2
55
+ markdown-it-py==2.2.0
56
+ MarkupSafe==2.1.3
57
+ marshmallow==3.20.1
58
+ matplotlib==3.7.2
59
+ matplotlib-inline==0.1.6
60
+ mdit-py-plugins==0.3.3
61
+ mdurl==0.1.2
62
+ multidict==6.0.4
63
+ multiprocess==0.70.15
64
+ mypy-extensions==1.0.0
65
+ nest-asyncio==1.5.6
66
+ numexpr==2.8.4
67
+ numpy==1.25.1
68
+ openai==0.27.8
69
+ openapi-schema-pydantic==1.2.4
70
+ opencv-python-headless==4.8.0.74
71
+ orjson==3.9.2
72
+ packaging==23.1
73
+ pandas==2.0.3
74
+ parso==0.8.3
75
+ pdf2image==1.16.3
76
+ pickleshare==0.7.5
77
+ Pillow==10.0.0
78
+ pip==23.2.1
79
+ platformdirs==3.9.1
80
+ prompt-toolkit==3.0.39
81
+ psutil==5.9.5
82
+ pure-eval==0.2.2
83
+ pyarrow==12.0.1
84
+ pydantic==1.10.12
85
+ pydantic_core==2.4.0
86
+ pydub==0.25.1
87
+ Pygments==2.15.1
88
+ pyocr==0.8.3
89
+ pyparsing==3.0.9
90
+ pypdf==3.13.0
91
+ pypiwin32==223
92
+ python-dateutil==2.8.2
93
+ python-multipart==0.0.6
94
+ pytz==2023.3
95
+ pywin32==304
96
+ PyYAML==6.0.1
97
+ pyzmq==25.1.0
98
+ referencing==0.30.0
99
+ requests==2.31.0
100
+ rpds-py==0.9.2
101
+ semantic-version==2.10.0
102
+ setuptools==68.0.0
103
+ six==1.16.0
104
+ sniffio==1.3.0
105
+ SQLAlchemy==2.0.19
106
+ stack-data==0.6.2
107
+ starlette==0.27.0
108
+ tenacity==8.2.2
109
+ toolz==0.12.0
110
+ tornado==6.3.2
111
+ tqdm==4.65.0
112
+ traitlets==5.9.0
113
+ typing_extensions==4.7.1
114
+ typing-inspect==0.9.0
115
+ tzdata==2023.3
116
+ uc-micro-py==1.0.2
117
+ urllib3==2.0.4
118
+ uvicorn==0.23.1
119
+ wcwidth==0.2.6
120
+ websockets==11.0.3
121
+ wheel==0.38.4
122
+ xxhash==3.3.0
123
+ yarl==1.9.2
124
+ zipp==3.16.2