Kuldip2411 committed on
Commit
5c9f913
verified
1 Parent(s): 133cf5d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +107 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from paddleocr import PaddleOCR
4
+ from langchain_groq import ChatGroq
5
+ from langchain.output_parsers import PydanticOutputParser
6
+ from langchain_core.prompts import PromptTemplate
7
+ from pydantic import BaseModel, Field
8
+ import fitz
9
+ import json
10
+ from PIL import Image
11
# Page configuration is issued first so that it precedes every other
# Streamlit command in the script.
st.set_page_config(layout="wide")


@st.cache_resource
def _load_ocr():
    """Build the Spanish PaddleOCR engine once and reuse it across reruns.

    Streamlit re-executes the whole script on every user interaction;
    caching the engine as a resource avoids constructing it again each time.
    """
    return PaddleOCR(use_angle_cls=True, lang='es')


ocr = _load_ocr()
14
+
15
# Schema of the values extracted from a dealership order document.
# Every field is a plain string; the Spanish ``description`` texts are passed
# to the LLM through the output parser's format instructions, so they must be
# correctly encoded (the accented characters had been garbled).
# NOTE: documented with comments rather than a class docstring on purpose,
# so that no extra text is added to the schema sent to the model.
class CarInfoEntity(BaseModel):
    # --- Dealer ---
    dealer_name: str = Field(description="Nombre del concesionario o empresa.")
    dealer_address: str = Field(description="Dirección física del concesionario.")
    tax_id: str = Field(description="Número de identificación fiscal del concesionario.")
    contact_phone: str = Field(description="Número de teléfono principal para contactar con el concesionario.")
    contact_fax: str = Field(description="Número de fax del concesionario.")
    contact_email: str = Field(description="Dirección de correo electrónico para consultas.")
    website_url: str = Field(description="Sitio web oficial del concesionario.")
    operating_hours: str = Field(description="Horario habitual de atención del concesionario.")
    saturday_hours: str = Field(description="Horario de atención específico para los sábados.")
    # --- Order ---
    order_date: str = Field(description="Fecha en que se realizó el pedido.")
    order_number: str = Field(description="Identificador único del pedido.")
    sales_rep: str = Field(description="Nombre del vendedor que maneja la transacción.")
    # --- Customer ---
    customer_full_name: str = Field(description="Nombre completo del comprador.")
    customer_address: str = Field(description="Dirección del comprador.")
    customer_city: str = Field(description="Ciudad donde reside el comprador.")
    customer_postal_code: str = Field(description="Código postal de la dirección del comprador.")
    customer_province: str = Field(description="Provincia donde se encuentra el comprador.")
    customer_id: str = Field(description="Número de identificación del comprador (NIF).")
    customer_phone: str = Field(description="Número de teléfono del comprador.")
    # --- Vehicle ---
    vehicle_description: str = Field(description="Descripción del vehículo que se está comprando, incluyendo marca, modelo y año.")
    vehicle_color: str = Field(description="Color del vehículo.")
    vehicle_price: str = Field(description="Precio total del vehículo, incluyendo impuestos.")
38
+
39
# The Groq API key is read from the environment (for example a deployment
# secret) instead of being embedded in the source, where anyone who can read
# the repository could copy it.
# NOTE(review): the key that used to be hard-coded here remains visible in
# the repository history and should be revoked and replaced.
groq_api_key = os.environ.get('GROQ_API_KEY')
if not groq_api_key:
    st.error("GROQ_API_KEY is not set. Configure it as an environment variable or secret.")
    st.stop()

# Chat model used for the entity extraction.
model = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0,  # keep extraction output as repeatable as possible
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=groq_api_key,
)
49
+
50
# Names of the values to extract; the list is interpolated into the prompt's
# {entity} placeholder when the chain is invoked.
entity = [
    # dealer
    'dealer_name',
    'dealer_address',
    'tax_id',
    'contact_phone',
    'contact_fax',
    'contact_email',
    'website_url',
    'operating_hours',
    'saturday_hours',
    # order
    'order_date',
    'order_number',
    'sales_rep',
    # customer
    'customer_full_name',
    'customer_address',
    'customer_city',
    'customer_postal_code',
    'customer_province',
    'customer_id',
    'customer_phone',
    # vehicle
    'vehicle_description',
    'vehicle_color',
    'vehicle_price',
]
54
+
55
# Streamlit App

# Prompt template sent to the LLM; PromptTemplate fills in the placeholders.
prompt_text = """Task: Analyze the {all_text} and find out given entity value:{entity} from the {all_text}:

Output Format: A table with the entity and value. First column contains the {entity} and second column contains the value fetched from the {all_text}.

Do not include any additional explanations or unnecessary details.
{format_instructions}"""


def _save_upload_to_temp_pdf(uploaded):
    """Write the uploaded file to a uniquely named temporary PDF.

    A unique file per upload keeps concurrent sessions from overwriting each
    other's document (a fixed file name in the working directory would be
    shared by every session).

    Returns:
        The path of the temporary file; the caller is responsible for
        deleting it.
    """
    import tempfile  # local import: only needed by this helper

    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded.read())
    return tmp.name


def _collect_text(ocr_pages):
    """Flatten the OCR result into a list of recognized text strings.

    ``ocr_pages`` is expected to hold one entry per page, each entry being a
    list of results whose second element starts with the recognized text.
    Entries that are empty or ``None`` (PaddleOCR may return ``None`` for a
    page without detected text) are skipped instead of raising.
    """
    if not ocr_pages:
        return []
    return [result[1][0] for page in ocr_pages if page for result in page]


st.title("Vehicle Information Extractor")
st.write("Upload a PDF file to extract vehicle information.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    pdf_path = _save_upload_to_temp_pdf(uploaded_file)

    col1, col2 = st.columns(2)

    try:
        # Left column: preview every page of the uploaded PDF.
        with col1:
            # The context manager closes the document once rendering is done.
            with fitz.open(pdf_path) as doc:
                st.write("Uploaded PDF:")
                for page_num, page in enumerate(doc, start=1):
                    pix = page.get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    st.image(img, caption=f"Page {page_num}", use_column_width=True)

        # PaddleOCR needs a file path to process a PDF.
        content = ocr.ocr(pdf_path)
    finally:
        # Always remove the temporary copy, even if rendering or OCR failed.
        os.remove(pdf_path)

    all_text = " ".join(_collect_text(content))

    if not all_text.strip():
        # Nothing to analyse: skip the LLM call rather than sending an empty prompt.
        with col2:
            st.warning("No text could be extracted from the uploaded PDF.")
    else:
        parser = PydanticOutputParser(pydantic_object=CarInfoEntity)

        prompt = PromptTemplate(
            template=prompt_text,
            input_variables=["all_text", "entity"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )

        # prompt -> LLM -> structured CarInfoEntity
        chain = prompt | model | parser

        output = chain.invoke({"all_text": all_text, "entity": entity})

        # Right column: show the extracted values.
        with col2:
            st.write("Extracted Vehicle Information (Table):")
            st.table(output)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ paddleocr==2.8.1
2
+ langchain==0.3.3
3
+ paddlepaddle==2.6.2
4
+ langchain_groq==0.2.0
5
+ PyMuPDF==1.24.11
6
+ pillow==10.4.0
7
+ streamlit==1.39.0