Update app.py
app.py CHANGED
@@ -5,6 +5,7 @@ import PyPDF2
 import pandas as pd
 import torch
 import os
+import re
 
 # Set page configuration
 st.set_page_config(
@@ -14,12 +15,12 @@ st.set_page_config(
 )
 
 # Load Hugging Face token from environment variable
-HF_TOKEN = os.getenv("HF_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN")
 
 # Model name
 MODEL_NAME = "amiguel/instruct_BERT-base-uncased_model"
 
-# Label mapping
+# Label mapping
 LABEL_TO_CLASS = {
     0: "Campaign", 1: "Corrosion Monitoring", 2: "Flare Tip", 3: "Flare TIP",
     4: "FU Items", 5: "Intelligent Pigging", 6: "Lifting", 7: "Non Structural Tank",
@@ -38,8 +39,8 @@ BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/99
 with st.sidebar:
     st.header("Upload Documents π")
     uploaded_file = st.file_uploader(
-        "Choose a PDF or XLSX file",
-        type=["pdf", "xlsx"],
+        "Choose a PDF, XLSX, or CSV file",
+        type=["pdf", "xlsx", "csv"],
         label_visibility="collapsed"
     )
 
@@ -47,22 +48,38 @@ with st.sidebar:
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
-# File processing function
+# File processing function with pre-processing
 @st.cache_data
 def process_file(uploaded_file):
     if uploaded_file is None:
-        return
+        return None
 
     try:
         if uploaded_file.type == "application/pdf":
             pdf_reader = PyPDF2.PdfReader(uploaded_file)
-            return "\n".join([page.extract_text() for page in pdf_reader.pages])
+            text = "\n".join([page.extract_text() for page in pdf_reader.pages])
+            # Basic pre-processing
+            text = re.sub(r'\s+', ' ', text.lower().strip())
+            return {"type": "text", "content": text}
+
         elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
             df = pd.read_excel(uploaded_file)
-            return df.to_markdown()
+        elif uploaded_file.type == "text/csv":
+            df = pd.read_csv(uploaded_file)
+
+        # For tabular data (xlsx, csv), detect scope columns
+        if 'df' in locals():
+            scope_cols = [col for col in df.columns if "scope" in col.lower()]
+            if not scope_cols:
+                st.warning("No 'scope' column found in the file. Using all data as context.")
+                return {"type": "table", "content": df.to_markdown()}
+            # Pre-process scope data element-wise, so each cell is cleaned individually
+            scope_data = df[scope_cols].dropna().astype(str).applymap(lambda x: re.sub(r'\s+', ' ', x.lower().strip()))
+            return {"type": "scope", "content": scope_data}
+
     except Exception as e:
         st.error(f"π Error processing file: {str(e)}")
-        return
+        return None
 
 # Model loading function
 @st.cache_resource
@@ -73,19 +90,14 @@ def load_model(hf_token):
             return None
 
         login(token=hf_token)
-
-        # Load tokenizer and model for classification
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
         model = AutoModelForSequenceClassification.from_pretrained(
             MODEL_NAME,
-            num_labels=len(LABEL_TO_CLASS),
+            num_labels=len(LABEL_TO_CLASS),
             token=hf_token
         )
-
-        # Determine device
        device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
-
         return model, tokenizer
 
     except Exception as e:
@@ -94,30 +106,36 @@ def load_model(hf_token):
 
 # Classification function
 def classify_instruction(prompt, file_context, model, tokenizer):
-    full_prompt = f"Context:\n{file_context}\n\nInstruction: {prompt}"
-
     model.eval()
     device = model.device
 
-    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    prediction = outputs.logits.argmax().item()
-
-    return LABEL_TO_CLASS[prediction]
+    if file_context["type"] == "scope":
+        # Batch prediction for multiple scope entries
+        predictions = []
+        for scope in file_context["content"].values.flatten():
+            full_prompt = f"Context:\n{scope}\n\nInstruction: {prompt}"
+            inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = model(**inputs)
+            prediction = outputs.logits.argmax().item()
+            predictions.append(LABEL_TO_CLASS[prediction])
+        return predictions
+    else:
+        # Single prediction for text or table context
+        full_prompt = f"Context:\n{file_context['content']}\n\nInstruction: {prompt}"
+        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+        prediction = outputs.logits.argmax().item()
+        return LABEL_TO_CLASS[prediction]
 
 # Display chat messages
 for message in st.session_state.messages:
-    try:
-        avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
-        with st.chat_message(message["role"], avatar=avatar):
-            st.markdown(message["content"])
-    except:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
+    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
+    with st.chat_message(message["role"], avatar=avatar):
+        st.markdown(message["content"])
 
 # Chat input handling
 if prompt := st.chat_input("Ask your inspection question..."):
@@ -127,7 +145,6 @@ if prompt := st.chat_input("Ask your inspection question..."):
         if model_data is None:
             st.error("Failed to load model. Please ensure HF_TOKEN is set correctly.")
             st.stop()
-
         st.session_state.model, st.session_state.tokenizer = model_data
 
     model = st.session_state.model
@@ -140,14 +157,29 @@ if prompt := st.chat_input("Ask your inspection question..."):
 
     # Process file context
     file_context = process_file(uploaded_file)
+    if file_context is None:
+        st.error("No file uploaded or file processing failed.")
+        st.stop()
 
     # Classify the instruction
     if model and tokenizer:
         try:
             with st.chat_message("assistant", avatar=BOT_AVATAR):
-                prediction = classify_instruction(prompt, file_context, model, tokenizer)
-                response = f"The Item Class is: {prediction}"
-                st.markdown(response)
+                predicted_output = classify_instruction(prompt, file_context, model, tokenizer)
+                if file_context["type"] == "scope":
+                    # Display multiple predictions in a table
+                    scope_values = file_context["content"].values.flatten()
+                    result_df = pd.DataFrame({
+                        "Scope": scope_values,
+                        "Predicted Class": predicted_output
+                    })
+                    st.write("Predicted Classes:")
+                    st.table(result_df)
+                    response = "Predictions completed for multiple scope entries."
+                else:
+                    # Single prediction
+                    response = f"The Item Class is: {predicted_output}"
+                    st.markdown(response)
             st.session_state.messages.append({"role": "assistant", "content": response})
 
         except Exception as e:
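
For reference, the pre-processing this commit applies to both the extracted PDF text and the scope cells lowercases the input and collapses every whitespace run to a single space. A minimal sketch of what the regex does (the sample string is illustrative):

import re

raw = "  Flare   TIP\nInspection\t Campaign "
clean = re.sub(r'\s+', ' ', raw.lower().strip())
print(clean)  # -> "flare tip inspection campaign"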
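
A note on the batch path: classify_instruction tokenizes and classifies one scope entry at a time, which is simple but slow for large files. Hugging Face tokenizers also accept a list of strings and pad them into a single batch; a minimal alternative sketch under the same model and label mapping (batch_classify and its parameters are illustrative names, not part of this commit):

import torch

def batch_classify(scopes, prompt, model, tokenizer, label_map, max_length=128):
    # Build one prompt per scope entry and tokenize them together;
    # padding=True pads every prompt to the longest in the batch.
    prompts = [f"Context:\n{s}\n\nInstruction: {prompt}" for s in scopes]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True,
                       truncation=True, max_length=max_length)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits  # shape: (batch_size, num_labels)
    # One argmax per row, mapped back to class names
    return [label_map[i] for i in logits.argmax(dim=-1).tolist()]

A single forward pass over the padded batch replaces the per-entry loop; the attention mask keeps each row's prediction independent of the padding.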