amiguel committed
Commit 418028a · verified · 1 Parent(s): 33be14e

Update app.py

Files changed (1): app.py +69 -37
app.py CHANGED
@@ -5,6 +5,7 @@ import PyPDF2
 import pandas as pd
 import torch
 import os
+import re
 
 # Set page configuration
 st.set_page_config(
@@ -14,12 +15,12 @@ st.set_page_config(
 )
 
 # Load Hugging Face token from environment variable
-HF_TOKEN = os.getenv("HF_TOKEN")  # Set this in your Space's secrets
+HF_TOKEN = os.getenv("HF_TOKEN")
 
 # Model name
 MODEL_NAME = "amiguel/instruct_BERT-base-uncased_model"
 
-# Label mapping (same as in Colab)
+# Label mapping
 LABEL_TO_CLASS = {
     0: "Campaign", 1: "Corrosion Monitoring", 2: "Flare Tip", 3: "Flare TIP",
     4: "FU Items", 5: "Intelligent Pigging", 6: "Lifting", 7: "Non Structural Tank",
@@ -38,8 +39,8 @@ BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/99
 with st.sidebar:
     st.header("Upload Documents 📂")
     uploaded_file = st.file_uploader(
-        "Choose a PDF or XLSX file",
-        type=["pdf", "xlsx"],
+        "Choose a PDF, XLSX, or CSV file",
+        type=["pdf", "xlsx", "csv"],
         label_visibility="collapsed"
     )
 
@@ -47,22 +48,38 @@ with st.sidebar:
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
-# File processing function
+# File processing function with pre-processing
 @st.cache_data
 def process_file(uploaded_file):
     if uploaded_file is None:
-        return ""
+        return None
 
     try:
         if uploaded_file.type == "application/pdf":
             pdf_reader = PyPDF2.PdfReader(uploaded_file)
-            return "\n".join([page.extract_text() for page in pdf_reader.pages])
+            text = "\n".join([page.extract_text() for page in pdf_reader.pages])
+            # Basic pre-processing
+            text = re.sub(r'\s+', ' ', text.lower().strip())
+            return {"type": "text", "content": text}
+
         elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
             df = pd.read_excel(uploaded_file)
-            return df.to_markdown()
+        elif uploaded_file.type == "text/csv":
+            df = pd.read_csv(uploaded_file)
+
+        # For tabular data (xlsx, csv), detect scope columns
+        if 'df' in locals():
+            scope_cols = [col for col in df.columns if "scope" in col.lower()]
+            if not scope_cols:
+                st.warning("No 'scope' column found in the file. Using all data as context.")
+                return {"type": "table", "content": df.to_markdown()}
+            # Pre-process scope data
+            scope_data = df[scope_cols].dropna().astype(str).apply(lambda x: re.sub(r'\s+', ' ', x.lower().strip()))
+            return {"type": "scope", "content": scope_data}
+
     except Exception as e:
         st.error(f"📄 Error processing file: {str(e)}")
-        return ""
+        return None
 
 # Model loading function
 @st.cache_resource
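
Review note on the scope pre-processing above: DataFrame.apply hands the lambda each whole column as a pandas Series, and a Series has no .lower() method, so x.lower() will raise AttributeError the first time a scope column is cleaned. A minimal corrected sketch, keeping the commit's column-detection logic (the clean_scope_columns helper name is hypothetical, not part of the commit):

    import pandas as pd

    def clean_scope_columns(df: pd.DataFrame) -> pd.DataFrame:
        # Hypothetical helper: element-wise cleanup of every "scope" column.
        scope_cols = [col for col in df.columns if "scope" in col.lower()]
        cleaned = df[scope_cols].dropna().astype(str)
        # The .str accessor applies string methods to each element of a column,
        # which is what the lambda in the commit appears to intend.
        return cleaned.apply(
            lambda col: col.str.lower().str.strip().str.replace(r"\s+", " ", regex=True)
        )
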
@@ -73,19 +90,14 @@ def load_model(hf_token):
             return None
 
         login(token=hf_token)
-
-        # Load tokenizer and model for classification
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
         model = AutoModelForSequenceClassification.from_pretrained(
             MODEL_NAME,
-            num_labels=len(LABEL_TO_CLASS),  # Ensure correct number of labels
+            num_labels=len(LABEL_TO_CLASS),
             token=hf_token
         )
-
-        # Determine device
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
-
         return model, tokenizer
 
     except Exception as e:
@@ -94,30 +106,36 @@ def load_model(hf_token):
 
 # Classification function
 def classify_instruction(prompt, file_context, model, tokenizer):
-    full_prompt = f"Context:\n{file_context}\n\nInstruction: {prompt}"
-
     model.eval()
     device = model.device
 
-    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-        prediction = outputs.logits.argmax().item()
-        class_name = LABEL_TO_CLASS[prediction]
-
-    return class_name
+    if file_context["type"] == "scope":
+        # Batch prediction for multiple scope entries
+        predictions = []
+        for scope in file_context["content"].values.flatten():
+            full_prompt = f"Context:\n{scope}\n\nInstruction: {prompt}"
+            inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = model(**inputs)
+                prediction = outputs.logits.argmax().item()
+            predictions.append(LABEL_TO_CLASS[prediction])
+        return predictions
+    else:
+        # Single prediction for text or table context
+        full_prompt = f"Context:\n{file_context['content']}\n\nInstruction: {prompt}"
+        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+            prediction = outputs.logits.argmax().item()
+        return LABEL_TO_CLASS[prediction]
 
 # Display chat messages
 for message in st.session_state.messages:
-    try:
-        avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
-        with st.chat_message(message["role"], avatar=avatar):
-            st.markdown(message["content"])
-    except:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
+    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
+    with st.chat_message(message["role"], avatar=avatar):
+        st.markdown(message["content"])
 
 # Chat input handling
 if prompt := st.chat_input("Ask your inspection question..."):
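
The scope branch above issues one forward pass per scope entry, which is simple but slow for large uploads. The same predictions can come from a single padded batch; a sketch under the commit's own model and tokenizer assumptions, reusing the app's LABEL_TO_CLASS mapping (the classify_scopes_batched name is hypothetical):

    import torch

    def classify_scopes_batched(full_prompts, model, tokenizer):
        # Hypothetical batched variant of the loop above: tokenize every
        # prompt at once, run one forward pass, take the per-row argmax.
        inputs = tokenizer(full_prompts, return_tensors="pt", padding=True,
                           truncation=True, max_length=128)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        return [LABEL_TO_CLASS[i] for i in logits.argmax(dim=-1).tolist()]

For very large files, splitting full_prompts into fixed-size chunks keeps memory bounded.
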
@@ -127,7 +145,6 @@ if prompt := st.chat_input("Ask your inspection question..."):
         if model_data is None:
             st.error("Failed to load model. Please ensure HF_TOKEN is set correctly.")
             st.stop()
-
         st.session_state.model, st.session_state.tokenizer = model_data
 
     model = st.session_state.model
@@ -140,14 +157,29 @@ if prompt := st.chat_input("Ask your inspection question..."):
 
     # Process file context
     file_context = process_file(uploaded_file)
+    if file_context is None:
+        st.error("No file uploaded or file processing failed.")
+        st.stop()
 
     # Classify the instruction
     if model and tokenizer:
         try:
             with st.chat_message("assistant", avatar=BOT_AVATAR):
-                predicted_class = classify_instruction(prompt, file_context, model, tokenizer)
-                response = f"The Item Class is: {predicted_class}"
-                st.markdown(response)
+                predicted_output = classify_instruction(prompt, file_context, model, tokenizer)
+                if file_context["type"] == "scope":
+                    # Display multiple predictions in a table
+                    scope_values = file_context["content"].values.flatten()
+                    result_df = pd.DataFrame({
+                        "Scope": scope_values,
+                        "Predicted Class": predicted_output
+                    })
+                    st.write("Predicted Classes:")
+                    st.table(result_df)
+                    response = "Predictions completed for multiple scope entries."
+                else:
+                    # Single prediction
+                    response = f"The Item Class is: {predicted_output}"
+                st.markdown(response)
             st.session_state.messages.append({"role": "assistant", "content": response})
 
         except Exception as e:
 
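Taken together, the commit changes process_file's contract: it now returns None on failure, or a tagged payload ("text", "table", or "scope") that classify_instruction and the chat handler dispatch on. An illustrative sketch of the scope path (sample rows are hypothetical; actual predictions depend on the model):

    import pandas as pd

    # Hypothetical CSV-like upload with a scope column.
    df = pd.DataFrame({"Scope of Work": ["replace flare tip", "install lifting beam"]})
    scope_cols = [c for c in df.columns if "scope" in c.lower()]  # ["Scope of Work"]
    file_context = {"type": "scope", "content": df[scope_cols]}

    # classify_instruction flattens file_context["content"].values and returns
    # one predicted class per row, which the handler then renders via st.table.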