amiguel committed
Commit 418028a · verified · 1 Parent(s): 33be14e

Update app.py

Files changed (1): app.py +69 -37
app.py CHANGED
@@ -5,6 +5,7 @@ import PyPDF2
 import pandas as pd
 import torch
 import os
+import re
 
 # Set page configuration
 st.set_page_config(
@@ -14,12 +15,12 @@ st.set_page_config(
 )
 
 # Load Hugging Face token from environment variable
-HF_TOKEN = os.getenv("HF_TOKEN")  # Set this in your Space's secrets
+HF_TOKEN = os.getenv("HF_TOKEN")
 
 # Model name
 MODEL_NAME = "amiguel/instruct_BERT-base-uncased_model"
 
-# Label mapping (same as in Colab)
+# Label mapping
 LABEL_TO_CLASS = {
     0: "Campaign", 1: "Corrosion Monitoring", 2: "Flare Tip", 3: "Flare TIP",
     4: "FU Items", 5: "Intelligent Pigging", 6: "Lifting", 7: "Non Structural Tank",
@@ -38,8 +39,8 @@ BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/99
 with st.sidebar:
     st.header("Upload Documents 📂")
     uploaded_file = st.file_uploader(
-        "Choose a PDF or XLSX file",
-        type=["pdf", "xlsx"],
+        "Choose a PDF, XLSX, or CSV file",
+        type=["pdf", "xlsx", "csv"],
         label_visibility="collapsed"
     )
 
@@ -47,22 +48,38 @@ with st.sidebar:
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
-# File processing function
+# File processing function with pre-processing
 @st.cache_data
 def process_file(uploaded_file):
     if uploaded_file is None:
-        return ""
+        return None
 
     try:
         if uploaded_file.type == "application/pdf":
             pdf_reader = PyPDF2.PdfReader(uploaded_file)
-            return "\n".join([page.extract_text() for page in pdf_reader.pages])
+            text = "\n".join([page.extract_text() for page in pdf_reader.pages])
+            # Basic pre-processing
+            text = re.sub(r'\s+', ' ', text.lower().strip())
+            return {"type": "text", "content": text}
+
         elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
             df = pd.read_excel(uploaded_file)
-            return df.to_markdown()
+        elif uploaded_file.type == "text/csv":
+            df = pd.read_csv(uploaded_file)
+
+        # For tabular data (xlsx, csv), detect scope columns
+        if 'df' in locals():
+            scope_cols = [col for col in df.columns if "scope" in col.lower()]
+            if not scope_cols:
+                st.warning("No 'scope' column found in the file. Using all data as context.")
+                return {"type": "table", "content": df.to_markdown()}
+            # Pre-process scope data
+            scope_data = df[scope_cols].dropna().astype(str).apply(lambda x: re.sub(r'\s+', ' ', x.lower().strip()))
+            return {"type": "scope", "content": scope_data}
+
     except Exception as e:
         st.error(f"📄 Error processing file: {str(e)}")
-        return ""
+        return None
 
 # Model loading function
 @st.cache_resource
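
Review note on the scope pre-processing above: DataFrame.apply hands the lambda each whole column as a pandas Series, and a Series has no .lower() method, so x.lower() will raise AttributeError the first time a scope column is cleaned. A minimal corrected sketch, keeping the commit's column-detection logic (the clean_scope_columns helper name is hypothetical, not part of the commit):

    import pandas as pd

    def clean_scope_columns(df: pd.DataFrame) -> pd.DataFrame:
        # Hypothetical helper: element-wise cleanup of every "scope" column.
        scope_cols = [col for col in df.columns if "scope" in col.lower()]
        cleaned = df[scope_cols].dropna().astype(str)
        # The .str accessor applies string methods to each element of a column,
        # which is what the lambda in the commit appears to intend.
        return cleaned.apply(
            lambda col: col.str.lower().str.strip().str.replace(r"\s+", " ", regex=True)
        )
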
@@ -73,19 +90,14 @@ def load_model(hf_token):
             return None
 
         login(token=hf_token)
-
-        # Load tokenizer and model for classification
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
         model = AutoModelForSequenceClassification.from_pretrained(
             MODEL_NAME,
-            num_labels=len(LABEL_TO_CLASS),  # Ensure correct number of labels
+            num_labels=len(LABEL_TO_CLASS),
             token=hf_token
         )
-
-        # Determine device
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
-
         return model, tokenizer
 
     except Exception as e:
@@ -94,30 +106,36 @@ def load_model(hf_token):
 
 # Classification function
 def classify_instruction(prompt, file_context, model, tokenizer):
-    full_prompt = f"Context:\n{file_context}\n\nInstruction: {prompt}"
-
     model.eval()
     device = model.device
 
-    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-        prediction = outputs.logits.argmax().item()
-        class_name = LABEL_TO_CLASS[prediction]
-
-    return class_name
+    if file_context["type"] == "scope":
+        # Batch prediction for multiple scope entries
+        predictions = []
+        for scope in file_context["content"].values.flatten():
+            full_prompt = f"Context:\n{scope}\n\nInstruction: {prompt}"
+            inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = model(**inputs)
+                prediction = outputs.logits.argmax().item()
+            predictions.append(LABEL_TO_CLASS[prediction])
+        return predictions
+    else:
+        # Single prediction for text or table context
+        full_prompt = f"Context:\n{file_context['content']}\n\nInstruction: {prompt}"
+        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+            prediction = outputs.logits.argmax().item()
+        return LABEL_TO_CLASS[prediction]
 
 # Display chat messages
 for message in st.session_state.messages:
-    try:
-        avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
-        with st.chat_message(message["role"], avatar=avatar):
-            st.markdown(message["content"])
-    except:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
+    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
+    with st.chat_message(message["role"], avatar=avatar):
+        st.markdown(message["content"])
 
 # Chat input handling
 if prompt := st.chat_input("Ask your inspection question..."):
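
The scope branch above issues one forward pass per scope entry, which is simple but slow for large uploads. The same predictions can come from a single padded batch; a sketch under the commit's own model and tokenizer assumptions, reusing the app's LABEL_TO_CLASS mapping (the classify_scopes_batched name is hypothetical):

    import torch

    def classify_scopes_batched(full_prompts, model, tokenizer):
        # Hypothetical batched variant of the loop above: tokenize every
        # prompt at once, run one forward pass, take the per-row argmax.
        inputs = tokenizer(full_prompts, return_tensors="pt", padding=True,
                           truncation=True, max_length=128)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        return [LABEL_TO_CLASS[i] for i in logits.argmax(dim=-1).tolist()]

For very large files, splitting full_prompts into fixed-size chunks keeps memory bounded.
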
@@ -127,7 +145,6 @@ if prompt := st.chat_input("Ask your inspection question..."):
         if model_data is None:
             st.error("Failed to load model. Please ensure HF_TOKEN is set correctly.")
             st.stop()
-
         st.session_state.model, st.session_state.tokenizer = model_data
 
     model = st.session_state.model
@@ -140,14 +157,29 @@ if prompt := st.chat_input("Ask your inspection question..."):
 
     # Process file context
     file_context = process_file(uploaded_file)
+    if file_context is None:
+        st.error("No file uploaded or file processing failed.")
+        st.stop()
 
     # Classify the instruction
     if model and tokenizer:
         try:
             with st.chat_message("assistant", avatar=BOT_AVATAR):
-                predicted_class = classify_instruction(prompt, file_context, model, tokenizer)
-                response = f"The Item Class is: {predicted_class}"
-                st.markdown(response)
+                predicted_output = classify_instruction(prompt, file_context, model, tokenizer)
+                if file_context["type"] == "scope":
+                    # Display multiple predictions in a table
+                    scope_values = file_context["content"].values.flatten()
+                    result_df = pd.DataFrame({
+                        "Scope": scope_values,
+                        "Predicted Class": predicted_output
+                    })
+                    st.write("Predicted Classes:")
+                    st.table(result_df)
+                    response = "Predictions completed for multiple scope entries."
+                else:
+                    # Single prediction
+                    response = f"The Item Class is: {predicted_output}"
+                st.markdown(response)
             st.session_state.messages.append({"role": "assistant", "content": response})
 
         except Exception as e:
 
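Taken together, the commit changes process_file's contract: it now returns None on failure, or a tagged payload ("text", "table", or "scope") that classify_instruction and the chat handler dispatch on. An illustrative sketch of the scope path (sample rows are hypothetical; actual predictions depend on the model):

    import pandas as pd

    # Hypothetical CSV-like upload with a scope column.
    df = pd.DataFrame({"Scope of Work": ["replace flare tip", "install lifting beam"]})
    scope_cols = [c for c in df.columns if "scope" in c.lower()]  # ["Scope of Work"]
    file_context = {"type": "scope", "content": df[scope_cols]}

    # classify_instruction flattens file_context["content"].values and returns
    # one predicted class per row, which the handler then renders via st.table.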