Spaces:

moazx
/

Code-Vulnerability-Detector

Sleeping

App Files Files Community

moazx committed 24 days ago

Commit

0264912

verified ·

1 Parent(s): 28993e8

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -223

app.py CHANGED Viewed

@@ -1,223 +1,135 @@
-import streamlit as st
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import pandas as pd
-# Page configuration
-st.set_page_config(
-    page_title="DiverseVul Code Vulnerability Classifier",
-    page_icon="🔍",
-    layout="wide"
-)
-# Example code snippets
-VULNERABLE_EXAMPLE = """static int cirrus_bitblt_videotovideo_patterncopy(CirrusVGAState * s)\n{\n
-return cirrus_bitblt_common_patterncopy(s,\n\t\t\t\t\t    s->vram_ptr +\n                                            (s->cirrus_blt_srcaddr & ~7));\n}"""
-NON_VULNERABLE_EXAMPLE = """static void loongarch_cpu_synchronize_from_tb(CPUState *cs,
-\n const TranslationBlock *tb)\n{\n    LoongArchCPU *cpu = LOONGARCH_CPU(cs);\n    CPULoongArchState *env = &cpu->env;\n\n    env->pc = tb->pc;\n}"""
-@st.cache_resource
-def load_model():
-    """Load the model and tokenizer with caching"""
-    # Fine-tuned model
-    model_name = "trained_model"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = model.to(device)
-    model.eval()
-    return model, tokenizer, device
-def classify_code_sample(code_sample, model, tokenizer, device, max_length=512):
-    """Classify a single code sample and get probabilities"""
-    inputs = tokenizer(
-        code_sample,
-        truncation=True,
-        padding='max_length',
-        max_length=max_length,
-        return_tensors="pt"
-    ).to(device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-    probabilities = F.softmax(logits, dim=-1).squeeze().cpu().numpy()
-    return probabilities
-def main():
-    st.title("DiverseVul Code Vulnerability Classifier")
-    st.write("""
-    This tool analyzes code snippets for various types of vulnerabilities, including but not limited to:
-    - Security vulnerabilities (e.g., buffer overflows, injection flaws)
-    - Memory management issues
-    - Concurrency problems
-    - Resource leaks
-    - Logic errors
-    - Performance issues
-    - Reliability problems
-    """)
-    # Load model and tokenizer
-    try:
-        with st.spinner("Loading model..."):
-            model, tokenizer, device = load_model()
-        st.success("Model loaded successfully!")
-    except Exception as e:
-        st.error(f"Error loading model: {str(e)}")
-        return
-    # Example buttons
-    st.subheader("Try an Example")
-    col1, col2 = st.columns(2)
-    with col1:
-        if st.button("📋 Load Vulnerable Example"):
-            st.session_state['code_input'] = VULNERABLE_EXAMPLE
-    with col2:
-        if st.button("📋 Load Non-Vulnerable Example"):
-            st.session_state['code_input'] = NON_VULNERABLE_EXAMPLE
-    # Input area
-    st.subheader("Input Code")
-    code_input = st.text_area(
-        "Enter your code snippet here:",
-        value=st.session_state.get('code_input', ''),
-        height=300,
-        help="Paste your code here for comprehensive vulnerability analysis"
-    )
-    # Analysis button
-    if st.button("Analyze Code"):
-        if not code_input.strip():
-            st.warning("Please enter some code to analyze.")
-            return
-        with st.spinner("Analyzing code..."):
-            try:
-                # Get predictions
-                probabilities = classify_code_sample(code_input, model, tokenizer, device)
-                # Create results section
-                st.subheader("Analysis Results")
-                # Display prediction with confidence
-                class_names = ["Non-vulnerable", "Vulnerable"]
-                predicted_class_index = probabilities.argmax()
-                predicted_class = class_names[predicted_class_index]
-                confidence = probabilities[predicted_class_index] * 100
-                # Create columns for layout
-                col1, col2 = st.columns(2)
-                # Display prediction and confidence
-                with col1:
-                    st.metric(
-                        "Prediction",
-                        predicted_class,
-                        help="The model's classification of the code"
-                    )
-                with col2:
-                    st.metric(
-                        "Confidence",
-                        f"{confidence:.1f}%",
-                        help="How confident the model is in its prediction"
-                    )
-                # Create a DataFrame for detailed probabilities
-                results_df = pd.DataFrame({
-                    'Class': class_names,
-                    'Probability': probabilities
-                })
-                # Display probability distribution
-                st.subheader("Detailed Probabilities")
-                st.bar_chart(
-                    results_df.set_index('Class')['Probability']
-                )
-                # Additional information and disclaimers
-                if predicted_class == "Vulnerable":
-                    st.warning("""
-                        ⚠️ This code has been flagged as potentially vulnerable.
-                        Please review it carefully for various types of vulnerabilities including:
-                        Security:
-                        - Input validation
-                        - Authentication issues
-                        - Access control problems
-                        Implementation:
-                        - Memory management
-                        - Resource handling
-                        - Error handling
-                        Design:
-                        - Concurrency issues
-                        - Logic errors
-                        - Performance problems
-                        Best Practices:
-                        - Code structure
-                        - Error handling patterns
-                        - Resource cleanup
-                    """)
-                st.info("""
-                    Note: This tool is trained on the DiverseVul dataset, which covers 150 different
-                    types of Common Weakness Enumeration (CWE) categories. While comprehensive, it
-                    should be used as part of a larger code review process. False positives and
-                    negatives are possible.
-                """)
-            except Exception as e:
-                st.error(f"Error during analysis: {str(e)}")
-    # Add sidebar with information
-    with st.sidebar:
-        st.header("About")
-        st.write("""
-        This tool uses a machine learning model trained on the DiverseVul dataset, which includes:
-        - 18,945 vulnerable functions
-        - 330,492 non-vulnerable functions
-        - 150 different CWE types
-        - Code from thousands of real-world projects
-        """)
-        st.subheader("Example Code Explanation")
-        st.write("""
-        The vulnerable example contains:
-        - SQL injection vulnerability
-        - Path traversal vulnerability
-        - Buffer overflow vulnerability
-        The non-vulnerable example shows:
-        - Parameterized SQL queries
-        - Safe path validation
-        - Proper buffer bounds checking
-        """)
-        st.subheader("How to Use")
-        st.write("""
-        1. Click an example button or paste your code
-        2. Click 'Analyze Code'
-        3. Review the results and probability scores
-        4. Consider all flagged issues in context
-        5. Verify findings with manual review
-        """)
-        st.subheader("Limitations")
-        st.write("""
-        - The model may not catch all vulnerabilities
-        - Some safe code might be flagged as vulnerable
-        - Results should be verified by domain experts
-        - Performance varies across different CWE types
-        - Best used as part of a comprehensive code review process
-        """)
-if __name__ == "__main__":
-    main()

+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import gradio as gr
+# Example code snippets.
+# These are the texts the two "Load ... Example" buttons place into the input
+# textbox. Each literal mixes "\n" escapes with real line breaks inside the
+# triple-quoted string, so the resulting text contains both kinds of newline.
+# NOTE(review): the snippets look like C functions; the labels "vulnerable" /
+# "non-vulnerable" are taken from the constant names only -- confirm the source.
+VULNERABLE_EXAMPLE = """static int cirrus_bitblt_videotovideo_patterncopy(CirrusVGAState * s)\n{\n
+return cirrus_bitblt_common_patterncopy(s,\n\t\t\t\t\t    s->vram_ptr +\n                                            (s->cirrus_blt_srcaddr & ~7));\n}"""
+NON_VULNERABLE_EXAMPLE = """static void loongarch_cpu_synchronize_from_tb(CPUState *cs,
+\n const TranslationBlock *tb)\n{\n    LoongArchCPU *cpu = LOONGARCH_CPU(cs);\n    CPULoongArchState *env = &cpu->env;\n\n    env->pc = tb->pc;\n}"""
+# Load the model and tokenizer
+def load_model():
+    """Load the model and tokenizer"""
+    model_name = "moazx/Code-Vulnerability-Classifier_app"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = model.to(device)
+    model.eval()
+    return model, tokenizer, device
+# Load the model and tokenizer once, at import time, so that every request
+# handled by classify_code_sample() reuses the same model/tokenizer/device
+# instead of reloading them per call.
+model, tokenizer, device = load_model()
+def classify_code_sample(code_sample):
+    """Classify a single code sample and get probabilities"""
+    inputs = tokenizer(
+        code_sample,
+        truncation=True,
+        padding='max_length',
+        max_length=512,
+        return_tensors="pt"
+    ).to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+    probabilities = F.softmax(logits, dim=-1).squeeze().cpu().numpy()
+    return probabilities
+def analyze_code(code_input):
+    """Analyze the code and return results"""
+    if not code_input.strip():
+        return "Please enter some code to analyze."
+    try:
+        # Get predictions
+        probabilities = classify_code_sample(code_input)
+        # Class names and confidence
+        class_names = ["Non-vulnerable", "Vulnerable"]
+        predicted_class_index = probabilities.argmax()
+        predicted_class = class_names[predicted_class_index]
+        confidence = probabilities[predicted_class_index] * 100
+        # Prepare results
+        result = f"**Prediction:** {predicted_class}\n"
+        result += f"**Confidence:** {confidence:.1f}%\n\n"
+        # Detailed probabilities
+        result += "**Detailed Probabilities:**\n"
+        for class_name, prob in zip(class_names, probabilities):
+            result += f"- {class_name}: {prob * 100:.1f}%\n"
+        # Additional warnings for vulnerable code
+        if predicted_class == "Vulnerable":
+            result += "\n⚠️ **Warning:** This code has been flagged as potentially vulnerable. Please review it carefully for:\n"
+            result += "- Security issues (e.g., input validation, authentication)\n"
+            result += "- Implementation issues (e.g., memory management, resource handling)\n"
+            result += "- Design issues (e.g., concurrency, logic errors)\n"
+        return result
+    except Exception as e:
+        return f"Error during analysis: {str(e)}"
+# Gradio Interface
+# Builds the page: a title and description, an input column (textbox plus
+# "Analyze Code" button) next to an output column, a row of example buttons,
+# and the click handlers that connect them.
+with gr.Blocks() as demo:
+    gr.Markdown("# DiverseVul Code Vulnerability Classifier")
+    gr.Markdown("""
+    This tool analyzes code snippets for various types of vulnerabilities, including:
+    - Security vulnerabilities (e.g., buffer overflows, injection flaws)
+    - Memory management issues
+    - Concurrency problems
+    - Resource leaks
+    - Logic errors
+    - Performance issues
+    - Reliability problems
+    """)
+    with gr.Row():
+        # First column: where the user pastes code and starts the analysis.
+        with gr.Column():
+            code_input = gr.Textbox(
+                label="Enter your code snippet here:",
+                placeholder="Paste your code here...",
+                lines=10,
+                max_lines=20,
+                value=""
+            )
+            analyze_button = gr.Button("Analyze Code")
+        # Second column: shows the Markdown string returned by analyze_code.
+        with gr.Column():
+            output = gr.Markdown(label="Analysis Results")
+    # Example buttons
+    gr.Markdown("### Try an Example")
+    with gr.Row():
+        vulnerable_example_button = gr.Button("📋 Load Vulnerable Example")
+        non_vulnerable_example_button = gr.Button("📋 Load Non-Vulnerable Example")
+    # Event handlers
+    # Pass the textbox contents to analyze_code and render its return value.
+    analyze_button.click(
+        analyze_code,
+        inputs=code_input,
+        outputs=output
+    )
+    # The example handlers take no inputs; whatever the lambda returns becomes
+    # the new textbox value.
+    vulnerable_example_button.click(
+        lambda: VULNERABLE_EXAMPLE,
+        outputs=code_input
+    )
+    non_vulnerable_example_button.click(
+        lambda: NON_VULNERABLE_EXAMPLE,
+        outputs=code_input
+    )
+# Launch the Gradio app
+# Only when executed as a script; importing this module builds `demo` without
+# calling launch().
+if __name__ == "__main__":
+    demo.launch()