Spaces:

tgd1115
/

neuro-orion-v1

Sleeping

App Files Files Community

tgd1115 committed on Dec 16, 2024

Commit

976b948

verified ·

1 Parent(s): 3c2ff6b

manual deployment

Browse files

Files changed (12) hide show

.gitignore +166 -0
README.md +153 -14
requirements.txt +26 -0
src/app.py +236 -0
src/config/llm/nvidia-llama-3.1-nemotron-70b-instruct.yaml +4 -0
src/config/llm/openai-gpt-3.5-turbo.yaml +4 -0
src/config/llm/openai-gpt-4o-mini.yaml +4 -0
src/llm/base_llm_provider.py +16 -0
src/llm/enums.py +3 -0
src/llm/llm.py +32 -0
src/llm/nvidia_llm.py +29 -0
src/llm/openai_llm.py +29 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,166 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+# Mac cache file
+.DS_Store

README.md CHANGED Viewed

@@ -1,14 +1,153 @@
----
-title: Neuro Orion V1
-emoji: 🏆
-colorFrom: gray
-colorTo: gray
-sdk: streamlit
-sdk_version: 1.41.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: DL Assignment
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Neuro Orion - NYC Taxi Traffic Time Series Anomaly Detection
+emoji: 🐨
+colorFrom: indigo
+colorTo: yellow
+sdk: streamlit
+sdk_version: "1.41.1"
+app_file: src/app.py
+pinned: true
+---
+[![Sync to Hugging Face hub](https://github.com/gdtan02/NeuroOrion_Time_Series_Anomaly_Detection/actions/workflows/main.yml/badge.svg)](https://github.com/gdtan02/NeuroOrion_Time_Series_Anomaly_Detection/actions/workflows/main.yml)
+# NYC Taxi Traffic - Time Series Anomaly Detection
+## Project Overview
+This project is developed for WID3011 Deep Learning Assignment.
+SDG 11: Sustainable Cities & Communities:
+This problem examines an anomaly detection challenge using the NYC Taxi Traffic dataset, available on Kaggle
+(<https://www.kaggle.com/datasets/julienjta/nyc-taxi-traffic>) and provided by the NYC Taxi and Limousine
+Commission. The dataset presents a univariate time series of total taxi passenger counts from July 2014 to January
+2015, aggregated every 30 minutes. It includes five notable anomalies, occurring during the NYC Marathon,
+Thanksgiving, Christmas, New Year’s Day, and a snowstorm.
+The task involves implementing a complete anomaly detection pipeline: analyzing the NYC Taxi Traffic dataset,
+developing a Long Short-Term Memory (LSTM) model to detect outliers and anomalies.
+**Group Name:**
+Neuro Orion
+**Group Members:**
+1. Poo Wei Chien
+2. Tan Guo Dong
+3. Tan Zhi Jian
+4. Sanjivan A/L Balajawahar
+5. Marvin Chin Yi Kai
+---
+## Acknowledgements
+We acknowledge the contributors to the following resources:
+- All the members of Neuro Orion for their contributions to the project.
+- NYC Taxi Traffic dataset provided by NYC Taxi and Limousine Commission.
+- Open-source tools and frameworks like TensorFlow, PyTorch, and Jupyter Notebook.
+---
+## Installation Guide
+Follow these steps to set up the project locally:
+### 1. Clone the repository to your local machine:
+Run the following command in your terminal:
+```bash
+git clone https://github.com/gdtan02/NeuroOrion_Time_Series_Anomaly_Detection.git
+cd NeuroOrion_Time_Series_Anomaly_Detection
+```
+### 2. Set up a Python Virtual Environment (Optional):
+You can use `venv` or `conda` to create and activate a virtual environment to manage dependencies.
+Using `venv`:
+For Windows users, run the following commands:
+```commandline
+python -m venv venv
+venv\Scripts\activate
+```
+For macOS/Linux users, run the following commands:
+```commandline
+python3 -m venv venv
+source venv/bin/activate
+```
+Using `conda`:
+```commandline
+conda create --name nyc-taxi-env python=3.8 -y
+conda activate nyc-taxi-env
+```
+### 3. Install dependencies:
+Install all the required dependencies listed in `requirements.txt` file using `pip`:
+```commandline
+pip install -r requirements.txt
+```
+### 4. Install Jupyter Notebook (Optional):
+If Jupyter Notebook is not already installed, you can install it using `pip`:
+```commandline
+pip install notebook
+```
+Alternatively, if you are using `conda`, you can install Jupyter Notebook using the following command:
+```commandline
+conda install -c conda-forge notebook
+```
+### 5. Start Jupyter Notebook:
+Launch Jupyter Notebook to execute the project code:
+```commandline
+jupyter notebook
+```
+A browser window should open, displaying the Jupyter Notebook interface.
+If it does not open automatically, copy and paste the link shown in the terminal into your web browser.
+You are now ready to run the project code in the Jupyter Notebook.
+---
+## Development Setup
+### 1: Code Formatting
+We use Black for code formatting. To set up:
+1. Install black and pre-commit:
+```bash
+pip install black pre-commit
+```
+2. Install the pre-commit hooks:
+```bash
+pre-commit install
+```
+3. Run Black manually:
+```bash
+black .
+```
+4. Configure VS Code (optional):
+```json
+{
+    "python.formatting.provider": "black",
+    "editor.formatOnSave": true
+}
+```
+Refer to the [Black documentation](https://black.readthedocs.io/en/stable/) for more information. This setup is based on [this article](https://dev.to/emmo00/how-to-setup-black-and-pre-commit-in-python-for-auto-text-formatting-on-commit-4kka).
+---

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+# Data analytics libraries
+pandas
+matplotlib
+numpy
+seaborn
+statsmodels
+# Machine learning and deep learning libraries
+scikit-learn
+tensorflow
+keras
+torch
+# LLM
+llama-index
+llama-index-llms-openai
+llama-index-llms-nvidia
+llama-index-llms-openai-like
+# Others
+tqdm
+black
+pre-commit
+streamlit
+plotly
+pyod

src/app.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objs as go
+from sklearn.preprocessing import StandardScaler
+from pyod.models.iforest import IForest
+from datetime import datetime, timedelta
+class NYCTaxiAnomalyDetector:
+    """
+    Isolation Forest anomaly detector over a DataFrame that has a "date" column.
+    The frame given to the constructor is copied, so the caller's data is not modified.
+    """
+    def __init__(self, data):
+        """
+        :param data: DataFrame with a "date" column and the value column(s) to analyse
+        """
+        self.data = data.copy()
+        self.scaler = StandardScaler()
+    def filter_by_date_range(self, start_date, end_date):
+        """
+        Filter data by specified date range (both bounds inclusive)
+        :param start_date: Start date of the range
+        :param end_date: End date of the range
+        :return: Filtered DataFrame (an independent copy of the matching rows)
+        """
+        # Ensure date column is datetime
+        if not pd.api.types.is_datetime64_any_dtype(self.data["date"]):
+            self.data["date"] = pd.to_datetime(self.data["date"])
+        in_range = self.data["date"].between(start_date, end_date)
+        # Explicit copy: later processing must never write into a slice of self.data
+        return self.data.loc[in_range].copy()
+    def preprocess_data(self, data, column):
+        """
+        Preprocess data for anomaly detection
+        The input frame is left untouched; coercion happens on a separate Series.
+        :param data: Filtered DataFrame
+        :param column: Column to detect anomalies in
+        :return: Scaled data of shape (n, 1) and the index labels of the rows that were kept
+        """
+        # Non-numeric entries become NaN and are dropped together with missing values
+        clean_data = pd.to_numeric(data[column], errors="coerce").dropna()
+        if clean_data.empty:
+            # The scaler rejects zero-sample input, so short-circuit with an empty (0, 1) array
+            return np.empty((0, 1)), clean_data.index
+        scaled_data = self.scaler.fit_transform(clean_data.to_numpy().reshape(-1, 1))
+        return scaled_data, clean_data.index
+    def detect_anomalies(self, data, column, contamination=0.05):
+        """
+        Detect anomalies using Isolation Forest
+        :param data: Filtered DataFrame
+        :param column: Column to detect anomalies in
+        :param contamination: Expected proportion of outliers
+        :return: DataFrame with "date", "value" and "is_anomaly" columns, indexed by the
+            labels of the rows of ``data`` that had a usable numeric value
+        """
+        scaled_data, original_index = self.preprocess_data(data, column)
+        if len(original_index) == 0:
+            # Nothing to fit on: return an empty result with the usual columns
+            return pd.DataFrame(
+                {
+                    "date": pd.Series(dtype="datetime64[ns]"),
+                    "value": pd.Series(dtype="float64"),
+                    "is_anomaly": pd.Series(dtype="bool"),
+                }
+            )
+        # fit() + labels_ replaces the deprecated fit_predict(); labels_ marks outliers with 1
+        clf = IForest(contamination=contamination, random_state=42)
+        clf.fit(scaled_data)
+        kept_rows = data.loc[original_index]
+        # Report real dates when the frame has them; fall back to the index labels otherwise
+        dates = kept_rows["date"] if "date" in kept_rows.columns else original_index
+        return pd.DataFrame(
+            {
+                "date": dates,
+                "value": pd.to_numeric(kept_rows[column], errors="coerce"),
+                "is_anomaly": clf.labels_ == 1,
+            },
+            index=original_index,
+        )
+class AIContextGenerator:
+    def generate_context(self, anomaly_date):
+        """
+        Build the placeholder context entries shown next to an anomaly
+        :param anomaly_date: Date of the anomaly (must provide a ``date()`` method)
+        :return: List of dicts with "type", "description" and "severity" keys
+        """
+        # Mock contextual insights - replace with actual data sources
+        insight_rows = (
+            ("Weather", f"Weather conditions on {anomaly_date.date()}", "High"),
+            ("Event", f"City events around {anomaly_date.date()}", "Medium"),
+            ("Economic", f"Economic factors on {anomaly_date.date()}", "Low"),
+        )
+        return [
+            {"type": kind, "description": text, "severity": level}
+            for kind, text, level in insight_rows
+        ]
+def load_nyc_taxi_data(seed=42):
+    """
+    Generate a synthetic stand-in for the NYC Taxi dataset
+    One row per day of 2023, drawn from a normal distribution (mean 5000, std 500),
+    with three hand-placed extreme values so the detector has something to find.
+    :param seed: Seed for the random generator. The fixed default keeps the data
+        identical across Streamlit reruns; pass None for fresh random data each call
+    :return: DataFrame with "date" and "daily_traffic" columns
+    """
+    # Synthetic data generation
+    dates = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
+    rng = np.random.default_rng(seed)
+    base_traffic = rng.normal(5000, 500, len(dates))
+    # Introduce some anomalies
+    base_traffic[50] = 10000  # Extreme spike
+    base_traffic[200] = 500  # Extreme drop
+    base_traffic[300] = 12000  # Another spike
+    return pd.DataFrame({"date": dates, "daily_traffic": base_traffic})
+def main():
+    """
+    Streamlit entry point
+    Renders the sidebar controls (date range, sensitivity), runs the detector on the
+    selected range, plots the series with anomalies highlighted and lists mock
+    context for each anomaly. Stops early with a message when the selected range
+    is inverted or contains no data, instead of failing inside the detector.
+    """
+    st.set_page_config(
+        page_title="NYC Taxi Traffic Anomaly Detection", page_icon="🚕", layout="wide"
+    )
+    st.title("🚕 NYC Taxi Traffic Anomaly Detection")
+    # Load Data
+    taxi_data = load_nyc_taxi_data()
+    # Sidebar for Configuration
+    st.sidebar.header("Anomaly Detection Settings")
+    # Date Range Selection
+    st.sidebar.subheader("Date Range")
+    min_date = taxi_data["date"].min().date()
+    max_date = taxi_data["date"].max().date()
+    col1, col2 = st.sidebar.columns(2)
+    with col1:
+        start_date = st.date_input(
+            "Start Date", min_value=min_date, max_value=max_date, value=min_date
+        )
+    with col2:
+        end_date = st.date_input(
+            "End Date", min_value=min_date, max_value=max_date, value=max_date
+        )
+    # Anomaly Sensitivity
+    anomaly_threshold = st.sidebar.slider(
+        "Anomaly Sensitivity",
+        min_value=0.01,
+        max_value=0.1,
+        value=0.05,
+        step=0.01,
+        help="Lower values detect fewer but more extreme anomalies",
+    )
+    # The two pickers are independent, so an inverted range is possible
+    if start_date > end_date:
+        st.error("Start date must not be after end date.")
+        return
+    # Instantiate Detector
+    detector = NYCTaxiAnomalyDetector(taxi_data)
+    # Filter Data by Date Range
+    filtered_data = detector.filter_by_date_range(
+        pd.to_datetime(start_date), pd.to_datetime(end_date)
+    )
+    if filtered_data.empty:
+        st.warning("No data available for the selected date range.")
+        return
+    # Detect Anomalies
+    anomalies = detector.detect_anomalies(
+        filtered_data, "daily_traffic", contamination=anomaly_threshold
+    )
+    # Visualization
+    st.header("Daily Taxi Traffic Trend")
+    fig = px.line(
+        filtered_data,
+        x="date",
+        y="daily_traffic",
+        title=f"NYC Taxi Daily Traffic ({start_date} to {end_date})",
+        labels={"daily_traffic": "Number of Taxi Rides"},
+    )
+    # Highlight Anomalies
+    # Select by index label so rows the detector dropped cannot misalign the mask
+    flagged_index = anomalies.index[anomalies["is_anomaly"].to_numpy(dtype=bool)]
+    anomaly_points = filtered_data.loc[flagged_index]
+    fig.add_trace(
+        go.Scatter(
+            x=anomaly_points["date"],
+            y=anomaly_points["daily_traffic"],
+            mode="markers",
+            name="Anomalies",
+            marker=dict(color="red", size=10, symbol="star"),
+        )
+    )
+    st.plotly_chart(fig, use_container_width=True)
+    # Anomaly Details
+    st.header("Anomaly Insights")
+    if not anomaly_points.empty:
+        context_generator = AIContextGenerator()
+        for _, anomaly in anomaly_points.iterrows():
+            st.subheader(f"Anomaly on {anomaly['date'].date()}")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.metric("Taxi Rides", f"{anomaly['daily_traffic']:.0f}")
+            with col2:
+                contexts = context_generator.generate_context(anomaly["date"])
+                st.write("### Potential Context")
+                for context in contexts:
+                    st.markdown(
+                        f"""
+                    - **{context['type']}**: {context['description']}
+                      (Severity: {context['severity']})
+                    """
+                    )
+    else:
+        st.info("No significant anomalies detected with current settings.")
+if __name__ == "__main__":
+    main()

src/config/llm/nvidia-llama-3.1-nemotron-70b-instruct.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+PROVIDER: nvidia
+BASE_URL: https://integrate.api.nvidia.com/v1
+MODEL: nvidia/llama-3.1-nemotron-70b-instruct
+TEMPERATURE: 0

src/config/llm/openai-gpt-3.5-turbo.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+PROVIDER: openai
+BASE_URL: default
+MODEL: gpt-3.5-turbo
+TEMPERATURE: 0

src/config/llm/openai-gpt-4o-mini.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+PROVIDER: openai
+BASE_URL: default
+MODEL: gpt-4o-mini
+TEMPERATURE: 0

src/llm/base_llm_provider.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""Base class for LLM providers"""
+from abc import ABC, abstractmethod
+from typing import Dict, Optional
+class BaseLLMProvider(ABC):
+    """
+    Common interface for LLM provider wrappers.
+    Deriving from ABC makes the @abstractmethod markers effective: a subclass must
+    override both __init__ and complete before it can be instantiated.
+    """
+    @abstractmethod
+    def __init__(self):
+        """LLM provider initialization"""
+        raise NotImplementedError
+    @abstractmethod
+    def complete(self, prompt: str = "") -> str:
+        """
+        LLM chat completion implementation by each provider
+        :param prompt: Prompt text sent to the model
+        :return: Completion text
+        """
+        raise NotImplementedError

src/llm/enums.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Provider identifiers compared against the PROVIDER config value in src/llm/llm.py
+OPENAI_LLM = "openai"
+NVIDIA_LLM = "nvidia"
+# BASE_URL value for which the provider wrappers build their client without an explicit base URL
+DEFAULT_LLM_API_BASE = "default"

src/llm/llm.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import yaml
+from src.llm.enums import OPENAI_LLM, NVIDIA_LLM
+from src.llm.base_llm_provider import BaseLLMProvider
+from src.llm.openai_llm import OpenAILLM
+from src.llm.nvidia_llm import NvidiaLLM
+def get_llm(config_file_path: str = "config.yaml") -> BaseLLMProvider:
+    """
+    Initiates LLM client from config file
+    :param config_file_path: Path to a YAML file with PROVIDER, MODEL, TEMPERATURE
+        and BASE_URL keys
+    :return: The provider wrapper selected by the PROVIDER key
+    :raises ValueError: If the file does not contain a mapping, or PROVIDER is not
+        one of the supported providers
+    """
+    # load config
+    with open(config_file_path, "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+    # An empty file parses to None; fail with a clear message rather than a TypeError
+    if not isinstance(config, dict):
+        raise ValueError(f"LLM config {config_file_path!r} must contain a YAML mapping")
+    # init & return llm
+    providers = {OPENAI_LLM: OpenAILLM, NVIDIA_LLM: NvidiaLLM}
+    provider = config["PROVIDER"]
+    if provider not in providers:
+        # Name the offending provider (not the model) and list the accepted values
+        raise ValueError(
+            f"Unsupported LLM provider {provider!r}; expected one of {sorted(providers)}"
+        )
+    return providers[provider](
+        model=config["MODEL"],
+        temperature=config["TEMPERATURE"],
+        base_url=config["BASE_URL"],
+    )

src/llm/nvidia_llm.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""NVIDIA LLM Implementation"""
+from llama_index.llms.nvidia import NVIDIA
+from src.llm.base_llm_provider import BaseLLMProvider
+from src.llm.enums import DEFAULT_LLM_API_BASE
+class NvidiaLLM(BaseLLMProvider):
+    def __init__(
+        self,
+        model: str = "nvidia/llama-3.1-nemotron-70b-instruct",
+        temperature: float = 0.0,
+        base_url: str = "https://integrate.api.nvidia.com/v1",
+    ):
+        """
+        Initiate NVIDIA client
+        :param model: Model identifier passed to the client
+        :param temperature: Sampling temperature passed to the client
+        :param base_url: API endpoint; DEFAULT_LLM_API_BASE means "do not pass one"
+        """
+        client_kwargs = {"model": model, "temperature": temperature}
+        # Only forward an endpoint when the caller asked for a specific one
+        if base_url != DEFAULT_LLM_API_BASE:
+            client_kwargs["base_url"] = base_url
+        self._client = NVIDIA(**client_kwargs)
+    def complete(self, prompt: str = "") -> str:
+        """Send the prompt to the client and return the completion as a string"""
+        response = self._client.complete(prompt)
+        return str(response)

src/llm/openai_llm.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""OpenAI LLM Implementation"""
+from llama_index.llms.openai import OpenAI
+from src.llm.base_llm_provider import BaseLLMProvider
+from src.llm.enums import DEFAULT_LLM_API_BASE
+class OpenAILLM(BaseLLMProvider):
+    def __init__(
+        self,
+        model: str = "gpt-4o-mini",
+        temperature: float = 0.0,
+        base_url: str = DEFAULT_LLM_API_BASE,
+    ):
+        """
+        Initiate OpenAI client
+        :param model: Model identifier passed to the client
+        :param temperature: Sampling temperature passed to the client
+        :param base_url: API endpoint; DEFAULT_LLM_API_BASE means "do not pass one"
+        """
+        client_kwargs = {"model": model, "temperature": temperature}
+        if base_url != DEFAULT_LLM_API_BASE:
+            # The LlamaIndex OpenAI client names its endpoint parameter api_base, not base_url
+            client_kwargs["api_base"] = base_url
+        self._client = OpenAI(**client_kwargs)
+    def complete(self, prompt: str = "") -> str:
+        """Send the prompt to the client and return the completion as a string"""
+        return str(self._client.complete(prompt))