Maharshi Gor committed
Commit 3a1af80 · 1 Parent(s): 4f5d1cb

Adds support for caching LLM calls to a SQLite DB and an HF dataset. Refactors the repo creation logic and fixes the unused temperature parameter.

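For reference, a minimal usage sketch of the new cached completion path, assuming the package is importable as src.workflows, the model key below exists in AVAILABLE_MODELS, and HF_TOKEN is set so the cache can sync to the HF dataset (the model name and prompts are illustrative only):

from pydantic import BaseModel, Field

from src.workflows.llms import completion


class Answer(BaseModel):
    answer: str = Field(description="The short answer to the question")


# The first call goes to the provider and is written to the local SQLite cache
# (llm_cache.db); the identical second call is served from that cache. The cache
# is pushed to the configured HF dataset repo at the sync interval.
first = completion("OpenAI/gpt-4", "Be terse.", "Which planet is closest to the sun?", Answer, temperature=0.0)
second = completion("OpenAI/gpt-4", "Be terse.", "Which planet is closest to the sun?", Answer, temperature=0.0)

The diffs below add the LLM_CACHE_* settings, a shared repo-creation helper, and the CacheDB/LLMCache implementation that backs completion().
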
check_repos.py CHANGED
@@ -1,26 +1,25 @@
 from huggingface_hub import HfApi
 
-from src.envs import QUEUE_REPO, RESULTS_REPO, TOKEN
+from src.envs import LLM_CACHE_REPO, QUEUE_REPO, RESULTS_REPO, TOKEN
 
 
-def check_and_create_repos():
+def check_and_create_dataset_repo(repo_id: str):
     api = HfApi(token=TOKEN)
-
-    # Check and create queue repo
     try:
-        api.repo_info(repo_id=QUEUE_REPO, repo_type="dataset")
-        print(f"Queue repository {QUEUE_REPO} exists")
+        api.repo_info(repo_id=repo_id, repo_type="dataset")
+        print(f"{repo_id} exists")
     except Exception:
-        print(f"Creating queue repository {QUEUE_REPO}")
-        api.create_repo(repo_id=QUEUE_REPO, repo_type="dataset", exist_ok=True, private=False)
+        print(f"Creating {repo_id}")
+        api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True, private=True)
 
-    # Check and create results repo
-    try:
-        api.repo_info(repo_id=RESULTS_REPO, repo_type="dataset")
-        print(f"Results repository {RESULTS_REPO} exists")
-    except Exception:
-        print(f"Creating results repository {RESULTS_REPO}")
-        api.create_repo(repo_id=RESULTS_REPO, repo_type="dataset", exist_ok=True, private=False)
+
+def check_and_create_repos():
+    print("1. QUEUE Repository")
+    check_and_create_dataset_repo(QUEUE_REPO)
+    print("2. RESULTS Repository")
+    check_and_create_dataset_repo(RESULTS_REPO)
+    print("3. LLM Cache Repository")
+    check_and_create_dataset_repo(LLM_CACHE_REPO)
 
 
 if __name__ == "__main__":

src/envs.py CHANGED
@@ -15,6 +15,7 @@ OWNER = "umdclip"
 REPO_ID = f"{OWNER}/quizbowl-submission"
 QUEUE_REPO = f"{OWNER}/advcal-requests"
 RESULTS_REPO = f"{OWNER}/model-results"  # TODO: change to advcal-results after testing is done
+LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
 
 EXAMPLES_PATH = "examples"
 
@@ -29,12 +30,14 @@ PLAYGROUND_DATASET_NAMES = {
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
+LLM_CACHE_PATH = os.path.join(CACHE_PATH, "llm-cache")
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 
+LLM_CACHE_REFRESH_INTERVAL = 600  # seconds (10 minutes)
 SERVER_REFRESH_INTERVAL = 86400  # seconds (one day)
 LEADERBOARD_REFRESH_INTERVAL = 600  # seconds (10 minutes)
 

src/workflows/executors.py CHANGED
@@ -221,6 +221,7 @@ def execute_model_step(
         system=model_step.system_prompt,
         prompt=step_result,
         response_format=ModelResponse,
+        temperature=model_step.temperature,
         logprobs=logprobs,
     )
 

src/workflows/llmcache.py ADDED
@@ -0,0 +1,479 @@
+import hashlib
+import json
+import os
+import sqlite3
+import threading
+import time
+from pathlib import Path
+from typing import Any, Optional
+
+from datasets import Dataset, Features, Value
+from huggingface_hub import snapshot_download
+from loguru import logger
+
+
+def load_dataset_from_hf(repo_id, local_dir):
+    snapshot_download(
+        repo_id=repo_id,
+        local_dir=local_dir,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=os.environ["HF_TOKEN"],
+    )
+
+
+class CacheDB:
+    """Handles database operations for storing and retrieving cache entries."""
+
+    def __init__(self, db_path: Path):
+        """Initialize database connection.
+
+        Args:
+            db_path: Path to SQLite database file
+        """
+        self.db_path = db_path
+        self.lock = threading.Lock()
+
+        # Initialize the database
+        try:
+            self.initialize_db()
+        except Exception as e:
+            logger.exception(f"Failed to initialize database: {e}")
+            logger.warning(f"Please provide a different filepath or remove the file at {self.db_path}")
+            raise
+
+    def initialize_db(self) -> None:
+        """Initialize SQLite database with the required table."""
+        # Check if database file already exists
+        if self.db_path.exists():
+            self._verify_existing_db()
+        else:
+            self._create_new_db()
+
+    def _verify_existing_db(self) -> None:
+        """Verify and repair an existing database if needed."""
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+                self._ensure_table_exists(cursor)
+                self._verify_schema(cursor)
+                self._ensure_index_exists(cursor)
+                conn.commit()
+            logger.info(f"Using existing SQLite database at {self.db_path}")
+        except Exception as e:
+            logger.exception(f"Database corruption detected: {e}")
+            raise ValueError(f"Corrupted database at {self.db_path}: {str(e)}")
+
+    def _create_new_db(self) -> None:
+        """Create a new database with the required schema."""
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+                self._create_table(cursor)
+                self._ensure_index_exists(cursor)
+                conn.commit()
+            logger.info(f"Initialized new SQLite database at {self.db_path}")
+        except Exception as e:
+            logger.exception(f"Failed to initialize SQLite database: {e}")
+            raise
+
+    def _ensure_table_exists(self, cursor) -> None:
+        """Check if the llm_cache table exists and create it if not."""
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='llm_cache'")
+        if not cursor.fetchone():
+            self._create_table(cursor)
+            logger.info("Created missing llm_cache table")
+
+    def _create_table(self, cursor) -> None:
+        """Create the llm_cache table with the required schema."""
+        cursor.execute("""
+            CREATE TABLE IF NOT EXISTS llm_cache (
+                key TEXT PRIMARY KEY,
+                request_json TEXT,
+                response_json TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """)
+
+    def _verify_schema(self, cursor) -> None:
+        """Verify that the table schema has all required columns."""
+        cursor.execute("PRAGMA table_info(llm_cache)")
+        columns = {row[1] for row in cursor.fetchall()}
+        required_columns = {"key", "request_json", "response_json", "created_at"}
+
+        if not required_columns.issubset(columns):
+            missing = required_columns - columns
+            raise ValueError(f"Database schema is corrupted. Missing columns: {missing}")
+
+    def _ensure_index_exists(self, cursor) -> None:
+        """Create an index on the key column for faster lookups."""
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_llm_cache_key ON llm_cache (key)")
+
+    def get(self, key: str) -> Optional[dict[str, Any]]:
+        """Get cached entry by key.
+
+        Args:
+            key: Cache key to look up
+
+        Returns:
+            Dict containing the request and response or None if not found
+        """
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                conn.row_factory = sqlite3.Row
+                cursor = conn.cursor()
+                cursor.execute("SELECT request_json, response_json FROM llm_cache WHERE key = ?", (key,))
+                result = cursor.fetchone()
+
+                if result:
+                    logger.debug(f"Cache hit for key: {key}. Response: {result['response_json']}")
+                    return {
+                        "request": result["request_json"],
+                        "response": result["response_json"],
+                    }
+
+                logger.debug(f"Cache miss for key: {key}")
+                return None
+        except Exception as e:
+            logger.error(f"Error retrieving from cache: {e}")
+            return None
+
+    def set(self, key: str, request_json: str, response_json: str) -> bool:
+        """Set entry in cache.
+
+        Args:
+            key: Cache key
+            request_json: JSON string of request parameters
+            response_json: JSON string of response
+
+        Returns:
+            True if successful, False otherwise
+        """
+        with self.lock:
+            try:
+                with sqlite3.connect(self.db_path) as conn:
+                    cursor = conn.cursor()
+                    cursor.execute(
+                        "INSERT OR REPLACE INTO llm_cache (key, request_json, response_json) VALUES (?, ?, ?)",
+                        (key, request_json, response_json),
+                    )
+                    conn.commit()
+                    logger.debug(f"Saved response to cache with key: {key}, response: {response_json}")
+                    return True
+            except Exception as e:
+                logger.error(f"Failed to save to SQLite cache: {e}")
+                return False
+
+    def get_all_entries(self) -> dict[str, dict[str, Any]]:
+        """Get all cache entries from the database."""
+        cache = {}
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                conn.row_factory = sqlite3.Row
+                cursor = conn.cursor()
+                cursor.execute("SELECT key, request_json, response_json FROM llm_cache ORDER BY created_at")
+
+                for row in cursor.fetchall():
+                    cache[row["key"]] = {
+                        "request": row["request_json"],
+                        "response": row["response_json"],
+                    }
+
+            logger.debug(f"Retrieved {len(cache)} entries from cache database")
+            return cache
+        except Exception as e:
+            logger.error(f"Error retrieving all cache entries: {e}")
+            return {}
+
+    def clear(self) -> bool:
+        """Clear all cache entries.
+
+        Returns:
+            True if successful, False otherwise
+        """
+        with self.lock:
+            try:
+                with sqlite3.connect(self.db_path) as conn:
+                    cursor = conn.cursor()
+                    cursor.execute("DELETE FROM llm_cache")
+                    conn.commit()
+                    logger.info("Cache cleared")
+                    return True
+            except Exception as e:
+                logger.error(f"Failed to clear cache: {e}")
+                return False
+
+    def get_existing_keys(self) -> set:
+        """Get all existing keys in the database.
+
+        Returns:
+            Set of keys
+        """
+        existing_keys = set()
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+                cursor.execute("SELECT key FROM llm_cache")
+                for row in cursor.fetchall():
+                    existing_keys.add(row[0])
+            return existing_keys
+        except Exception as e:
+            logger.error(f"Error retrieving existing keys: {e}")
+            return set()
+
+    def bulk_insert(self, items: list, update: bool = False) -> int:
+        """Insert multiple items into the cache.
+
+        Args:
+            items: List of (key, request_json, response_json) tuples
+            update: Whether to update existing entries
+
+        Returns:
+            Number of items inserted
+        """
+        count = 0
+        UPDATE_OR_IGNORE = "INSERT OR REPLACE" if update else "INSERT OR IGNORE"
+        with self.lock:
+            try:
+                with sqlite3.connect(self.db_path) as conn:
+                    cursor = conn.cursor()
+                    cursor.executemany(
+                        f"{UPDATE_OR_IGNORE} INTO llm_cache (key, request_json, response_json) VALUES (?, ?, ?)",
+                        items,
+                    )
+                    count = cursor.rowcount
+                    conn.commit()
+                    return count
+            except Exception as e:
+                logger.error(f"Error during bulk insert: {e}")
+                return 0
+
+
+class LLMCache:
+    def __init__(
+        self, cache_dir: str = ".", hf_repo: str | None = None, cache_sync_interval: int = 3600, reset: bool = False
+    ):
+        self.cache_dir = Path(cache_dir)
+        self.db_path = self.cache_dir / "llm_cache.db"
+        self.hf_repo_id = hf_repo
+        self.cache_sync_interval = cache_sync_interval
+        self.last_sync_time = time.time()
+
+        # Create cache directory if it doesn't exist
+        self.cache_dir.mkdir(exist_ok=True, parents=True)
+
+        # Initialize CacheDB
+        self.db = CacheDB(self.db_path)
+        if reset:
+            self.db.clear()
+
+        # Try to load from HF dataset if available
+        try:
+            self._load_cache_from_hf()
+        except Exception as e:
+            logger.warning(f"Failed to load cache from HF dataset: {e}")
+
+    def response_format_to_dict(self, response_format: Any) -> dict[str, Any]:
+        """Convert a response format to a dict."""
+        # If it's a Pydantic model, use its schema
+        if hasattr(response_format, "model_json_schema"):
+            response_format = response_format.model_json_schema()
+
+        # If it's a Pydantic model, use its dump
+        elif hasattr(response_format, "model_dump"):
+            response_format = response_format.model_dump()
+
+        if not isinstance(response_format, dict):
+            response_format = {"value": str(response_format)}
+
+        return response_format
+
+    def _generate_key(
+        self, model: str, system: str, prompt: str, response_format: Any, temperature: float | None = None
+    ) -> str:
+        """Generate a unique key for caching based on inputs."""
+        response_format_dict = self.response_format_to_dict(response_format)
+        response_format_str = json.dumps(response_format_dict, sort_keys=True)
+        # Include temperature in the key
+        key_content = f"{model}:{system}:{prompt}:{response_format_str}"
+        if temperature is not None:
+            key_content += f":{temperature:.2f}"
+        return hashlib.md5(key_content.encode()).hexdigest()
+
+    def _create_request_json(
+        self, model: str, system: str, prompt: str, response_format: Any, temperature: float | None
+    ) -> str:
+        """Create JSON string from request parameters."""
+        logger.info(f"Creating request JSON with temperature: {temperature}")
+        request_data = {
+            "model": model,
+            "system": system,
+            "prompt": prompt,
+            "response_format": self.response_format_to_dict(response_format),
+            "temperature": temperature,
+        }
+        return json.dumps(request_data)
+
+    def _check_request_match(
+        self,
+        cached_request: dict[str, Any],
+        model: str,
+        system: str,
+        prompt: str,
+        response_format: Any,
+        temperature: float | None,
+    ) -> bool:
+        """Check if the cached request matches the new request."""
+        # Check each field and log any mismatches
+        if cached_request["model"] != model:
+            logger.debug(f"Cache mismatch: model - cached: {cached_request['model']}, new: {model}")
+            return False
+        if cached_request["system"] != system:
+            logger.debug(f"Cache mismatch: system - cached: {cached_request['system']}, new: {system}")
+            return False
+        if cached_request["prompt"] != prompt:
+            logger.debug(f"Cache mismatch: prompt - cached: {cached_request['prompt']}, new: {prompt}")
+            return False
+        response_format_dict = self.response_format_to_dict(response_format)
+        if cached_request["response_format"] != response_format_dict:
+            logger.debug(
+                f"Cache mismatch: response_format - cached: {cached_request['response_format']}, new: {response_format_dict}"
+            )
+            return False
+        if cached_request["temperature"] != temperature:
+            logger.debug(f"Cache mismatch: temperature - cached: {cached_request['temperature']}, new: {temperature}")
+            return False
+
+        return True
+
+    def get(
+        self, model: str, system: str, prompt: str, response_format: dict[str, Any], temperature: float | None = None
+    ) -> Optional[dict[str, Any]]:
+        """Get cached response if it exists."""
+        key = self._generate_key(model, system, prompt, response_format, temperature)
+        result = self.db.get(key)
+
+        if not result:
+            return None
+        request_dict = json.loads(result["request"])
+        if not self._check_request_match(request_dict, model, system, prompt, response_format, temperature):
+            logger.warning(f"Cached request does not match new request for key: {key}")
+            return None
+
+        return json.loads(result["response"])
+
+    def set(
+        self,
+        model: str,
+        system: str,
+        prompt: str,
+        response_format: dict[str, Any],
+        temperature: float | None,
+        response: dict[str, Any],
+    ) -> None:
+        """Set response in cache and sync if needed."""
+        key = self._generate_key(model, system, prompt, response_format, temperature)
+        request_json = self._create_request_json(model, system, prompt, response_format, temperature)
+        response_json = json.dumps(response)
+
+        success = self.db.set(key, request_json, response_json)
+
+        # Check if we should sync to HF
+        if success and self.hf_repo_id and (time.time() - self.last_sync_time > self.cache_sync_interval):
+            try:
+                self.sync_to_hf()
+                self.last_sync_time = time.time()
+            except Exception as e:
+                logger.error(f"Failed to sync cache to HF dataset: {e}")
+
+    def _load_cache_from_hf(self) -> None:
+        """Load cache from HF dataset if it exists and merge with local cache."""
+        if not self.hf_repo_id:
+            return
+
+        try:
+            # Check for new commits before loading the dataset
+            dataset = load_dataset_from_hf(self.hf_repo_id, self.cache_dir / "hf_cache")
+            if dataset:
+                existing_keys = self.db.get_existing_keys()
+
+                # Prepare batch items for insertion
+                items_to_insert = []
+                for item in dataset:
+                    key = item["key"]
+                    # Only update if not in local cache to prioritize local changes
+                    if key in existing_keys:
+                        continue
+                    # Create request JSON
+                    request_data = {
+                        "model": item["model"],
+                        "system": item["system"],
+                        "prompt": item["prompt"],
+                        "temperature": item["temperature"],
+                        "response_format": None,  # We can't fully reconstruct this
+                    }
+
+                    items_to_insert.append(
+                        (
+                            key,
+                            json.dumps(request_data),
+                            item["response"],  # This is already a JSON string
+                        )
+                    )
+                    logger.info(
+                        f"Inserting item: {key} with temperature: {item['temperature']} and response: {item['response']}"
+                    )
+
+                # Bulk insert new items
+                if items_to_insert:
+                    inserted_count = self.db.bulk_insert(items_to_insert)
+                    logger.info(f"Merged {inserted_count} items from HF dataset into SQLite cache")
+                else:
+                    logger.info("No new items to merge from HF dataset")
+        except Exception as e:
+            logger.warning(f"Could not load cache from HF dataset: {e}")
+
+    def get_all_entries(self) -> dict[str, dict[str, Any]]:
+        """Get all cache entries from the database."""
+        cache = self.db.get_all_entries()
+        entries = {}
+        for key, entry in cache.items():
+            request = json.loads(entry["request"])
+            response = json.loads(entry["response"])
+            entries[key] = {"request": request, "response": response}
+        return entries
+
+    def sync_to_hf(self) -> None:
+        """Sync cache to HF dataset."""
+        if not self.hf_repo_id:
+            return
+
+        # Get all entries from the database
+        cache = self.db.get_all_entries()
+
+        # Convert cache to dataset format
+        entries = []
+        for key, entry in cache.items():
+            request = json.loads(entry["request"])
+            response_str = entry["response"]
+            entries.append(
+                {
+                    "key": key,
+                    "model": request["model"],
+                    "system": request["system"],
+                    "prompt": request["prompt"],
+                    "response_format": request["response_format"],
+                    "temperature": request["temperature"],
+                    "response": response_str,
+                }
+            )
+
+        # Create and push dataset
+        dataset = Dataset.from_list(entries)
+        dataset.push_to_hub(self.hf_repo_id, private=True)
+        logger.info(f"Synced {len(cache)} cached items to HF dataset {self.hf_repo_id}")
+
+    def clear(self) -> None:
+        """Clear all cache entries."""
+        self.db.clear()

src/workflows/llms.py CHANGED
@@ -1,12 +1,14 @@
 # %%
+
 import json
 import os
-from typing import Optional
+from typing import Any, Optional
 
 import cohere
 import numpy as np
 from langchain_anthropic import ChatAnthropic
 from langchain_cohere import ChatCohere
+from langchain_core.language_models import BaseChatModel
 from langchain_openai import ChatOpenAI
 from loguru import logger
 from openai import OpenAI
@@ -14,6 +16,10 @@ from pydantic import BaseModel, Field
 from rich import print as rprint
 
 from .configs import AVAILABLE_MODELS
+from .llmcache import LLMCache
+
+# Initialize global cache
+llm_cache = LLMCache(cache_dir=".", hf_repo="umdclip/advcal-llm-cache")
 
 
 def _openai_is_json_mode_supported(model_name: str) -> bool:
@@ -30,7 +36,7 @@ class LLMOutput(BaseModel):
     logprob: Optional[float] = Field(None, description="The log probability of the response")
 
 
-def _get_langchain_chat_output(llm, system: str, prompt: str) -> str:
+def _get_langchain_chat_output(llm: BaseChatModel, system: str, prompt: str) -> str:
     output = llm.invoke([("system", system), ("human", prompt)])
     ai_message = output["raw"]
     content = {"content": ai_message.content, "tool_calls": ai_message.tool_calls}
@@ -38,7 +44,9 @@ def _get_langchain_chat_output(llm, system: str, prompt: str) -> str:
     return {"content": content_str, "output": output["parsed"].model_dump()}
 
 
-def _cohere_completion(model: str, system: str, prompt: str, response_model, logprobs: bool = True) -> str:
+def _cohere_completion(
+    model: str, system: str, prompt: str, response_model, temperature: float | None = None, logprobs: bool = True
+) -> str:
     messages = [
         {"role": "system", "content": system},
         {"role": "user", "content": prompt},
@@ -49,6 +57,7 @@ def _cohere_completion(model: str, system: str, prompt: str, response_model, log
         messages=messages,
        response_format={"type": "json_schema", "json_schema": response_model.model_json_schema()},
         logprobs=logprobs,
+        temperature=temperature,
     )
     output = {}
     output["content"] = response.message.content[0].text
@@ -59,12 +68,16 @@ def _cohere_completion(model: str, system: str, prompt: str, response_model, log
     return output
 
 
-def _openai_langchain_completion(model: str, system: str, prompt: str, response_model, logprobs: bool = True) -> str:
-    llm = ChatOpenAI(model=model).with_structured_output(response_model, include_raw=True)
+def _openai_langchain_completion(
+    model: str, system: str, prompt: str, response_model, temperature: float | None = None
+) -> str:
+    llm = ChatOpenAI(model=model, temperature=temperature).with_structured_output(response_model, include_raw=True)
     return _get_langchain_chat_output(llm, system, prompt)
 
 
-def _openai_completion(model: str, system: str, prompt: str, response_model, logprobs: bool = True) -> str:
+def _openai_completion(
+    model: str, system: str, prompt: str, response_model, temperature: float | None = None, logprobs: bool = True
+) -> str:
     messages = [
         {"role": "system", "content": system},
         {"role": "user", "content": prompt},
@@ -75,6 +88,7 @@ def _openai_completion(model: str, system: str, prompt: str, response_model, log
         messages=messages,
         response_format=response_model,
         logprobs=logprobs,
+        temperature=temperature,
    )
     output = {}
     output["content"] = response.choices[0].message.content
@@ -85,14 +99,18 @@ def _openai_completion(model: str, system: str, prompt: str, response_model, log
     return output
 
 
-def _anthropic_completion(model: str, system: str, prompt: str, response_model) -> str:
-    llm = ChatAnthropic(model=model).with_structured_output(response_model, include_raw=True)
+def _anthropic_completion(
+    model: str, system: str, prompt: str, response_model, temperature: float | None = None
+) -> str:
+    llm = ChatAnthropic(model=model, temperature=temperature).with_structured_output(response_model, include_raw=True)
     return _get_langchain_chat_output(llm, system, prompt)
 
 
-def completion(model: str, system: str, prompt: str, response_format, logprobs: bool = False) -> str:
+def _llm_completion(
+    model: str, system: str, prompt: str, response_format, temperature: float | None = None, logprobs: bool = False
+) -> dict[str, Any]:
     """
-    Generate a completion from an LLM provider with structured output.
+    Generate a completion from an LLM provider with structured output without caching.
 
     Args:
         model (str): Provider and model name in format "provider/model" (e.g. "OpenAI/gpt-4")
@@ -116,20 +134,69 @@ def completion(model: str, system: str, prompt: str, response_format, logprobs:
     model_name = AVAILABLE_MODELS[model]["model"]
     provider = model.split("/")[0]
     if provider == "Cohere":
-        return _cohere_completion(model_name, system, prompt, response_format, logprobs)
+        return _cohere_completion(model_name, system, prompt, response_format, temperature, logprobs)
     elif provider == "OpenAI":
         if _openai_is_json_mode_supported(model_name):
-            return _openai_completion(model_name, system, prompt, response_format, logprobs)
+            return _openai_completion(model_name, system, prompt, response_format, temperature, logprobs)
+        elif logprobs:
+            raise ValueError(f"{model} does not support logprobs feature.")
        else:
-            return _openai_langchain_completion(model_name, system, prompt, response_format, logprobs)
+            return _openai_langchain_completion(model_name, system, prompt, response_format, temperature)
     elif provider == "Anthropic":
         if logprobs:
-            raise ValueError("Anthropic does not support logprobs")
-        return _anthropic_completion(model_name, system, prompt, response_format)
+            raise ValueError("Anthropic models do not support logprobs")
+        return _anthropic_completion(model_name, system, prompt, response_format, temperature)
     else:
         raise ValueError(f"Provider {provider} not supported")
 
 
+def completion(
+    model: str, system: str, prompt: str, response_format, temperature: float | None = None, logprobs: bool = False
+) -> dict[str, Any]:
+    """
+    Generate a completion from an LLM provider with structured output with caching.
+
+    Args:
+        model (str): Provider and model name in format "provider/model" (e.g. "OpenAI/gpt-4")
+        system (str): System prompt/instructions for the model
+        prompt (str): User prompt/input
+        response_format: Pydantic model defining the expected response structure
+        logprobs (bool, optional): Whether to return log probabilities. Defaults to False.
+            Note: Not supported by Anthropic models.
+
+    Returns:
+        dict: Contains:
+            - output: The structured response matching response_format
+            - logprob: (optional) Sum of log probabilities if logprobs=True
+            - prob: (optional) Exponential of logprob if logprobs=True
+
+    Raises:
+        ValueError: If logprobs=True with Anthropic models
+    """
+    # Check cache first
+    cached_response = llm_cache.get(model, system, prompt, response_format, temperature)
+    if cached_response is not None:
+        logger.info(f"Cache hit for model {model}")
+        return cached_response
+
+    logger.info(f"Cache miss for model {model}, calling API")
+
+    # Continue with the original implementation for cache miss
+    response = _llm_completion(model, system, prompt, response_format, temperature, logprobs)
+
+    # Update cache with the new response
+    llm_cache.set(
+        model,
+        system,
+        prompt,
+        response_format,
+        temperature,
+        response,
+    )
+
+    return response
+
+
 # %%
 if __name__ == "__main__":
     from tqdm import tqdm
@@ -142,12 +209,52 @@ if __name__ == "__main__":
         answer: str = Field(description="The short answer to the question")
         explanation: str = Field(description="5 words terse best explanation of the answer.")
 
-    models = AVAILABLE_MODELS.keys()
+    models = list(AVAILABLE_MODELS.keys())[:1]  # Just use the first model for testing
     system = "You are an accurate and concise explainer of scientific concepts."
     prompt = "Which planet is closest to the sun in the Milky Way galaxy? Answer directly, no explanation needed."
 
+    llm_cache = LLMCache(cache_dir=".", hf_repo="umdclip/advcal-llm-cache", reset=True)
+
+    # First call - should be a cache miss
+    logger.info("First call - should be a cache miss")
+    for model in tqdm(models):
+        response = completion(model, system, prompt, ExplainedAnswer, logprobs=False)
+        rprint(response)
+
+    # Second call - should be a cache hit
+    logger.info("Second call - should be a cache hit")
     for model in tqdm(models):
         response = completion(model, system, prompt, ExplainedAnswer, logprobs=False)
         rprint(response)
 
+    # Slightly different prompt - should be a cache miss
+    logger.info("Different prompt - should be a cache miss")
+    prompt2 = "Which planet is closest to the sun? Answer directly."
+    for model in tqdm(models):
+        response = completion(model, system, prompt2, ExplainedAnswer, logprobs=False)
+        rprint(response)
+
+    # Get cache entries count from SQLite
+    try:
+        cache_entries = llm_cache.get_all_entries()
+        logger.info(f"Cache now has {len(cache_entries)} items")
+    except Exception as e:
+        logger.error(f"Failed to get cache entries: {e}")
+
+    # Test adding entry with temperature parameter
+    logger.info("Testing with temperature parameter")
+    response = completion(models[0], system, "What is Mars?", ExplainedAnswer, temperature=0.7, logprobs=False)
+    rprint(response)
+
+    # Demonstrate forced sync to HF if repo is configured
+    if llm_cache.hf_repo_id:
+        logger.info("Forcing sync to HF dataset")
+        try:
+            llm_cache.sync_to_hf()
+            logger.info("Successfully synced to HF dataset")
+        except Exception as e:
+            logger.exception(f"Failed to sync to HF: {e}")
+    else:
+        logger.info("HF repo not configured, skipping sync test")
+
 # %%