Spaces:
Running
Running
Jatin Mehra
committed on
Commit
·
5b02b7b
1
Parent(s):
bb32843
Add pytest configuration and restructure test files; move tests to core directory and update imports
Browse files- .vscode/settings.json +11 -0
- LICENSE +21 -0
- pyproject.toml +59 -0
- pytest.ini +7 -0
- requirements.txt +0 -182
- setup_env.py +105 -0
- {core → src}/__init__.py +0 -0
- src/crawlgpt/__init__.py +0 -0
- {core → src/crawlgpt/core}/DatabaseHandler.py +0 -0
- {core → src/crawlgpt/core}/LLMBasedCrawler.py +6 -6
- {core → src/crawlgpt/core}/SummaryGenerator.py +6 -0
- src/crawlgpt/core/__init__.py +0 -0
- {ui → src/crawlgpt/ui}/chat_app.py +6 -6
- {ui → src/crawlgpt/ui}/chat_ui.py +5 -5
- {utils → src/crawlgpt/utils}/__init__.py +0 -0
- {utils → src/crawlgpt/utils}/content_validator.py +0 -0
- {utils → src/crawlgpt/utils}/data_manager.py +0 -0
- {utils → src/crawlgpt/utils}/helper_functions.py +0 -0
- {utils → src/crawlgpt/utils}/monitoring.py +0 -0
- {utils → src/crawlgpt/utils}/progress.py +0 -0
- tests/{test_database_handler.py → test_core/test_database_handler.py} +2 -2
- tests/{test_integration.py → test_core/test_integration.py} +2 -2
- tests/{test_llm_based_crawler.py → test_core/test_llm_based_crawler.py} +1 -1
- tests/{test_summary_generator.py → test_core/test_summary_generator.py} +1 -1
.vscode/settings.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"python.testing.unittestArgs": [
|
3 |
+
"-v",
|
4 |
+
"-s",
|
5 |
+
"./tests",
|
6 |
+
"-p",
|
7 |
+
"test*.py"
|
8 |
+
],
|
9 |
+
"python.testing.pytestEnabled": false,
|
10 |
+
"python.testing.unittestEnabled": true
|
11 |
+
}
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Jatin Mehra
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
pyproject.toml
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools>=45", "wheel"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "crawlgpt"
|
7 |
+
version = "0.1.0"
|
8 |
+
description = "A web content crawler with GPT-powered summarization and chat capabilities"
|
9 |
+
readme = "README.md"
|
10 |
+
requires-python = ">=3.8"
|
11 |
+
authors = [
|
12 |
+
{name = "Jatin Mehra", email = "[email protected]"}
|
13 |
+
]
|
14 |
+
classifiers = [
|
15 |
+
"Development Status :: 3 - Alpha",
|
16 |
+
"Intended Audience :: Developers",
|
17 |
+
"License :: OSI Approved :: MIT License",
|
18 |
+
"Operating System :: OS Independent",
|
19 |
+
"Programming Language :: Python :: 3",
|
20 |
+
"Programming Language :: Python :: 3.8",
|
21 |
+
"Programming Language :: Python :: 3.9",
|
22 |
+
"Programming Language :: Python :: 3.10",
|
23 |
+
"Programming Language :: Python :: 3.11",
|
24 |
+
"Programming Language :: Python :: 3.12",
|
25 |
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
26 |
+
"Topic :: Text Processing :: General"
|
27 |
+
]
|
28 |
+
dependencies = [
|
29 |
+
"streamlit==1.41.1",
|
30 |
+
"groq==0.15.0",
|
31 |
+
"sentence-transformers==3.3.1",
|
32 |
+
"faiss-cpu==1.9.0.post1",
|
33 |
+
"crawl4ai==0.4.247",
|
34 |
+
"python-dotenv==1.0.1",
|
35 |
+
"pydantic==2.10.5",
|
36 |
+
"aiohttp==3.11.11",
|
37 |
+
"beautifulsoup4==4.12.3",
|
38 |
+
"numpy==2.2.0",
|
39 |
+
"tqdm==4.67.1",
|
40 |
+
"playwright>=1.41.0",
|
41 |
+
"asyncio>=3.4.3"
|
42 |
+
]
|
43 |
+
|
44 |
+
[project.optional-dependencies]
|
45 |
+
dev = [
|
46 |
+
"pytest==8.3.4",
|
47 |
+
"pytest-mockito==0.0.4",
|
48 |
+
"black==24.2.0", # Updated version
|
49 |
+
"isort==5.13.0",
|
50 |
+
"flake8==7.0.0"
|
51 |
+
]
|
52 |
+
|
53 |
+
[project.urls]
|
54 |
+
"Bug Tracker" = "https://github.com/Jatin-Mehra119/crawlgpt/issues"
|
55 |
+
"Documentation" = "https://github.com/Jatin-Mehra119/crawlgpt/wiki"
|
56 |
+
"Source Code" = "https://github.com/Jatin-Mehra119/crawlgpt"
|
57 |
+
|
58 |
+
[project.scripts]
|
59 |
+
crawlgpt = "crawlgpt.ui.chat_app:main"
|
pytest.ini
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[pytest]
|
2 |
+
testpaths = tests
|
3 |
+
pythonpath = src
|
4 |
+
addopts = -v --tb=short
|
5 |
+
python_files = test_*.py
|
6 |
+
python_classes = *Tests TestIntegration
|
7 |
+
python_functions = test_* async_test_*
|
requirements.txt
DELETED
@@ -1,182 +0,0 @@
|
|
1 |
-
aiofiles==24.1.0
|
2 |
-
aiohappyeyeballs==2.4.4
|
3 |
-
aiohttp==3.11.11
|
4 |
-
aiosignal==1.3.2
|
5 |
-
aiosqlite==0.20.0
|
6 |
-
altair==5.5.0
|
7 |
-
annotated-types==0.7.0
|
8 |
-
anyio==4.7.0
|
9 |
-
argon2-cffi==23.1.0
|
10 |
-
argon2-cffi-bindings==21.2.0
|
11 |
-
arrow==1.3.0
|
12 |
-
asttokens==3.0.0
|
13 |
-
async-lru==2.0.4
|
14 |
-
attrs==24.2.0
|
15 |
-
babel==2.16.0
|
16 |
-
beautifulsoup4==4.12.3
|
17 |
-
bleach==6.2.0
|
18 |
-
blinker==1.9.0
|
19 |
-
cachetools==5.5.1
|
20 |
-
certifi==2024.8.30
|
21 |
-
cffi==1.17.1
|
22 |
-
charset-normalizer==3.4.0
|
23 |
-
click==8.1.8
|
24 |
-
colorama==0.4.6
|
25 |
-
comm==0.2.2
|
26 |
-
contourpy==1.3.1
|
27 |
-
Crawl4AI==0.4.247
|
28 |
-
cryptography==44.0.0
|
29 |
-
cycler==0.12.1
|
30 |
-
debugpy==1.8.9
|
31 |
-
decorator==5.1.1
|
32 |
-
defusedxml==0.7.1
|
33 |
-
distro==1.9.0
|
34 |
-
executing==2.1.0
|
35 |
-
faiss-cpu==1.9.0.post1
|
36 |
-
fake-http-header==0.3.5
|
37 |
-
fastjsonschema==2.21.1
|
38 |
-
filelock==3.13.1
|
39 |
-
fonttools==4.55.3
|
40 |
-
fqdn==1.5.1
|
41 |
-
frozenlist==1.5.0
|
42 |
-
fsspec==2024.2.0
|
43 |
-
gitdb==4.0.11
|
44 |
-
GitPython==3.1.43
|
45 |
-
greenlet==3.1.1
|
46 |
-
groq==0.15.0
|
47 |
-
h11==0.14.0
|
48 |
-
httpcore==1.0.7
|
49 |
-
httpx==0.27.2
|
50 |
-
huggingface-hub==0.27.1
|
51 |
-
idna==3.10
|
52 |
-
importlib_metadata==8.6.1
|
53 |
-
iniconfig==2.0.0
|
54 |
-
ipykernel==6.29.5
|
55 |
-
ipython==8.30.0
|
56 |
-
isoduration==20.11.0
|
57 |
-
jedi==0.19.2
|
58 |
-
Jinja2==3.1.4
|
59 |
-
jiter==0.8.2
|
60 |
-
joblib==1.4.2
|
61 |
-
json5==0.10.0
|
62 |
-
jsonpointer==3.0.0
|
63 |
-
jsonschema==4.23.0
|
64 |
-
jsonschema-specifications==2024.10.1
|
65 |
-
jupyter-events==0.10.0
|
66 |
-
jupyter-lsp==2.2.5
|
67 |
-
jupyter-server-mathjax==0.2.6
|
68 |
-
jupyter_client==8.6.3
|
69 |
-
jupyter_core==5.7.2
|
70 |
-
jupyter_server==2.14.2
|
71 |
-
jupyter_server_terminals==0.5.3
|
72 |
-
jupyterlab==4.3.3
|
73 |
-
jupyterlab_git==0.50.2
|
74 |
-
jupyterlab_pygments==0.3.0
|
75 |
-
jupyterlab_server==2.27.3
|
76 |
-
kiwisolver==1.4.7
|
77 |
-
litellm==1.59.5
|
78 |
-
lxml==5.3.0
|
79 |
-
markdown-it-py==3.0.0
|
80 |
-
MarkupSafe==3.0.2
|
81 |
-
matplotlib==3.9.3
|
82 |
-
matplotlib-inline==0.1.7
|
83 |
-
mdurl==0.1.2
|
84 |
-
mistune==3.0.2
|
85 |
-
mockito==1.5.4
|
86 |
-
mpmath==1.3.0
|
87 |
-
multidict==6.1.0
|
88 |
-
narwhals==1.23.0
|
89 |
-
nbclient==0.10.1
|
90 |
-
nbconvert==7.16.4
|
91 |
-
nbdime==4.0.2
|
92 |
-
nbformat==5.10.4
|
93 |
-
nest-asyncio==1.6.0
|
94 |
-
networkx==3.2.1
|
95 |
-
nltk==3.9.1
|
96 |
-
notebook_shim==0.2.4
|
97 |
-
numpy==2.2.0
|
98 |
-
openai==1.60.0
|
99 |
-
overrides==7.7.0
|
100 |
-
packaging==24.2
|
101 |
-
pandas==2.2.3
|
102 |
-
pandocfilters==1.5.1
|
103 |
-
parso==0.8.4
|
104 |
-
pexpect==4.9.0
|
105 |
-
pillow==10.4.0
|
106 |
-
platformdirs==4.3.6
|
107 |
-
playwright==1.49.1
|
108 |
-
plotly==5.24.1
|
109 |
-
pluggy==1.5.0
|
110 |
-
prometheus_client==0.21.1
|
111 |
-
prompt_toolkit==3.0.48
|
112 |
-
propcache==0.2.1
|
113 |
-
protobuf==5.29.3
|
114 |
-
psutil==6.1.1
|
115 |
-
ptyprocess==0.7.0
|
116 |
-
pure_eval==0.2.3
|
117 |
-
pyarrow==19.0.0
|
118 |
-
pycparser==2.22
|
119 |
-
pydantic==2.10.5
|
120 |
-
pydantic_core==2.27.2
|
121 |
-
pydeck==0.9.1
|
122 |
-
pyee==12.0.0
|
123 |
-
Pygments==2.18.0
|
124 |
-
pyOpenSSL==25.0.0
|
125 |
-
pyparsing==3.2.0
|
126 |
-
pytest==8.3.4
|
127 |
-
pytest-mockito==0.0.4
|
128 |
-
python-dateutil==2.9.0.post0
|
129 |
-
python-dotenv==1.0.1
|
130 |
-
python-json-logger==3.2.0
|
131 |
-
pytz==2024.2
|
132 |
-
PyYAML==6.0.2
|
133 |
-
pyzmq==26.2.0
|
134 |
-
rank-bm25==0.2.2
|
135 |
-
referencing==0.35.1
|
136 |
-
regex==2024.11.6
|
137 |
-
requests==2.32.3
|
138 |
-
rfc3339-validator==0.1.4
|
139 |
-
rfc3986-validator==0.1.1
|
140 |
-
rich==13.9.4
|
141 |
-
rpds-py==0.22.3
|
142 |
-
safetensors==0.5.2
|
143 |
-
scikit-learn==1.6.0
|
144 |
-
scipy==1.14.1
|
145 |
-
seaborn==0.13.2
|
146 |
-
Send2Trash==1.8.3
|
147 |
-
sentence-transformers==3.3.1
|
148 |
-
setuptools==75.6.0
|
149 |
-
six==1.17.0
|
150 |
-
smmap==5.0.1
|
151 |
-
sniffio==1.3.1
|
152 |
-
snowballstemmer==2.2.0
|
153 |
-
soupsieve==2.6
|
154 |
-
stack-data==0.6.3
|
155 |
-
streamlit==1.41.1
|
156 |
-
sympy==1.13.1
|
157 |
-
tenacity==9.0.0
|
158 |
-
terminado==0.18.1
|
159 |
-
tf-playwright-stealth==1.1.0
|
160 |
-
threadpoolctl==3.5.0
|
161 |
-
tiktoken==0.8.0
|
162 |
-
tinycss2==1.4.0
|
163 |
-
tokenizers==0.21.0
|
164 |
-
toml==0.10.2
|
165 |
-
torch==2.5.1+cpu
|
166 |
-
tornado==6.4.2
|
167 |
-
tqdm==4.67.1
|
168 |
-
traitlets==5.14.3
|
169 |
-
transformers==4.48.1
|
170 |
-
types-python-dateutil==2.9.0.20241206
|
171 |
-
typing_extensions==4.12.2
|
172 |
-
tzdata==2024.2
|
173 |
-
uri-template==1.3.0
|
174 |
-
urllib3==2.2.3
|
175 |
-
watchdog==6.0.0
|
176 |
-
wcwidth==0.2.13
|
177 |
-
webcolors==24.11.1
|
178 |
-
webencodings==0.5.1
|
179 |
-
websocket-client==1.8.0
|
180 |
-
xxhash==3.5.0
|
181 |
-
yarl==1.18.3
|
182 |
-
zipp==3.21.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup_env.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import sys
|
4 |
+
from pathlib import Path
|
5 |
+
import venv
|
6 |
+
import platform
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logging.basicConfig(level=logging.INFO)
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
def create_virtual_environment(venv_path):
|
13 |
+
"""Create a virtual environment."""
|
14 |
+
try:
|
15 |
+
subprocess.run([sys.executable, "-m", "venv", str(venv_path)], check=True)
|
16 |
+
logger.info(f"Created virtual environment at {venv_path}")
|
17 |
+
|
18 |
+
# Get pip path
|
19 |
+
pip_path = venv_path / "bin" / "pip"
|
20 |
+
|
21 |
+
# Upgrade pip
|
22 |
+
logger.info("Upgrading pip...")
|
23 |
+
subprocess.run([str(pip_path), "install", "--upgrade", "pip"], check=True)
|
24 |
+
except subprocess.CalledProcessError as e:
|
25 |
+
logger.error(f"Failed to create virtual environment: {e}")
|
26 |
+
raise
|
27 |
+
|
28 |
+
def install_dependencies(venv_path):
|
29 |
+
"""Install project dependencies."""
|
30 |
+
try:
|
31 |
+
pip_path = venv_path / "bin" / "pip"
|
32 |
+
|
33 |
+
# Install core dependencies first
|
34 |
+
logger.info("Installing core dependencies...")
|
35 |
+
subprocess.run([
|
36 |
+
str(pip_path), "install", "-e", "."
|
37 |
+
], check=True)
|
38 |
+
|
39 |
+
# Install dev dependencies separately
|
40 |
+
logger.info("Installing development dependencies...")
|
41 |
+
subprocess.run([
|
42 |
+
str(pip_path), "install",
|
43 |
+
"pytest==8.3.4",
|
44 |
+
"pytest-mockito==0.0.4",
|
45 |
+
"black==24.2.0",
|
46 |
+
"isort==5.13.0",
|
47 |
+
"flake8==7.0.0"
|
48 |
+
], check=True)
|
49 |
+
|
50 |
+
# Install playwright
|
51 |
+
logger.info("Installing and setting up Playwright...")
|
52 |
+
subprocess.run([
|
53 |
+
str(pip_path), "install", "playwright"
|
54 |
+
], check=True)
|
55 |
+
playwright_path = venv_path / "bin" / "playwright"
|
56 |
+
subprocess.run([str(playwright_path), "install"], check=True)
|
57 |
+
|
58 |
+
except subprocess.CalledProcessError as e:
|
59 |
+
logger.error(f"Failed to install dependencies: {e}")
|
60 |
+
logger.error(f"Exit code: {e.returncode}")
|
61 |
+
logger.error(f"Output: {e.output if hasattr(e, 'output') else 'No output'}")
|
62 |
+
raise
|
63 |
+
|
64 |
+
def create_env_file():
|
65 |
+
"""Create .env file if it doesn't exist."""
|
66 |
+
env_file = Path(".env")
|
67 |
+
if not env_file.exists():
|
68 |
+
with open(env_file, "w") as f:
|
69 |
+
f.write("GROQ_API_KEY=\n")
|
70 |
+
logger.info("Created .env file")
|
71 |
+
|
72 |
+
def main():
|
73 |
+
"""Main setup function."""
|
74 |
+
try:
|
75 |
+
# Check Python version
|
76 |
+
if sys.version_info < (3, 8):
|
77 |
+
raise RuntimeError("Python 3.8 or higher is required")
|
78 |
+
|
79 |
+
# Get project root directory
|
80 |
+
project_root = Path(__file__).parent
|
81 |
+
venv_path = project_root / ".venv"
|
82 |
+
|
83 |
+
# Create virtual environment if it doesn't exist
|
84 |
+
if not venv_path.exists():
|
85 |
+
create_virtual_environment(venv_path)
|
86 |
+
|
87 |
+
# Install dependencies
|
88 |
+
install_dependencies(venv_path)
|
89 |
+
|
90 |
+
# Create .env file
|
91 |
+
create_env_file()
|
92 |
+
|
93 |
+
logger.info("\nSetup completed successfully!")
|
94 |
+
logger.info("\nNext steps:")
|
95 |
+
logger.info("1. Update the .env file with your API keys")
|
96 |
+
logger.info("2. Activate the virtual environment:")
|
97 |
+
logger.info(" source .venv/bin/activate")
|
98 |
+
logger.info("3. Run the application: python -m streamlit run src/crawlgpt/ui/chat_app.py")
|
99 |
+
|
100 |
+
except Exception as e:
|
101 |
+
logger.error(f"Setup failed: {e}")
|
102 |
+
sys.exit(1)
|
103 |
+
|
104 |
+
if __name__ == "__main__":
|
105 |
+
main()
|
{core → src}/__init__.py
RENAMED
File without changes
|
src/crawlgpt/__init__.py
ADDED
File without changes
|
{core → src/crawlgpt/core}/DatabaseHandler.py
RENAMED
File without changes
|
{core → src/crawlgpt/core}/LLMBasedCrawler.py
RENAMED
@@ -11,12 +11,12 @@ import logging
|
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
# Internal imports
|
14 |
-
from core.DatabaseHandler import VectorDatabase
|
15 |
-
from core.SummaryGenerator import SummaryGenerator
|
16 |
-
from utils.monitoring import MetricsCollector, RateLimiter, Metrics
|
17 |
-
from utils.progress import ProgressTracker
|
18 |
-
from utils.data_manager import DataManager
|
19 |
-
from utils.content_validator import ContentValidator
|
20 |
|
21 |
|
22 |
# Configure logging
|
|
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
# Internal imports
|
14 |
+
from src.crawlgpt.core.DatabaseHandler import VectorDatabase
|
15 |
+
from src.crawlgpt.core.SummaryGenerator import SummaryGenerator
|
16 |
+
from src.crawlgpt.utils.monitoring import MetricsCollector, RateLimiter, Metrics
|
17 |
+
from src.crawlgpt.utils.progress import ProgressTracker
|
18 |
+
from src.crawlgpt.utils.data_manager import DataManager
|
19 |
+
from src.crawlgpt.utils.content_validator import ContentValidator
|
20 |
|
21 |
|
22 |
# Configure logging
|
{core → src/crawlgpt/core}/SummaryGenerator.py
RENAMED
@@ -55,6 +55,12 @@ class SummaryGenerator:
|
|
55 |
Raises:
|
56 |
Exception: If API call fails or text processing errors occur
|
57 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
messages = [
|
59 |
{"role": "system", "content": "Generate a concise summary for the following text."},
|
60 |
{"role": "user", "content": text},
|
|
|
55 |
Raises:
|
56 |
Exception: If API call fails or text processing errors occur
|
57 |
"""
|
58 |
+
|
59 |
+
# Handle empty input text
|
60 |
+
if not text or not text.strip():
|
61 |
+
return ""
|
62 |
+
|
63 |
+
# Generate a summary using the Groq API
|
64 |
messages = [
|
65 |
{"role": "system", "content": "Generate a concise summary for the following text."},
|
66 |
{"role": "user", "content": text},
|
src/crawlgpt/core/__init__.py
ADDED
File without changes
|
{ui → src/crawlgpt/ui}/chat_app.py
RENAMED
@@ -3,11 +3,11 @@ import streamlit as st
|
|
3 |
import asyncio
|
4 |
import time
|
5 |
from datetime import datetime
|
6 |
-
from core.LLMBasedCrawler import Model
|
7 |
-
from utils.monitoring import MetricsCollector, Metrics
|
8 |
-
from utils.progress import ProgressTracker
|
9 |
-
from utils.data_manager import DataManager
|
10 |
-
from utils.content_validator import ContentValidator
|
11 |
import json
|
12 |
|
13 |
# Streamlit app title and description
|
@@ -46,7 +46,7 @@ with st.sidebar:
|
|
46 |
st.session_state.use_summary = st.checkbox("Use Summarized RAG", value=False, help="Don't use summaization when dealing with Coding Documentation.")
|
47 |
st.subheader("π€ Normal LLM Settings")
|
48 |
temperature = st.slider("Temperature", 0.0, 1.0, 0.7, help="Controls the randomness of the generated text. Lower values are more deterministic.")
|
49 |
-
max_tokens = st.slider("Max Tokens", 500, 10000,
|
50 |
model_id = st.radio("Model ID", ['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'mixtral-8x7b-32768'], help="Choose the model to use for generating responses.")
|
51 |
|
52 |
# Export/Import Data
|
|
|
3 |
import asyncio
|
4 |
import time
|
5 |
from datetime import datetime
|
6 |
+
from src.crawlgpt.core.LLMBasedCrawler import Model
|
7 |
+
from src.crawlgpt.utils.monitoring import MetricsCollector, Metrics
|
8 |
+
from src.crawlgpt.utils.progress import ProgressTracker
|
9 |
+
from src.crawlgpt.utils.data_manager import DataManager
|
10 |
+
from src.crawlgpt.utils.content_validator import ContentValidator
|
11 |
import json
|
12 |
|
13 |
# Streamlit app title and description
|
|
|
46 |
st.session_state.use_summary = st.checkbox("Use Summarized RAG", value=False, help="Don't use summaization when dealing with Coding Documentation.")
|
47 |
st.subheader("π€ Normal LLM Settings")
|
48 |
temperature = st.slider("Temperature", 0.0, 1.0, 0.7, help="Controls the randomness of the generated text. Lower values are more deterministic.")
|
49 |
+
max_tokens = st.slider("Max Tokens", 500, 10000, 5000, help="Maximum number of tokens to generate in the response.")
|
50 |
model_id = st.radio("Model ID", ['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'mixtral-8x7b-32768'], help="Choose the model to use for generating responses.")
|
51 |
|
52 |
# Export/Import Data
|
{ui → src/crawlgpt/ui}/chat_ui.py
RENAMED
@@ -5,11 +5,11 @@ import streamlit as st
|
|
5 |
import asyncio
|
6 |
import time
|
7 |
from datetime import datetime
|
8 |
-
from core.LLMBasedCrawler import Model
|
9 |
-
from utils.monitoring import MetricsCollector, Metrics
|
10 |
-
from utils.progress import ProgressTracker
|
11 |
-
from utils.data_manager import DataManager
|
12 |
-
from utils.content_validator import ContentValidator
|
13 |
import json
|
14 |
|
15 |
# Streamlit app title and description
|
|
|
5 |
import asyncio
|
6 |
import time
|
7 |
from datetime import datetime
|
8 |
+
from src.crawlgpt.core.LLMBasedCrawler import Model
|
9 |
+
from src.crawlgpt.utils.monitoring import MetricsCollector, Metrics
|
10 |
+
from src.crawlgpt.utils.progress import ProgressTracker
|
11 |
+
from src.crawlgpt.utils.data_manager import DataManager
|
12 |
+
from src.crawlgpt.utils.content_validator import ContentValidator
|
13 |
import json
|
14 |
|
15 |
# Streamlit app title and description
|
{utils → src/crawlgpt/utils}/__init__.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/content_validator.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/data_manager.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/helper_functions.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/monitoring.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/progress.py
RENAMED
File without changes
|
tests/{test_database_handler.py → test_core/test_database_handler.py}
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
import unittest
|
2 |
import asyncio
|
3 |
from unittest.mock import AsyncMock, MagicMock
|
4 |
-
from core.LLMBasedCrawler import Model
|
5 |
-
from core.DatabaseHandler import VectorDatabase
|
6 |
|
7 |
|
8 |
class TestIntegration(unittest.TestCase):
|
|
|
1 |
import unittest
|
2 |
import asyncio
|
3 |
from unittest.mock import AsyncMock, MagicMock
|
4 |
+
from src.crawlgpt.core.LLMBasedCrawler import Model
|
5 |
+
from src.crawlgpt.core.DatabaseHandler import VectorDatabase
|
6 |
|
7 |
|
8 |
class TestIntegration(unittest.TestCase):
|
tests/{test_integration.py → test_core/test_integration.py}
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
-
from core.LLMBasedCrawler import Model
|
4 |
-
from core.DatabaseHandler import VectorDatabase
|
5 |
|
6 |
|
7 |
class TestIntegration(unittest.IsolatedAsyncioTestCase): # Use IsolatedAsyncioTestCase for async tests
|
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
+
from crawlgpt.core.LLMBasedCrawler import Model
|
4 |
+
from crawlgpt.core.DatabaseHandler import VectorDatabase
|
5 |
|
6 |
|
7 |
class TestIntegration(unittest.IsolatedAsyncioTestCase): # Use IsolatedAsyncioTestCase for async tests
|
tests/{test_llm_based_crawler.py → test_core/test_llm_based_crawler.py}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
-
from core.LLMBasedCrawler import Model
|
4 |
import asyncio
|
5 |
|
6 |
|
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
+
from crawlgpt.core.LLMBasedCrawler import Model
|
4 |
import asyncio
|
5 |
|
6 |
|
tests/{test_summary_generator.py → test_core/test_summary_generator.py}
RENAMED
@@ -1,5 +1,5 @@
|
|
1 |
import unittest
|
2 |
-
from core.SummaryGenerator import SummaryGenerator
|
3 |
|
4 |
|
5 |
class TestSummaryGenerator(unittest.TestCase):
|
|
|
1 |
import unittest
|
2 |
+
from crawlgpt.core.SummaryGenerator import SummaryGenerator
|
3 |
|
4 |
|
5 |
class TestSummaryGenerator(unittest.TestCase):
|