Spaces:
Running
Running
Jatin Mehra
committed on
Commit
·
5b02b7b
1
Parent(s):
bb32843
Add pytest configuration and restructure test files; move tests to core directory and update imports
Browse files- .vscode/settings.json +11 -0
- LICENSE +21 -0
- pyproject.toml +59 -0
- pytest.ini +7 -0
- requirements.txt +0 -182
- setup_env.py +105 -0
- {core → src}/__init__.py +0 -0
- src/crawlgpt/__init__.py +0 -0
- {core → src/crawlgpt/core}/DatabaseHandler.py +0 -0
- {core → src/crawlgpt/core}/LLMBasedCrawler.py +6 -6
- {core → src/crawlgpt/core}/SummaryGenerator.py +6 -0
- src/crawlgpt/core/__init__.py +0 -0
- {ui → src/crawlgpt/ui}/chat_app.py +6 -6
- {ui → src/crawlgpt/ui}/chat_ui.py +5 -5
- {utils → src/crawlgpt/utils}/__init__.py +0 -0
- {utils → src/crawlgpt/utils}/content_validator.py +0 -0
- {utils → src/crawlgpt/utils}/data_manager.py +0 -0
- {utils → src/crawlgpt/utils}/helper_functions.py +0 -0
- {utils → src/crawlgpt/utils}/monitoring.py +0 -0
- {utils → src/crawlgpt/utils}/progress.py +0 -0
- tests/{test_database_handler.py → test_core/test_database_handler.py} +2 -2
- tests/{test_integration.py → test_core/test_integration.py} +2 -2
- tests/{test_llm_based_crawler.py → test_core/test_llm_based_crawler.py} +1 -1
- tests/{test_summary_generator.py → test_core/test_summary_generator.py} +1 -1
.vscode/settings.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"python.testing.unittestArgs": [
|
3 |
+
"-v",
|
4 |
+
"-s",
|
5 |
+
"./tests",
|
6 |
+
"-p",
|
7 |
+
"test*.py"
|
8 |
+
],
|
9 |
+
"python.testing.pytestEnabled": false,
|
10 |
+
"python.testing.unittestEnabled": true
|
11 |
+
}
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Jatin Mehra
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
pyproject.toml
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools>=45", "wheel"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "crawlgpt"
|
7 |
+
version = "0.1.0"
|
8 |
+
description = "A web content crawler with GPT-powered summarization and chat capabilities"
|
9 |
+
readme = "README.md"
|
10 |
+
requires-python = ">=3.8"
|
11 |
+
authors = [
|
12 |
+
{name = "Jatin Mehra", email = "[email protected]"}
|
13 |
+
]
|
14 |
+
classifiers = [
|
15 |
+
"Development Status :: 3 - Alpha",
|
16 |
+
"Intended Audience :: Developers",
|
17 |
+
"License :: OSI Approved :: MIT License",
|
18 |
+
"Operating System :: OS Independent",
|
19 |
+
"Programming Language :: Python :: 3",
|
20 |
+
"Programming Language :: Python :: 3.8",
|
21 |
+
"Programming Language :: Python :: 3.9",
|
22 |
+
"Programming Language :: Python :: 3.10",
|
23 |
+
"Programming Language :: Python :: 3.11",
|
24 |
+
"Programming Language :: Python :: 3.12",
|
25 |
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
26 |
+
"Topic :: Text Processing :: General"
|
27 |
+
]
|
28 |
+
dependencies = [
|
29 |
+
"streamlit==1.41.1",
|
30 |
+
"groq==0.15.0",
|
31 |
+
"sentence-transformers==3.3.1",
|
32 |
+
"faiss-cpu==1.9.0.post1",
|
33 |
+
"crawl4ai==0.4.247",
|
34 |
+
"python-dotenv==1.0.1",
|
35 |
+
"pydantic==2.10.5",
|
36 |
+
"aiohttp==3.11.11",
|
37 |
+
"beautifulsoup4==4.12.3",
|
38 |
+
"numpy==2.2.0",
|
39 |
+
"tqdm==4.67.1",
|
40 |
+
"playwright>=1.41.0",
|
41 |
+
"asyncio>=3.4.3"
|
42 |
+
]
|
43 |
+
|
44 |
+
[project.optional-dependencies]
|
45 |
+
dev = [
|
46 |
+
"pytest==8.3.4",
|
47 |
+
"pytest-mockito==0.0.4",
|
48 |
+
"black==24.2.0", # Updated version
|
49 |
+
"isort==5.13.0",
|
50 |
+
"flake8==7.0.0"
|
51 |
+
]
|
52 |
+
|
53 |
+
[project.urls]
|
54 |
+
"Bug Tracker" = "https://github.com/Jatin-Mehra119/crawlgpt/issues"
|
55 |
+
"Documentation" = "https://github.com/Jatin-Mehra119/crawlgpt/wiki"
|
56 |
+
"Source Code" = "https://github.com/Jatin-Mehra119/crawlgpt"
|
57 |
+
|
58 |
+
[project.scripts]
|
59 |
+
crawlgpt = "crawlgpt.ui.chat_app:main"
|
pytest.ini
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[pytest]
|
2 |
+
testpaths = tests
|
3 |
+
pythonpath = src
|
4 |
+
addopts = -v --tb=short
|
5 |
+
python_files = test_*.py
|
6 |
+
python_classes = *Tests TestIntegration
|
7 |
+
python_functions = test_* async_test_*
|
requirements.txt
DELETED
@@ -1,182 +0,0 @@
|
|
1 |
-
aiofiles==24.1.0
|
2 |
-
aiohappyeyeballs==2.4.4
|
3 |
-
aiohttp==3.11.11
|
4 |
-
aiosignal==1.3.2
|
5 |
-
aiosqlite==0.20.0
|
6 |
-
altair==5.5.0
|
7 |
-
annotated-types==0.7.0
|
8 |
-
anyio==4.7.0
|
9 |
-
argon2-cffi==23.1.0
|
10 |
-
argon2-cffi-bindings==21.2.0
|
11 |
-
arrow==1.3.0
|
12 |
-
asttokens==3.0.0
|
13 |
-
async-lru==2.0.4
|
14 |
-
attrs==24.2.0
|
15 |
-
babel==2.16.0
|
16 |
-
beautifulsoup4==4.12.3
|
17 |
-
bleach==6.2.0
|
18 |
-
blinker==1.9.0
|
19 |
-
cachetools==5.5.1
|
20 |
-
certifi==2024.8.30
|
21 |
-
cffi==1.17.1
|
22 |
-
charset-normalizer==3.4.0
|
23 |
-
click==8.1.8
|
24 |
-
colorama==0.4.6
|
25 |
-
comm==0.2.2
|
26 |
-
contourpy==1.3.1
|
27 |
-
Crawl4AI==0.4.247
|
28 |
-
cryptography==44.0.0
|
29 |
-
cycler==0.12.1
|
30 |
-
debugpy==1.8.9
|
31 |
-
decorator==5.1.1
|
32 |
-
defusedxml==0.7.1
|
33 |
-
distro==1.9.0
|
34 |
-
executing==2.1.0
|
35 |
-
faiss-cpu==1.9.0.post1
|
36 |
-
fake-http-header==0.3.5
|
37 |
-
fastjsonschema==2.21.1
|
38 |
-
filelock==3.13.1
|
39 |
-
fonttools==4.55.3
|
40 |
-
fqdn==1.5.1
|
41 |
-
frozenlist==1.5.0
|
42 |
-
fsspec==2024.2.0
|
43 |
-
gitdb==4.0.11
|
44 |
-
GitPython==3.1.43
|
45 |
-
greenlet==3.1.1
|
46 |
-
groq==0.15.0
|
47 |
-
h11==0.14.0
|
48 |
-
httpcore==1.0.7
|
49 |
-
httpx==0.27.2
|
50 |
-
huggingface-hub==0.27.1
|
51 |
-
idna==3.10
|
52 |
-
importlib_metadata==8.6.1
|
53 |
-
iniconfig==2.0.0
|
54 |
-
ipykernel==6.29.5
|
55 |
-
ipython==8.30.0
|
56 |
-
isoduration==20.11.0
|
57 |
-
jedi==0.19.2
|
58 |
-
Jinja2==3.1.4
|
59 |
-
jiter==0.8.2
|
60 |
-
joblib==1.4.2
|
61 |
-
json5==0.10.0
|
62 |
-
jsonpointer==3.0.0
|
63 |
-
jsonschema==4.23.0
|
64 |
-
jsonschema-specifications==2024.10.1
|
65 |
-
jupyter-events==0.10.0
|
66 |
-
jupyter-lsp==2.2.5
|
67 |
-
jupyter-server-mathjax==0.2.6
|
68 |
-
jupyter_client==8.6.3
|
69 |
-
jupyter_core==5.7.2
|
70 |
-
jupyter_server==2.14.2
|
71 |
-
jupyter_server_terminals==0.5.3
|
72 |
-
jupyterlab==4.3.3
|
73 |
-
jupyterlab_git==0.50.2
|
74 |
-
jupyterlab_pygments==0.3.0
|
75 |
-
jupyterlab_server==2.27.3
|
76 |
-
kiwisolver==1.4.7
|
77 |
-
litellm==1.59.5
|
78 |
-
lxml==5.3.0
|
79 |
-
markdown-it-py==3.0.0
|
80 |
-
MarkupSafe==3.0.2
|
81 |
-
matplotlib==3.9.3
|
82 |
-
matplotlib-inline==0.1.7
|
83 |
-
mdurl==0.1.2
|
84 |
-
mistune==3.0.2
|
85 |
-
mockito==1.5.4
|
86 |
-
mpmath==1.3.0
|
87 |
-
multidict==6.1.0
|
88 |
-
narwhals==1.23.0
|
89 |
-
nbclient==0.10.1
|
90 |
-
nbconvert==7.16.4
|
91 |
-
nbdime==4.0.2
|
92 |
-
nbformat==5.10.4
|
93 |
-
nest-asyncio==1.6.0
|
94 |
-
networkx==3.2.1
|
95 |
-
nltk==3.9.1
|
96 |
-
notebook_shim==0.2.4
|
97 |
-
numpy==2.2.0
|
98 |
-
openai==1.60.0
|
99 |
-
overrides==7.7.0
|
100 |
-
packaging==24.2
|
101 |
-
pandas==2.2.3
|
102 |
-
pandocfilters==1.5.1
|
103 |
-
parso==0.8.4
|
104 |
-
pexpect==4.9.0
|
105 |
-
pillow==10.4.0
|
106 |
-
platformdirs==4.3.6
|
107 |
-
playwright==1.49.1
|
108 |
-
plotly==5.24.1
|
109 |
-
pluggy==1.5.0
|
110 |
-
prometheus_client==0.21.1
|
111 |
-
prompt_toolkit==3.0.48
|
112 |
-
propcache==0.2.1
|
113 |
-
protobuf==5.29.3
|
114 |
-
psutil==6.1.1
|
115 |
-
ptyprocess==0.7.0
|
116 |
-
pure_eval==0.2.3
|
117 |
-
pyarrow==19.0.0
|
118 |
-
pycparser==2.22
|
119 |
-
pydantic==2.10.5
|
120 |
-
pydantic_core==2.27.2
|
121 |
-
pydeck==0.9.1
|
122 |
-
pyee==12.0.0
|
123 |
-
Pygments==2.18.0
|
124 |
-
pyOpenSSL==25.0.0
|
125 |
-
pyparsing==3.2.0
|
126 |
-
pytest==8.3.4
|
127 |
-
pytest-mockito==0.0.4
|
128 |
-
python-dateutil==2.9.0.post0
|
129 |
-
python-dotenv==1.0.1
|
130 |
-
python-json-logger==3.2.0
|
131 |
-
pytz==2024.2
|
132 |
-
PyYAML==6.0.2
|
133 |
-
pyzmq==26.2.0
|
134 |
-
rank-bm25==0.2.2
|
135 |
-
referencing==0.35.1
|
136 |
-
regex==2024.11.6
|
137 |
-
requests==2.32.3
|
138 |
-
rfc3339-validator==0.1.4
|
139 |
-
rfc3986-validator==0.1.1
|
140 |
-
rich==13.9.4
|
141 |
-
rpds-py==0.22.3
|
142 |
-
safetensors==0.5.2
|
143 |
-
scikit-learn==1.6.0
|
144 |
-
scipy==1.14.1
|
145 |
-
seaborn==0.13.2
|
146 |
-
Send2Trash==1.8.3
|
147 |
-
sentence-transformers==3.3.1
|
148 |
-
setuptools==75.6.0
|
149 |
-
six==1.17.0
|
150 |
-
smmap==5.0.1
|
151 |
-
sniffio==1.3.1
|
152 |
-
snowballstemmer==2.2.0
|
153 |
-
soupsieve==2.6
|
154 |
-
stack-data==0.6.3
|
155 |
-
streamlit==1.41.1
|
156 |
-
sympy==1.13.1
|
157 |
-
tenacity==9.0.0
|
158 |
-
terminado==0.18.1
|
159 |
-
tf-playwright-stealth==1.1.0
|
160 |
-
threadpoolctl==3.5.0
|
161 |
-
tiktoken==0.8.0
|
162 |
-
tinycss2==1.4.0
|
163 |
-
tokenizers==0.21.0
|
164 |
-
toml==0.10.2
|
165 |
-
torch==2.5.1+cpu
|
166 |
-
tornado==6.4.2
|
167 |
-
tqdm==4.67.1
|
168 |
-
traitlets==5.14.3
|
169 |
-
transformers==4.48.1
|
170 |
-
types-python-dateutil==2.9.0.20241206
|
171 |
-
typing_extensions==4.12.2
|
172 |
-
tzdata==2024.2
|
173 |
-
uri-template==1.3.0
|
174 |
-
urllib3==2.2.3
|
175 |
-
watchdog==6.0.0
|
176 |
-
wcwidth==0.2.13
|
177 |
-
webcolors==24.11.1
|
178 |
-
webencodings==0.5.1
|
179 |
-
websocket-client==1.8.0
|
180 |
-
xxhash==3.5.0
|
181 |
-
yarl==1.18.3
|
182 |
-
zipp==3.21.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup_env.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import sys
|
4 |
+
from pathlib import Path
|
5 |
+
import venv
|
6 |
+
import platform
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logging.basicConfig(level=logging.INFO)
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
def create_virtual_environment(venv_path):
|
13 |
+
"""Create a virtual environment."""
|
14 |
+
try:
|
15 |
+
subprocess.run([sys.executable, "-m", "venv", str(venv_path)], check=True)
|
16 |
+
logger.info(f"Created virtual environment at {venv_path}")
|
17 |
+
|
18 |
+
# Get pip path
|
19 |
+
pip_path = venv_path / "bin" / "pip"
|
20 |
+
|
21 |
+
# Upgrade pip
|
22 |
+
logger.info("Upgrading pip...")
|
23 |
+
subprocess.run([str(pip_path), "install", "--upgrade", "pip"], check=True)
|
24 |
+
except subprocess.CalledProcessError as e:
|
25 |
+
logger.error(f"Failed to create virtual environment: {e}")
|
26 |
+
raise
|
27 |
+
|
28 |
+
def install_dependencies(venv_path):
|
29 |
+
"""Install project dependencies."""
|
30 |
+
try:
|
31 |
+
pip_path = venv_path / "bin" / "pip"
|
32 |
+
|
33 |
+
# Install core dependencies first
|
34 |
+
logger.info("Installing core dependencies...")
|
35 |
+
subprocess.run([
|
36 |
+
str(pip_path), "install", "-e", "."
|
37 |
+
], check=True)
|
38 |
+
|
39 |
+
# Install dev dependencies separately
|
40 |
+
logger.info("Installing development dependencies...")
|
41 |
+
subprocess.run([
|
42 |
+
str(pip_path), "install",
|
43 |
+
"pytest==8.3.4",
|
44 |
+
"pytest-mockito==0.0.4",
|
45 |
+
"black==24.2.0",
|
46 |
+
"isort==5.13.0",
|
47 |
+
"flake8==7.0.0"
|
48 |
+
], check=True)
|
49 |
+
|
50 |
+
# Install playwright
|
51 |
+
logger.info("Installing and setting up Playwright...")
|
52 |
+
subprocess.run([
|
53 |
+
str(pip_path), "install", "playwright"
|
54 |
+
], check=True)
|
55 |
+
playwright_path = venv_path / "bin" / "playwright"
|
56 |
+
subprocess.run([str(playwright_path), "install"], check=True)
|
57 |
+
|
58 |
+
except subprocess.CalledProcessError as e:
|
59 |
+
logger.error(f"Failed to install dependencies: {e}")
|
60 |
+
logger.error(f"Exit code: {e.returncode}")
|
61 |
+
logger.error(f"Output: {e.output if hasattr(e, 'output') else 'No output'}")
|
62 |
+
raise
|
63 |
+
|
64 |
+
def create_env_file():
|
65 |
+
"""Create .env file if it doesn't exist."""
|
66 |
+
env_file = Path(".env")
|
67 |
+
if not env_file.exists():
|
68 |
+
with open(env_file, "w") as f:
|
69 |
+
f.write("GROQ_API_KEY=\n")
|
70 |
+
logger.info("Created .env file")
|
71 |
+
|
72 |
+
def main():
|
73 |
+
"""Main setup function."""
|
74 |
+
try:
|
75 |
+
# Check Python version
|
76 |
+
if sys.version_info < (3, 8):
|
77 |
+
raise RuntimeError("Python 3.8 or higher is required")
|
78 |
+
|
79 |
+
# Get project root directory
|
80 |
+
project_root = Path(__file__).parent
|
81 |
+
venv_path = project_root / ".venv"
|
82 |
+
|
83 |
+
# Create virtual environment if it doesn't exist
|
84 |
+
if not venv_path.exists():
|
85 |
+
create_virtual_environment(venv_path)
|
86 |
+
|
87 |
+
# Install dependencies
|
88 |
+
install_dependencies(venv_path)
|
89 |
+
|
90 |
+
# Create .env file
|
91 |
+
create_env_file()
|
92 |
+
|
93 |
+
logger.info("\nSetup completed successfully!")
|
94 |
+
logger.info("\nNext steps:")
|
95 |
+
logger.info("1. Update the .env file with your API keys")
|
96 |
+
logger.info("2. Activate the virtual environment:")
|
97 |
+
logger.info(" source .venv/bin/activate")
|
98 |
+
logger.info("3. Run the application: python -m streamlit run src/crawlgpt/ui/chat_app.py")
|
99 |
+
|
100 |
+
except Exception as e:
|
101 |
+
logger.error(f"Setup failed: {e}")
|
102 |
+
sys.exit(1)
|
103 |
+
|
104 |
+
if __name__ == "__main__":
|
105 |
+
main()
|
{core → src}/__init__.py
RENAMED
File without changes
|
src/crawlgpt/__init__.py
ADDED
File without changes
|
{core → src/crawlgpt/core}/DatabaseHandler.py
RENAMED
File without changes
|
{core → src/crawlgpt/core}/LLMBasedCrawler.py
RENAMED
@@ -11,12 +11,12 @@ import logging
|
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
# Internal imports
|
14 |
-
from core.DatabaseHandler import VectorDatabase
|
15 |
-
from core.SummaryGenerator import SummaryGenerator
|
16 |
-
from utils.monitoring import MetricsCollector, RateLimiter, Metrics
|
17 |
-
from utils.progress import ProgressTracker
|
18 |
-
from utils.data_manager import DataManager
|
19 |
-
from utils.content_validator import ContentValidator
|
20 |
|
21 |
|
22 |
# Configure logging
|
|
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
# Internal imports
|
14 |
+
from src.crawlgpt.core.DatabaseHandler import VectorDatabase
|
15 |
+
from src.crawlgpt.core.SummaryGenerator import SummaryGenerator
|
16 |
+
from src.crawlgpt.utils.monitoring import MetricsCollector, RateLimiter, Metrics
|
17 |
+
from src.crawlgpt.utils.progress import ProgressTracker
|
18 |
+
from src.crawlgpt.utils.data_manager import DataManager
|
19 |
+
from src.crawlgpt.utils.content_validator import ContentValidator
|
20 |
|
21 |
|
22 |
# Configure logging
|
{core → src/crawlgpt/core}/SummaryGenerator.py
RENAMED
@@ -55,6 +55,12 @@ class SummaryGenerator:
|
|
55 |
Raises:
|
56 |
Exception: If API call fails or text processing errors occur
|
57 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
messages = [
|
59 |
{"role": "system", "content": "Generate a concise summary for the following text."},
|
60 |
{"role": "user", "content": text},
|
|
|
55 |
Raises:
|
56 |
Exception: If API call fails or text processing errors occur
|
57 |
"""
|
58 |
+
|
59 |
+
# Handle empty input text
|
60 |
+
if not text or not text.strip():
|
61 |
+
return ""
|
62 |
+
|
63 |
+
# Generate a summary using the Groq API
|
64 |
messages = [
|
65 |
{"role": "system", "content": "Generate a concise summary for the following text."},
|
66 |
{"role": "user", "content": text},
|
src/crawlgpt/core/__init__.py
ADDED
File without changes
|
{ui → src/crawlgpt/ui}/chat_app.py
RENAMED
@@ -3,11 +3,11 @@ import streamlit as st
|
|
3 |
import asyncio
|
4 |
import time
|
5 |
from datetime import datetime
|
6 |
-
from core.LLMBasedCrawler import Model
|
7 |
-
from utils.monitoring import MetricsCollector, Metrics
|
8 |
-
from utils.progress import ProgressTracker
|
9 |
-
from utils.data_manager import DataManager
|
10 |
-
from utils.content_validator import ContentValidator
|
11 |
import json
|
12 |
|
13 |
# Streamlit app title and description
|
@@ -46,7 +46,7 @@ with st.sidebar:
|
|
46 |
st.session_state.use_summary = st.checkbox("Use Summarized RAG", value=False, help="Don't use summaization when dealing with Coding Documentation.")
|
47 |
st.subheader("π€ Normal LLM Settings")
|
48 |
temperature = st.slider("Temperature", 0.0, 1.0, 0.7, help="Controls the randomness of the generated text. Lower values are more deterministic.")
|
49 |
-
max_tokens = st.slider("Max Tokens", 500, 10000,
|
50 |
model_id = st.radio("Model ID", ['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'mixtral-8x7b-32768'], help="Choose the model to use for generating responses.")
|
51 |
|
52 |
# Export/Import Data
|
|
|
3 |
import asyncio
|
4 |
import time
|
5 |
from datetime import datetime
|
6 |
+
from src.crawlgpt.core.LLMBasedCrawler import Model
|
7 |
+
from src.crawlgpt.utils.monitoring import MetricsCollector, Metrics
|
8 |
+
from src.crawlgpt.utils.progress import ProgressTracker
|
9 |
+
from src.crawlgpt.utils.data_manager import DataManager
|
10 |
+
from src.crawlgpt.utils.content_validator import ContentValidator
|
11 |
import json
|
12 |
|
13 |
# Streamlit app title and description
|
|
|
46 |
st.session_state.use_summary = st.checkbox("Use Summarized RAG", value=False, help="Don't use summaization when dealing with Coding Documentation.")
|
47 |
st.subheader("π€ Normal LLM Settings")
|
48 |
temperature = st.slider("Temperature", 0.0, 1.0, 0.7, help="Controls the randomness of the generated text. Lower values are more deterministic.")
|
49 |
+
max_tokens = st.slider("Max Tokens", 500, 10000, 5000, help="Maximum number of tokens to generate in the response.")
|
50 |
model_id = st.radio("Model ID", ['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'mixtral-8x7b-32768'], help="Choose the model to use for generating responses.")
|
51 |
|
52 |
# Export/Import Data
|
{ui → src/crawlgpt/ui}/chat_ui.py
RENAMED
@@ -5,11 +5,11 @@ import streamlit as st
|
|
5 |
import asyncio
|
6 |
import time
|
7 |
from datetime import datetime
|
8 |
-
from core.LLMBasedCrawler import Model
|
9 |
-
from utils.monitoring import MetricsCollector, Metrics
|
10 |
-
from utils.progress import ProgressTracker
|
11 |
-
from utils.data_manager import DataManager
|
12 |
-
from utils.content_validator import ContentValidator
|
13 |
import json
|
14 |
|
15 |
# Streamlit app title and description
|
|
|
5 |
import asyncio
|
6 |
import time
|
7 |
from datetime import datetime
|
8 |
+
from src.crawlgpt.core.LLMBasedCrawler import Model
|
9 |
+
from src.crawlgpt.utils.monitoring import MetricsCollector, Metrics
|
10 |
+
from src.crawlgpt.utils.progress import ProgressTracker
|
11 |
+
from src.crawlgpt.utils.data_manager import DataManager
|
12 |
+
from src.crawlgpt.utils.content_validator import ContentValidator
|
13 |
import json
|
14 |
|
15 |
# Streamlit app title and description
|
{utils → src/crawlgpt/utils}/__init__.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/content_validator.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/data_manager.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/helper_functions.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/monitoring.py
RENAMED
File without changes
|
{utils → src/crawlgpt/utils}/progress.py
RENAMED
File without changes
|
tests/{test_database_handler.py → test_core/test_database_handler.py}
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
import unittest
|
2 |
import asyncio
|
3 |
from unittest.mock import AsyncMock, MagicMock
|
4 |
-
from core.LLMBasedCrawler import Model
|
5 |
-
from core.DatabaseHandler import VectorDatabase
|
6 |
|
7 |
|
8 |
class TestIntegration(unittest.TestCase):
|
|
|
1 |
import unittest
|
2 |
import asyncio
|
3 |
from unittest.mock import AsyncMock, MagicMock
|
4 |
+
from src.crawlgpt.core.LLMBasedCrawler import Model
|
5 |
+
from src.crawlgpt.core.DatabaseHandler import VectorDatabase
|
6 |
|
7 |
|
8 |
class TestIntegration(unittest.TestCase):
|
tests/{test_integration.py → test_core/test_integration.py}
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
-
from core.LLMBasedCrawler import Model
|
4 |
-
from core.DatabaseHandler import VectorDatabase
|
5 |
|
6 |
|
7 |
class TestIntegration(unittest.IsolatedAsyncioTestCase): # Use IsolatedAsyncioTestCase for async tests
|
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
+
from crawlgpt.core.LLMBasedCrawler import Model
|
4 |
+
from crawlgpt.core.DatabaseHandler import VectorDatabase
|
5 |
|
6 |
|
7 |
class TestIntegration(unittest.IsolatedAsyncioTestCase): # Use IsolatedAsyncioTestCase for async tests
|
tests/{test_llm_based_crawler.py → test_core/test_llm_based_crawler.py}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
-
from core.LLMBasedCrawler import Model
|
4 |
import asyncio
|
5 |
|
6 |
|
|
|
1 |
import unittest
|
2 |
from unittest.mock import AsyncMock, MagicMock
|
3 |
+
from crawlgpt.core.LLMBasedCrawler import Model
|
4 |
import asyncio
|
5 |
|
6 |
|
tests/{test_summary_generator.py → test_core/test_summary_generator.py}
RENAMED
@@ -1,5 +1,5 @@
|
|
1 |
import unittest
|
2 |
-
from core.SummaryGenerator import SummaryGenerator
|
3 |
|
4 |
|
5 |
class TestSummaryGenerator(unittest.TestCase):
|
|
|
1 |
import unittest
|
2 |
+
from crawlgpt.core.SummaryGenerator import SummaryGenerator
|
3 |
|
4 |
|
5 |
class TestSummaryGenerator(unittest.TestCase):
|