Spaces:
Runtime error
Runtime error
coderpotter
commited on
Upload folder using huggingface_hub
Browse files- .github/workflows/lint_pytest.yml +25 -0
- .github/workflows/update_space.yml +28 -0
- .gitignore +13 -0
- KEYS_TEMPLATE.py +3 -0
- README.md +63 -7
- config/config.yaml +32 -0
- mypy.ini +2 -0
- requirements.txt +28 -0
- setup.py +30 -0
- src/research_assistant/__init__.py +0 -0
- src/research_assistant/app_logging/__init__.py +14 -0
- src/research_assistant/components/__init__.py +0 -0
- src/research_assistant/components/agent.py +74 -0
- src/research_assistant/components/agent_tools.py +96 -0
- src/research_assistant/components/arxiv_search_api.py +84 -0
- src/research_assistant/components/pdfParser.py +68 -0
- src/research_assistant/components/planner.py +112 -0
- src/research_assistant/components/plannerParser.py +168 -0
- src/research_assistant/components/solver.py +72 -0
- src/research_assistant/components/state.py +13 -0
- src/research_assistant/config/__init__.py +0 -0
- src/research_assistant/config/configuration.py +55 -0
- src/research_assistant/constants/__init__.py +12 -0
- src/research_assistant/entity/__init__.py +46 -0
- src/research_assistant/main.py +58 -0
- src/research_assistant/pipeline/__init__.py +0 -0
- src/research_assistant/pipeline/articleSearch.py +15 -0
- src/research_assistant/pipeline/articleSummarization.py +98 -0
- src/research_assistant/utils/__init__.py +0 -0
- src/research_assistant/utils/common.py +57 -0
- src/research_assistant/utils/state_utils.py +25 -0
- src/research_assistant/web/app.py +58 -0
- tests.py +2 -0
.github/workflows/lint_pytest.yml
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: lint_pytest
|
2 |
+
on: push
|
3 |
+
jobs:
|
4 |
+
lint:
|
5 |
+
runs-on: ubuntu-latest
|
6 |
+
steps:
|
7 |
+
- uses: actions/checkout@v4
|
8 |
+
- uses: psf/black@stable
|
9 |
+
- name: Set up Python
|
10 |
+
uses: actions/setup-python@v5
|
11 |
+
with:
|
12 |
+
python-version: "3.9.19"
|
13 |
+
- name: Install dependencies
|
14 |
+
run: |
|
15 |
+
python -m pip install --upgrade pip
|
16 |
+
pip install -r requirements.txt
|
17 |
+
- name: Ruff
|
18 |
+
run: |
|
19 |
+
ruff check --output-format=github .
|
20 |
+
- name: Mypy
|
21 |
+
run: |
|
22 |
+
mypy . --install-types --non-interactive
|
23 |
+
- name: Test with pytest
|
24 |
+
run: |
|
25 |
+
pytest -s tests.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
|
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Run Python script
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- main
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
build:
|
10 |
+
runs-on: ubuntu-latest
|
11 |
+
|
12 |
+
steps:
|
13 |
+
- name: Checkout
|
14 |
+
uses: actions/checkout@v2
|
15 |
+
|
16 |
+
- name: Set up Python
|
17 |
+
uses: actions/setup-python@v2
|
18 |
+
with:
|
19 |
+
python-version: '3.9'
|
20 |
+
|
21 |
+
- name: Install Gradio
|
22 |
+
run: python -m pip install gradio
|
23 |
+
|
24 |
+
- name: Log in to Hugging Face
|
25 |
+
run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
|
26 |
+
|
27 |
+
- name: Deploy to Spaces
|
28 |
+
run: gradio deploy
|
.gitignore
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
**cache**
|
2 |
+
**playground**
|
3 |
+
.DS_Store
|
4 |
+
.env
|
5 |
+
settings.json
|
6 |
+
data
|
7 |
+
logs
|
8 |
+
keys.yaml
|
9 |
+
*.egg-info
|
10 |
+
research_trails
|
11 |
+
KEYS.py
|
12 |
+
.gradio/
|
13 |
+
**.pdf
|
KEYS_TEMPLATE.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
ANTHROPIC = "sk-xxx"
|
2 |
+
FIREWORKS_AI = "xxx"
|
3 |
+
OPENAI = "sk-proj-xxx"
|
README.md
CHANGED
@@ -1,12 +1,68 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.6.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: research-assistant
|
3 |
+
app_file: src/research_assistant/web/app.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 5.6.0
|
|
|
|
|
6 |
---
|
7 |
+
# Research Assistant
|
8 |
|
9 |
+
This is a Research Assistant that helps in analyzing and simplifying the content present in a research article, such that you don't have to read the whole thing to understand what knowledge is being presented inside the article. This tool takes care of it and provides you the understanding you would need.
|
10 |
+
|
11 |
+
# How to run?
|
12 |
+
## Step 1:
|
13 |
+
First install the requirements
|
14 |
+
```bash
|
15 |
+
pip install -r requirements.txt
|
16 |
+
pip install -e .
|
17 |
+
```
|
18 |
+
|
19 |
+
## Step 2:
|
20 |
+
Create a file named KEYS.py in the project root, following the template in KEYS_TEMPLATE.py
|
21 |
+
Enter your API keys in KEYS.py and save the file
|
22 |
+
|
23 |
+
## Step 3:
|
24 |
+
Update the contents of the config/config.yaml file. The path to the article file and the search parameters for articles are set inside config.yaml. Before every run, if you want to change the search configuration or summarization parameters, you need to update the config.yaml file.
|
25 |
+
|
26 |
+
## Step 4:
|
27 |
+
The Summarization pipeline can be run in 2 ways:
|
28 |
+
|
29 |
+
### From Command Line Interface as Pip Package:
|
30 |
+
Step 1 installs the whole repo as a pip package in editable mode.
|
31 |
+
To access the package and get the summary of the file, run the following command:
|
32 |
+
|
33 |
+
|
34 |
+
```bash
|
35 |
+
research --pipeline_name
|
36 |
+
```
|
37 |
+
### From running the Main File:
|
38 |
+
|
39 |
+
##### Confirm the Arguments:
|
40 |
+
To change the model names, filepath to get the summary, filepath to save the summary, make changes in the config/config.yaml file.
|
41 |
+
|
42 |
+
##### Run the following command
|
43 |
+
```bash
|
44 |
+
python src/research_assistant/main.py --pipeline_name
|
45 |
+
```
|
46 |
+
|
47 |
+
# Different Pipelines:
|
48 |
+
There are two pipelines available here:
|
49 |
+
## Summarization Pipeline:
|
50 |
+
Given a paper, this pipeline gives out the summarization. The paper filepath can be set in config/config.yaml. To activate this pipeline, run the following command:
|
51 |
+
|
52 |
+
```bash
|
53 |
+
# If you want to use the package version, run the following command
|
54 |
+
research --summarize_article
|
55 |
+
|
56 |
+
# If you want to run the main file directly from the CLI, run the following command
|
57 |
+
python src/research_assistant/main.py --summarize_article
|
58 |
+
```
|
59 |
+
## Searching for Articles:
|
60 |
+
Given a few keywords and other parameters, this pipeline will crawl Arxiv and get you the articles that are relevant to you. To activate this pipeline, run the following command
|
61 |
+
|
62 |
+
```bash
|
63 |
+
# If you want to use the package version, run the following command
|
64 |
+
research --search_articles
|
65 |
+
|
66 |
+
# If you want to run the main file directly from the CLI, run the following command
|
67 |
+
python src/research_assistant/main.py --search_articles
|
68 |
+
```
|
config/config.yaml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# RENAME THIS TO YOUR CONFIGURATIONS
|
2 |
+
article_details:
|
3 |
+
file_path: 'data/2106.07691v1.pdf'
|
4 |
+
summary_save_dir: 'data/summary_results'
|
5 |
+
|
6 |
+
# TODO: Remove this when TECH-67 is implemented
|
7 |
+
article_search_details:
|
8 |
+
search_terms: ['Hallucination', 'Natural Language Inference']
|
9 |
+
num_results: 5
|
10 |
+
date_range:
|
11 |
+
start_date: '2023-01-01'
|
12 |
+
end_date: '2024-12-31'
|
13 |
+
sort_by: 'submittedDate'
|
14 |
+
sort_order: 'descending'
|
15 |
+
|
16 |
+
#################################################################################################################
|
17 |
+
# MAKE CHANGES FROM HERE ONLY IF YOU ARE SURE OF WHAT YOU ARE DOING, ELSE DO NOT EDIT
|
18 |
+
#################################################################################################################
|
19 |
+
|
20 |
+
planner:
|
21 |
+
model_name: 'claude-3-5-sonnet-20241022'
|
22 |
+
|
23 |
+
planner_parser:
|
24 |
+
tool_list_model: 'claude-3-5-sonnet-20241022'
|
25 |
+
argument_list_model: 'claude-3-5-sonnet-20241022'
|
26 |
+
dependency_list_model: 'claude-3-5-sonnet-20241022'
|
27 |
+
|
28 |
+
qa_tool:
|
29 |
+
model_name: 'claude-3-5-sonnet-20241022'
|
30 |
+
|
31 |
+
solver:
|
32 |
+
model_name: 'claude-3-5-sonnet-20241022'
|
mypy.ini
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[mypy]
|
2 |
+
ignore_missing_imports = True
|
requirements.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python-dotenv
|
2 |
+
langsmith
|
3 |
+
langgraph
|
4 |
+
langchain
|
5 |
+
numexpr
|
6 |
+
langchain-anthropic
|
7 |
+
langchain-community
|
8 |
+
langchain-core
|
9 |
+
pdfminer
|
10 |
+
pytest
|
11 |
+
pdfminer.six
|
12 |
+
arxiv
|
13 |
+
python-box
|
14 |
+
langchain-fireworks
|
15 |
+
langchain-google_vertexai
|
16 |
+
langchain-openai
|
17 |
+
types-PyYAML
|
18 |
+
gradio
|
19 |
+
markdown2
|
20 |
+
fpdf2
|
21 |
+
mistletoe
|
22 |
+
pydantic==2.9.0
|
23 |
+
xmltodict
|
24 |
+
mypy
|
25 |
+
types-requests
|
26 |
+
ruff
|
27 |
+
pytest
|
28 |
+
pytest-cov
|
setup.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Packaging script: installs the code under ``src/`` and exposes the ``research`` CLI."""
import setuptools

# Package metadata.
__version__ = "0.0.0"
REPO_NAME = "research-assistant"
AUTHOR_USER_NAME = "Actualization-AI"
SRC_REPO = "research-assistant"
AUTHOR_EMAIL = "[email protected]"

# The README doubles as the long description of the package.
with open("README.md", "r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Both the homepage and the bug tracker hang off the same GitHub URL.
_repo_url = f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}"

setuptools.setup(
    name=SRC_REPO,
    version=__version__,
    author=AUTHOR_USER_NAME,
    author_email=AUTHOR_EMAIL,
    description="A Research Assistant which can scrape and summarize research articles for easier understanding",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url=_repo_url,
    project_urls={"Bug Tracker": f"{_repo_url}/issues"},
    # `research` on the command line dispatches to research_assistant.main:main.
    entry_points={"console_scripts": ["research=research_assistant.main:main"]},
    # Packages are discovered under src/ (src-layout).
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
)
|
src/research_assistant/__init__.py
ADDED
File without changes
|
src/research_assistant/app_logging/__init__.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
|
5 |
+
# Log records go both to a file under ./logs and to stdout.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, "running_logs.log")

_log_format = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
_file_handler = logging.FileHandler(log_filepath)
_stdout_handler = logging.StreamHandler(sys.stdout)

# Configure the root logger once, at import time of this package.
logging.basicConfig(
    level=logging.INFO,
    format=_log_format,
    handlers=[_file_handler, _stdout_handler],
)

# Shared logger imported by the rest of the application.
app_logger = logging.getLogger("ResearchAssistantLogger")
|
src/research_assistant/components/__init__.py
ADDED
File without changes
|
src/research_assistant/components/agent.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
from langchain_anthropic import ChatAnthropic
|
5 |
+
from langchain_fireworks import ChatFireworks
|
6 |
+
from langchain_google_vertexai import ChatVertexAI
|
7 |
+
from langchain_openai import ChatOpenAI
|
8 |
+
|
9 |
+
sys.path.append(os.getcwd())
|
10 |
+
import KEYS
|
11 |
+
from research_assistant.app_logging import app_logger
|
12 |
+
|
13 |
+
|
14 |
+
def set_api_key(env_var: str, api_key: str):
    """Store ``api_key`` in this process's environment under the name ``env_var``."""
    os.environ.update({env_var: api_key})
|
16 |
+
|
17 |
+
|
18 |
+
class Agent:
    """Chat model selected from the model name, with its API key exported.

    The provider is chosen by the first marker ("gpt", "claude", "gemini",
    "fireworks") that occurs in ``model_name`` and whose key is defined in
    the ``KEYS`` module. After construction, ``self.model`` holds the chat
    model and ``self.max_tokens`` the token limit looked up for that name.

    Raises:
        ValueError: if no provider marker with a configured key matches.
    """

    def __init__(self, model_name: str):
        # (marker in model name, chat class, env var for the key, KEYS attribute)
        providers = [
            ("gpt", ChatOpenAI, "OPENAI_API_KEY", "OPENAI"),
            ("claude", ChatAnthropic, "ANTHROPIC_API_KEY", "ANTHROPIC"),
            ("gemini", ChatVertexAI, "GOOGLE_API_KEY", "VERTEX_AI"),
            ("fireworks", ChatFireworks, "FIREWORKS_API_KEY", "FIREWORKS_AI"),
        ]
        # Token limits by exact model name; anything else falls back to 128000.
        token_limits = {
            "gpt-3.5": 16000,
            "gpt-4": 8000,
            "gpt-4o-mini": 8000,
            "llama-v3p2-1b-instruct": 128000,
            "llama-v3p2-3b-instruct": 128000,
            "llama-v3p1-8b-instruct": 128000,
            "llama-v3p1-70b-instruct": 128000,
            "llama-v3p1-405b-instruct": 128000,
            "mixtral-8x22b-instruct": 64000,
            "mixtral-8x7b-instruct": 32000,
            "mixtral-8x7b-instruct-hf": 32000,
            "qwen2p5-72b-instruct": 32000,
            "gemma2-9b-it": 8000,
            "llama-v3-8b-instruct": 8000,
            "llama-v3-70b-instruct": 8000,
            "llama-v3-70b-instruct-hf": 8000,
        }

        chosen = None
        for marker, chat_class, env_var, key_attr in providers:
            # Providers without a key in KEYS are skipped even if the name matches.
            if key_attr in KEYS.__dict__ and marker in model_name:
                chosen = (chat_class, env_var, getattr(KEYS, key_attr))
                break
        if chosen is None:
            raise ValueError(f"Model {model_name} not supported")

        chat_class, env_var, api_key = chosen
        set_api_key(env_var, api_key)  # type: ignore
        model = chat_class(model=model_name, temperature=0.5)  # type: ignore
        max_tokens = token_limits.get(model_name, 128000)

        app_logger.info(f"Model {model_name} is initialized successfully")
        self.model = model
        self.max_tokens = max_tokens

    def get_model(self):
        """Return the chat model built in ``__init__``."""
        return self.model
|
src/research_assistant/components/agent_tools.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional
|
2 |
+
|
3 |
+
from langchain_anthropic.chat_models import ChatAnthropic
|
4 |
+
from langchain_community.utilities.arxiv import ArxivAPIWrapper
|
5 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
6 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
7 |
+
from langchain_core.runnables import RunnableConfig
|
8 |
+
from langchain_core.tools import StructuredTool
|
9 |
+
from pydantic import BaseModel, Field
|
10 |
+
|
11 |
+
|
12 |
+
class ExecuteCode(BaseModel):
    """The input to the summarizer tool function."""

    # NOTE(review): get_qa_tool passes this class to llm.with_structured_output,
    # so the docstring and field descriptions are presumably sent to the model
    # as schema text -- confirm before rewording any of them.

    # The model's explanation of how it arrived at the answer.
    reasoning: str = Field(
        description="The reasoning behind the code expression, including how context is included, if applicable.",
    )
    # The answer text itself; declared with `...`, i.e. no default value.
    answer: str = Field(
        ...,
        description="The answer to the question about the research article.",
    )
|
22 |
+
|
23 |
+
|
24 |
+
def get_qa_tool(llm: ChatAnthropic):
    """Build the ``qa_agent`` tool that answers questions about a research article.

    The tool pipes a fixed system prompt, an optional context message and the
    question into ``llm`` and asks for output structured as ``ExecuteCode``.

    Args:
        llm: Chat model the prompt is piped into; it must provide
            ``with_structured_output``.

    Returns:
        A ``StructuredTool`` named ``qa_agent`` wrapping the answer function.
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                (
                    "You are an advanced research assistant answering questions about a specific research article. The question may require external information beyond the research article itself. This external information, along with the parsed content from the research article, will be provided as 'Additional Context'.\n\n"
                    # comment for readability
                    "You must:\n"
                    "1. Thoroughly analyze the research article to understand its key objectives, methods, findings, and implications.\n"
                    "2. Use the research article and any additional context to construct a comprehensive, well-informed answer to the given question.\n"
                    "3. Explicitly reference and combine information from both the research article and the additional context when needed, ensuring that the response is relevant, accurate, and complete.\n\n"
                    # comment for readability
                    "Follow these steps when answering:\n"
                    "- If the question can be answered using information from the research article alone, do so.\n"
                    "- If additional context is needed to supplement or clarify the answer, carefully integrate it with the information from the article.\n"
                    "- Ensure the response is precise, concise, and clear, citing the research article and additional context appropriately."
                )
            ),
            MessagesPlaceholder(variable_name="context", optional=True),
            MessagesPlaceholder(variable_name="question"),
        ]
    )
    summarizer = prompt | llm.with_structured_output(ExecuteCode)

    # Deliberately no docstring on the nested function: the tool description is
    # supplied explicitly below, and a docstring here could leak into the
    # schema that StructuredTool infers from the function.
    def get_answer(
        question: str,
        context: Optional[List[str]] = None,
        config: Optional[RunnableConfig] = None,
    ):
        # Collapse the context snippets into one string; None/empty -> no context.
        context_str = "\n".join(context).strip() if context else None
        chain_input = {
            "question": [HumanMessage(question)],
            "context": [
                (
                    HumanMessage(
                        (
                            # Only this first literal is an f-string, so `{{#}}` renders as `{#}` here.
                            f"Additional context has been provided from other tools (such as parsed PDF content or information retrieved from internet searches). Use it to substitute into any {{#}} variables or other words in the question. Do not directly substitute the value. Rather, extract information in the best suitable format and then substitute. Use this context to enrich your answer by integrating it with the information from the research article. Context:\n{context_str}\n\n"
                            # comment for readability
                            "Instructions:\n"
                            "- Identify where the additional context is necessary to supplement or clarify the research article's information.\n"
                            # Plain (non-f) string: braces are written once so the
                            # model sees the same `{#}` marker as in the text above.
                            "- Replace any placeholders or variable information (e.g., {#}) with appropriate details from the context.\n"
                            "- Make sure the final answer blends the research article content with the additional context in a cohesive and accurate manner.\n\n"
                            # comment for readability
                            "Once done, output the updated, comprehensive answer."
                        )
                    )
                    if context_str
                    else HumanMessage("No Additional Context is Provided")
                )
            ],
        }
        return summarizer.invoke(chain_input, config)

    return StructuredTool.from_function(
        name="qa_agent",
        func=get_answer,
        description="This tool is designed to answer specific questions about a research article, rather than simply providing a full summary. It offers a well-rounded and accurate response to your inquiry, allowing you to focus on the exact information you need without having to go through the entire article.",
    )
|
82 |
+
|
83 |
+
|
84 |
+
def get_arxiv_tool(
    k_results: int = 3,
    max_query_length: int = 300,
    max_docs: int = 3,
    doc_content_chars_max: int = 40000,
):
    """Create an ``ArxivAPIWrapper`` configured from the given limits.

    Args:
        k_results: Passed as ``top_k_results``.
        max_query_length: Passed as ``ARXIV_MAX_QUERY_LENGTH``.
        max_docs: Passed as ``load_max_docs``.
        doc_content_chars_max: Passed as ``doc_content_chars_max``.

    Returns:
        The configured ``ArxivAPIWrapper``; ``load_all_available_meta`` is
        always set to ``False``.
    """
    wrapper_options = {
        "top_k_results": k_results,
        "ARXIV_MAX_QUERY_LENGTH": max_query_length,
        "load_max_docs": max_docs,
        "load_all_available_meta": False,
        "doc_content_chars_max": doc_content_chars_max,
    }
    return ArxivAPIWrapper(**wrapper_options)  # type: ignore
|
src/research_assistant/components/arxiv_search_api.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
|
3 |
+
import requests
|
4 |
+
import xmltodict
|
5 |
+
|
6 |
+
from research_assistant.app_logging import app_logger
|
7 |
+
from research_assistant.constants import ARXIV_API_ACCESS_POINT
|
8 |
+
from research_assistant.entity import ArticleSearchConfig
|
9 |
+
|
10 |
+
|
11 |
+
class ArxivApiWrap:
    """Small client for the arXiv search API, driven by an ``ArticleSearchConfig``."""

    def __init__(self, config: "ArticleSearchConfig"):
        self.config = config

    @staticmethod
    def _as_list(value):
        """Return ``value`` as a list.

        xmltodict yields a dict for an element that occurs once, a list for a
        repeated element, and no key at all when the element is absent; this
        maps those three shapes onto ``[value]``, ``value`` and ``[]``.
        """
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    @staticmethod
    def _pdf_title(pdf_url):
        """Return the last path segment of ``pdf_url`` without a ``.pdf`` suffix."""
        title = pdf_url.rstrip("/").split("/")[-1]
        return title[:-4] if title.endswith(".pdf") else title

    def convert_link_to_pdflink(self, link):
        """Turn an arXiv ``/abs/`` page link into the matching ``/pdf/`` link."""
        return link.replace("/abs/", "/pdf/") + ".pdf"

    def convert_date(self, date):
        """Reformat a ``YYYY-MM-DD`` date string as ``YYYYMMDD``."""
        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

    def get_arxiv_api_response(self):
        """Fetch the raw arXiv API response for the configured search.

        Values read from ``self.config``:
            search_terms: keywords, AND-ed together in the ``all:`` field.
            num_results: maximum number of articles to retrieve.
            date_range: ``start_date`` / ``end_date``; the submittedDate
                filter is added only when ``start_date`` is set.
            sort_by: field the results are sorted by (e.g. submittedDate).
            sort_order: sort direction for the results.

        Returns:
            requests.Response: The HTTP response object returned by the arXiv API.
        """
        keyword_query = " AND all:".join([f"'{kw}'" for kw in self.config.search_terms])
        if self.config.date_range.start_date:
            query = f" all:{keyword_query} AND submittedDate:[{self.convert_date(self.config.date_range.start_date)} TO {self.convert_date(self.config.date_range.end_date)}]"
        else:
            query = f" all:{keyword_query}"
        params = {
            "search_query": query,
            "start": 0,  # Index of the first result to return
            "max_results": self.config.num_results,
            "sortBy": self.config.sort_by,
            "sortOrder": self.config.sort_order,
        }
        return requests.get(ARXIV_API_ACCESS_POINT, params=params)

    def get_article_search_result(self):
        """Run the search and log the details of every article found.

        The XML response is parsed with xmltodict; title, summary, link and
        authors of each entry are logged. A non-200 response is logged and
        yields an empty list.

        Returns:
            list of str: The article links (entry ids) retrieved from the arXiv API.
        """
        response = self.get_arxiv_api_response()
        article_links = []
        if response.status_code == 200:
            # Parse the response (arXiv API returns XML)
            data = xmltodict.parse(response.content)
            # A single hit parses to a dict and zero hits omit the key, so
            # normalize before iterating (same for a paper with one author).
            for entry in self._as_list(data["feed"].get("entry")):
                title, summary, link, authors = (
                    entry["title"],
                    entry["summary"],
                    entry["id"],
                    [author["name"] for author in self._as_list(entry["author"])],
                )
                app_logger.info(
                    f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n Page Link: {link}\n PDF Link: {self.convert_link_to_pdflink(link)}\n Paper Id: {link.split('/')[-1]}\n {'-'*80}"
                )
                article_links.append(link)
        else:
            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
        return article_links

    def download_pdf(self, pdf_url):
        """Download ``pdf_url`` into ``data/<paper id>.pdf``.

        The file name comes from the last URL segment. A non-200 response is
        logged and nothing is written, so an error page is never saved as a PDF.
        """
        title = self._pdf_title(pdf_url)
        response = requests.get(pdf_url)
        if response.status_code != 200:
            app_logger.info(f"Failed to download {pdf_url}: {response.status_code}")
            return
        with open(f"data/{title}.pdf", "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {title}.pdf")
|
src/research_assistant/components/pdfParser.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
from pdfminer.high_level import extract_pages
|
4 |
+
from pdfminer.layout import LTTextContainer
|
5 |
+
|
6 |
+
from research_assistant.app_logging import app_logger
|
7 |
+
|
8 |
+
|
9 |
+
def pdf_parser(pdf_path):
    """
    Extracts text from a PDF file, removing headers, footers, and page numbers.

    The first and last text block of each page are counted; a block that is
    the first (or last) block on more than half of the pages is treated as a
    header (or footer). Every block that starts with such a header/footer, or
    that consists only of a page number, is dropped.

    Args:
        pdf_path (str): The file path to the PDF.
    Returns:
        str: The extracted text suitable for LLM input, or "" if parsing
        fails (the error is logged).
    """
    # Matches lines with page numbers
    page_number_pattern = re.compile(r"^(Page\s+)?\d+(/\d+)?$")

    try:
        # Lay out the PDF once and keep the non-empty text blocks per page;
        # both steps below work on this list instead of re-parsing the file.
        pages = []
        for page_layout in extract_pages(pdf_path):
            stripped = (
                element.get_text().strip()
                for element in page_layout
                if isinstance(element, LTTextContainer)
            )
            pages.append([text for text in stripped if text])

        # Count how often each block opens or closes a page.
        header_counter, footer_counter = {}, {}
        for page_text in pages:
            if len(page_text) >= 2:
                header, footer = page_text[0], page_text[-1]
                header_counter[header] = header_counter.get(header, 0) + 1
                footer_counter[footer] = footer_counter.get(footer, 0) + 1

        # Blocks recurring on more than half of the pages are boilerplate.
        threshold = len(pages) * 0.5
        header_patterns = {k for k, v in header_counter.items() if v > threshold}
        footer_patterns = {k for k, v in footer_counter.items() if v > threshold}
        # str.startswith with a tuple is the literal-prefix test the escaped
        # regexes performed; an empty tuple never matches.
        boilerplate = tuple(header_patterns | footer_patterns)

        extracted_text = [
            line
            for page_text in pages
            for line in page_text
            if not line.startswith(boilerplate)
            and not page_number_pattern.match(line)
        ]
        return " ".join(extracted_text).replace("\n", " ").strip()
    except Exception as e:
        # Best effort: callers get an empty string rather than an exception.
        app_logger.error(f"Failed to parse PDF {pdf_path}: {e}")
        return ""
|
src/research_assistant/components/planner.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List, Optional
|
2 |
+
|
3 |
+
from langchain_anthropic.chat_models import ChatAnthropic
|
4 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
5 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
6 |
+
from langchain_core.runnables import RunnableConfig
|
7 |
+
from langchain_core.tools import StructuredTool
|
8 |
+
from pydantic import BaseModel, Field
|
9 |
+
|
10 |
+
from research_assistant.components.plannerParser import PlannerParser
|
11 |
+
from research_assistant.constants import HEILMEIER_CATECHISM
|
12 |
+
|
13 |
+
|
14 |
+
class PlannerOutput(BaseModel):
    # Parsed form of a plan produced by the planner LLM. parse_plan in
    # get_planner fills it from the raw plan string and the PlannerParser
    # results (tool list, dependency list, argument list).

    # The plan exactly as the LLM returned it.
    plan_str: str = Field(
        ...,
        description=(
            "This plan includes a detailed breakdown of each step, specifying the task, the tool used, the arguments provided, and any dependencies (outputs from previous steps) required as inputs for that step. An example of a single step would be:\n"
            'Plan_step: "Using the different shots from #E2, analyze their impact on the game. #E3: LLM [What impact do the shots mentioned in #E2 have on the game?]"'
        ),
    )
    # Tool names; presumably one entry per plan step, in step order -- TODO confirm.
    tools: list[str] = Field(
        ..., description="The Tool each step of the plan needs to use."
    )
    # Maps a step number to the step numbers whose outputs it consumes.
    dependencies: Dict[int, list[int]] = Field(
        ...,
        description=(
            "A dictionary of dependencies, which elaborates on what outputs are needed for each step in the plan to execute. So that the output of those steps is retrieved and added inside the prompt for the present step. An example of a dependency would be: {2 : [1]}"
        ),
    )
    # Tool inputs; presumably aligned index-for-index with `tools` -- TODO confirm.
    arguments: list[str] = Field(
        ...,
        description="The arguments that the tool needs to be given. These arguments will be used in the prompt to get the output.",
    )
|
35 |
+
|
36 |
+
|
37 |
+
def get_planner(llm: ChatAnthropic):
|
38 |
+
prompt_template = ChatPromptTemplate.from_messages(
|
39 |
+
[
|
40 |
+
SystemMessage(
|
41 |
+
(
|
42 |
+
f"You are a research assistant whose primary job is to explain a research article in a clear and accessible way. Your goal is to read the entire article and provide an explanation that allows other researchers to understand its content without having to read it themselves. Additionally, you should be able to answer any questions they might have. The most efficient way to accomplish this is by answering the following Heilmeier catechism questions in detail:\n{HEILMEIER_CATECHISM}\n\n"
|
43 |
+
# comment for readability
|
44 |
+
"You are a planner of the research assistant agent architecture. You need to generate a step-by-step plan process such that you can get all the answers to the given questions using the provided tools:\n"
|
45 |
+
"(1) Arxiv [input]: A tool that searches for results from the Arxiv website. It is useful for finding information on specific topics. The input should be a concise text string, similar to a search query used in a Google search. This tool searches published articles and provides details about the article and a summary of its content. The information obtained is reliable, so if you need information not covered in the research article or require external information, use this tool.\n"
|
46 |
+
"(2) LLM [input]: A pretrained language model that can answer any questions. You provide the query and additional context, and it generates a relevant, summarized answer. The additional context may include the output from previous steps or evidence gathered using the Arxiv tool.\n\n"
|
47 |
+
# comment for readability
|
48 |
+
"For example,\n"
|
49 |
+
"Task: Explain different kinds of cricket shots.\n"
|
50 |
+
"plan_str:\n"
|
51 |
+
'1. Start by finding different kinds of cricket shots. #E1 = Arxiv["Different kinds of cricket shots"].\n'
|
52 |
+
'2. Given the result of the search query, find different types of cricket shots. #E2 = LLM ["Find the different types of cricket shots given the result of search query #E1."]\n'
|
53 |
+
'3. Now, let us find out about different types of cricket shots and their impact on the game. #E3 = LLM ["Given the different types of cricket shots from the step #E1 till step #E2, how does their impact on the game look like?"]\n\n'
|
54 |
+
# comment for readability
|
55 |
+
"Describe the steps of your plan with rich details. Each step of the plan should contain #E as shown in the example. DO NOT write a step at the end to summarize the plan."
|
56 |
+
)
|
57 |
+
),
|
58 |
+
MessagesPlaceholder(variable_name="context", optional=True),
|
59 |
+
MessagesPlaceholder(variable_name="article_text"),
|
60 |
+
]
|
61 |
+
)
|
62 |
+
planner = prompt_template | llm
|
63 |
+
|
64 |
+
# parse the response to get the plan, tasks, tools, dependencies, and arguments
|
65 |
+
def parse_plan(plan_string: str):
|
66 |
+
parser = PlannerParser(plan_string=plan_string)
|
67 |
+
return PlannerOutput(
|
68 |
+
plan_str=plan_string,
|
69 |
+
tools=parser.get_tool_list(),
|
70 |
+
dependencies=parser.get_dependency_list(),
|
71 |
+
arguments=parser.get_argument_list(),
|
72 |
+
)
|
73 |
+
|
74 |
+
def get_plan(
|
75 |
+
article_text: str,
|
76 |
+
_context: Optional[List[str]] = None, # TODO: rename when context is used
|
77 |
+
_config: Optional[RunnableConfig] = None, # TODO: rename when config is used
|
78 |
+
):
|
79 |
+
response = planner.invoke(
|
80 |
+
{
|
81 |
+
"article_text": [
|
82 |
+
HumanMessage(
|
83 |
+
f"You are given a research document with the following content:\n{article_text}.\n\n"
|
84 |
+
"Read the research document thoroughly. Using the tools provided to you, generate a step-by-step plan that would use these tools in the specified step-wise manner to get a detailed summary for all the questions."
|
85 |
+
)
|
86 |
+
]
|
87 |
+
}
|
88 |
+
)
|
89 |
+
if isinstance(response.content, str):
|
90 |
+
return parse_plan(response.content)
|
91 |
+
else:
|
92 |
+
raise TypeError(
|
93 |
+
"Response.Content i.e the plan given out from the llm must be a string"
|
94 |
+
)
|
95 |
+
|
96 |
+
return StructuredTool.from_function(
|
97 |
+
name="planner",
|
98 |
+
func=get_plan,
|
99 |
+
description=(
|
100 |
+
(
|
101 |
+
'This tool is used to generate a plan for obtaining a summary of research articles. Rather than providing the entire summary, it focuses on creating a step-by-step plan that guides the agent in producing a detailed, accurate summary of a research article. This tool can be considered the "brain" that designs the agent\'s workflow.\n'
|
102 |
+
"For Example:\n"
|
103 |
+
"Input: The parsed pdf string of the article\n"
|
104 |
+
"Answer: An object consisting of the following fields:\n"
|
105 |
+
"plan_string: str\n"
|
106 |
+
"steps : List[str]\n"
|
107 |
+
"tools : List[str]\n"
|
108 |
+
"dependencies : dict\n"
|
109 |
+
"arguments : List[str]\n"
|
110 |
+
)
|
111 |
+
),
|
112 |
+
)
|
src/research_assistant/components/plannerParser.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
2 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
3 |
+
|
4 |
+
from research_assistant.components.agent import Agent
|
5 |
+
from research_assistant.config.configuration import ConfigurationManager
|
6 |
+
|
7 |
+
|
8 |
+
class PlannerParser:
    """Extracts tools, arguments and dependencies from a textual plan.

    Each ``get_*`` method asks a language model, constrained by one of the
    JSON schemas below, to pull one aspect out of ``plan_string``.
    """

    # Schema for the per-step tool names.
    tool_list_schema = {
        "title": "ToolSchema",
        "description": "This is the schema used to get tools list, after parsing the plan string, which will be used to do tool or fucntion calling for the further parts of the agentic framework",
        "type": "object",
        "properties": {
            "tools": {
                "type": "array",
                "items": {"type": "string"},
                "title": "Tools",
                "description": "The Tool each step in the plan needs to access. If we have seven steps, this has a list of 7 values depicting the tool name for each step that needs to be used",
                "default": [],
            },
        },
        "required": ["tools"],
    }
    # Schema for the per-step tool arguments.
    argument_list_schema = {
        "title": "ArgumentSchema",
        "description": "This is the schema used to get argument list used for each tool call, after parsing the plan string, which will be used to do tool or fucntion calling for the further parts of the agentic framework",
        "type": "object",
        "properties": {
            "arguments": {
                "type": "array",
                "items": {"type": "string"},
                "title": "Arguments",
                "description": "The Arugment for tool call for each step in the plan needs to access. If we have seven steps, this has a list of 7 values depicting the argument value for each step that needs to be used",
                "default": [],
            },
        },
        "required": ["arguments"],
    }
    # Schema for the step -> prerequisite-steps mapping.
    dependency_list_schema = {
        "title": "DependentSchema",
        "description": "This schema defines dependencies for each tool call to fetch additional context after parsing the plan string for the agentic framework.",
        "type": "object",
        "properties": {
            "dependencies": {
                "type": "object",
                "additionalProperties": {
                    "type": "array",
                    "items": {"type": "integer"},
                },
                "title": "Dependencies",
                "description": "A dictionary where each key is a step identifier, and the value is an array of integers, representing the step dependencies.",
                "default": {},
            }
        },
        "required": ["dependencies"],
    }

    def __init__(self, plan_string: str):
        """Store the plan text and load the planner-parser configuration."""
        self.plan_string = plan_string
        self.config = ConfigurationManager().get_planner_parser_config()

    def _run_parser(self, model_name, system_prompt, schema, request):
        """Run one schema-constrained parsing call over the stored plan.

        Args:
            model_name: Model identifier handed to ``Agent``.
            system_prompt: Instructions placed in the system message.
            schema: JSON schema passed to ``with_structured_output``.
            request: Closing instruction appended after the plan text.

        Returns:
            Whatever the structured-output chain returns for the schema.
        """
        model = Agent(model_name).get_model()
        template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(system_prompt),
                MessagesPlaceholder(variable_name="plan"),
                MessagesPlaceholder(variable_name="context", optional=True),
            ]
        )
        chain = template | model.with_structured_output(schema)
        plan_message = HumanMessage(
            f"This is the generated plan:\n{self.plan_string}\n\n{request}"
        )
        return chain.invoke({"plan": [plan_message]})

    def get_tool_list(self):
        """Return the ``tools`` entry parsed from the plan (one tool per step)."""
        system_prompt = (
            "You are a planner parser. You will be given a plan consisting of a series of steps, and you need to give me a list consisting of what tool is being used for each step.\n"
            "So if there are 7 steps, I need a list of length 7, where each value is the tool that is being used for the corresponding step.\n"
            "For example:\n"
            '1. Find out the temperature right now. #E1 = Google["What is the temperature near me?"]\n'
            '2. Check the weather forecast for tomorrow. #E2: WeatherAPI["What will be the weather like tomorrow?"]\n'
            '3. Get the top 5 news articles related to the topic of climate change. !3 = NewsAPI["What are the top 5 news articles about climate change?"]\n'
            "If this is the given plan, the output should be: ['Google', 'WeatherAPI', 'NewsAPI'].\n"
            "Be very careful that you don't miss any step or tool. The number of steps and tools should be the same. Check your output thoroughly. Write only one tool on each line."
        )
        parsed = self._run_parser(
            self.config.tool_list_model,
            system_prompt,
            self.tool_list_schema,
            "Now parse this content and give me the list of tools for each step.",
        )
        return parsed["tools"]

    def get_argument_list(self):
        """Return the ``arguments`` entry parsed from the plan (one argument per step)."""
        system_prompt = (
            "You are a planner parser. You take in a plan consisting of a series of steps, and you need to give me a list consisting of what argument is being used for each tool call in each step.\n"
            "So if there are 7 steps, I need a list of length 7, where each value is the argument that is being called inside the tool for each step. A step is defined as #Ex where x is the step number. An argument will always be of the format #Ex = tool_name['argument']\n"
            "For example:\n"
            '1. Find out the temperature right now. #E1 = Google["What is the temperature near me?"]\n'
            '2. Check the weather forecast for tomorrow. #E2 : WeatherAPI["What will be the weather like tomorrow?"]\n'
            '3. Get the top 5 news articles related to the topic of climate change. !3 = NewsAPI["What are the top 5 news articles about climate change?"]\n'
            "If this is the given plan, the output should be: ['What is the temperature near me?', 'What will be the weather like tomorrow?', 'What are the top 5 news articles about climate change?']\n"
            "Be very careful that you don't miss any step or argument. The number of steps and arguments should be the same. Check your output thoroughly."
        )
        parsed = self._run_parser(
            self.config.argument_list_model,
            system_prompt,
            self.argument_list_schema,
            "Give me the list of arguments of each tool call for each step. For each step I need to know what is the argument that is being passed inside the tool. If I have 10 steps, I need a list of length 10, consisting of the query or argument for each tool call in each step.",
        )
        return parsed["arguments"]

    def get_dependency_list(self):
        """Return the ``dependencies`` entry parsed from the plan."""
        system_prompt = (
            "You are a planner parser. You get a plan consisting of a series of steps, and you need to give me a dictionary consisting of what step results each step argument is dependent upon. If there are 7 steps and 5 steps require the results of previous steps, I need a dictionary containing those 5 keys, where each value is a list of step numbers that the key step is dependent upon. For example:\n"
            '1. Find out the temperature right now. #E1 = Location["What is my current location?"]\n'
            '2. Check the weather forecast for tomorrow. #E2 : WeatherAPI["What will be the weather like tomorrow at #E1?"]\n'
            '3. Get the top 5 news articles related to the topic of climate change. !3 = LLM["What are major tourist things to do based on the information of #E1 and #E2?"]\n'
            "If this is the given plan, the output should be:\n"
            "{'2': [1], '3': [1, 2]}\n"
            "This is because step 2 is dependent on the value of #E1, and step 3 is dependent on the value of #E2 and #E1.\n"
            "A step cannot depend on itself. A step cannot depend on any step that comes after it. A dependency only exists if a step's content has 'from #Ex' where x is the step number.\n"
            "For example:\n"
            '#E8 = LLM["What were the main results and findings from their experiments? Explain the results in simple terms without technical jargon"]\n'
            '#E9 = Arxiv["UNLI limitations natural language inference"]\n'
            "Under no circumstances can the output be: {'8': [9]}\n"
            "This is because 8 comes before 9.\n\n"
            "Be very careful that you don't miss any step or dependency. The number of steps and dependencies should be the same. Check your output thoroughly."
        )
        parsed = self._run_parser(
            self.config.dependency_list_model,
            system_prompt,
            self.dependency_list_schema,
            "Parse this content and give me the dictionary of dependencies. Look at each step and see on what steps each argument of the step is dependent upon, and give me the values in the form of a key value pair. Key being the step number and value being the list of step numbers that the key is dependent upon.",
        )
        return parsed["dependencies"]
|
src/research_assistant/components/solver.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional
|
2 |
+
|
3 |
+
from langchain_anthropic.chat_models import ChatAnthropic
|
4 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
5 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
6 |
+
from langchain_core.runnables import RunnableConfig
|
7 |
+
from langchain_core.tools import StructuredTool
|
8 |
+
from pydantic import BaseModel, Field
|
9 |
+
|
10 |
+
from research_assistant.app_logging import app_logger
|
11 |
+
from research_assistant.constants import HEILMEIER_CATECHISM
|
12 |
+
|
13 |
+
|
14 |
+
class SolverResponse(BaseModel):
    """Result returned by the solver tool.

    The solver wraps the text produced by its model in this object; callers
    read the text from ``answer``.
    """

    # Required field (``...`` means there is no default value).
    answer: str = Field(
        ...,
        description="The summary of the research article ",
    )
|
21 |
+
|
22 |
+
|
23 |
+
def get_solver(llm: ChatAnthropic):
    """Build the ``solver`` tool around the given model.

    Args:
        llm: Chat model that is piped after the solver prompt.

    Returns:
        A ``StructuredTool`` named ``"solver"`` whose function returns a
        ``SolverResponse``.
    """
    system_text = (
        "You are a research assistant responsible for simplifying and explaining the core concepts of a research article to a user who may not be familiar with technical terms. You will be given a plan created by a planner tool, which breaks down the main ideas of the research article through a series of questions. Each question is paired with an answer, providing insights into the purpose, methodology, and key findings of the article. Your task is to synthesize these questions and answers to produce a clear, concise summary that captures the main message of the research article. The summary should enable the user to understand the article's significance, its contributions, and whether it contains information relevant to their needs or goals. Your summary should be:\n"
        "1. Simple and accessible, avoiding technical jargon.\n"
        "2. Comprehensive enough to convey the article's goals and key insights.\n"
        "3. Informative so that the user can decide if reading the full article is worth their time.\n"
        "Please proceed by summarizing based on these questions and answers. Make sure to respond in the markdown format."
    )
    messages = [
        SystemMessage(system_text),
        MessagesPlaceholder(variable_name="context", optional=True),
        MessagesPlaceholder(variable_name="text"),
    ]
    solver = ChatPromptTemplate.from_messages(messages) | llm

    # Tool function: sends the plan plus its evidence to the model and wraps
    # the reply in a SolverResponse.
    def get_joined_answer(
        input: str,
        _context: Optional[List[str]] = None,  # TODO: rename when context is used
        _config: Optional[RunnableConfig] = None,  # TODO: rename when config is used
    ):
        request_text = (
            f"Solve the following task or question. To solve the question, we have made a step-by-step plan and retrieved corresponding evidence for each plan. Use them with caution since long evidence might contain irrelevant information. Here's the plan with the evidence:\n{input}\n\n"
            f"Now solve the question or task according to the provided evidence above. Respond with:\n{HEILMEIER_CATECHISM}\n\n"
            "Since you are a research assistant, you need to be as detailed as possible to help me understand your breakdown of the research document. Assume I have no prior knowledge of the document’s content. Make it clear, comprehensive, and easy to understand. Avoid complex language and technical jargon. The more in-depth the explanation, the better\n"
            "The output should be answering all the Heilmeier catechism questions using the obtained evidence information. Answer all the questions.\n"
        )
        response = solver.invoke({"text": [HumanMessage(request_text)]})
        if isinstance(response.content, str):
            return SolverResponse(answer=response.content)
        # Non-string content is logged and then stringified.
        app_logger.info(
            "The response from solver is not a string. It is %s",
            type(response.content),
        )
        response.content = str(response.content)
        return SolverResponse(answer=response.content)

    tool_description = "This is a tool which takes in a list of questions and the answers provide to those questions, and generates a summary by using all of this information."
    return StructuredTool.from_function(
        name="solver",
        func=get_joined_answer,
        description=tool_description,
    )
|
src/research_assistant/components/state.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
from typing_extensions import TypedDict
|
4 |
+
|
5 |
+
|
6 |
+
class ResearchSummary(TypedDict):
    """State dictionary passed between the nodes of the summarization graph."""

    # Text of the article being summarized.
    article_text: str
    # Plan text produced by the planner.
    plan_string: str
    # Maps a step to the list of steps whose results it needs.
    dependencies: dict
    # Tool name for each plan step.
    tools: List[str]
    # Tool-call argument for each plan step.
    arguments: List[str]
    # Results of the executed steps, keyed by step number.
    results: dict
    # Final answer produced by the solve node.
    result: str
|
src/research_assistant/config/__init__.py
ADDED
File without changes
|
src/research_assistant/config/configuration.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from research_assistant.constants import CONFIG_FILE_PATH
|
2 |
+
from research_assistant.entity import (
|
3 |
+
ArticleSearchConfig,
|
4 |
+
PlannerConfig,
|
5 |
+
PlannerParserConfig,
|
6 |
+
QaToolConfig,
|
7 |
+
SolverConfig,
|
8 |
+
SubmittedDateConfig,
|
9 |
+
articleLoaderConfig,
|
10 |
+
)
|
11 |
+
from research_assistant.utils.common import read_yaml
|
12 |
+
|
13 |
+
|
14 |
+
class ConfigurationManager:
    """Reads the YAML configuration and hands out typed config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the configuration file found at ``config_filepath``."""
        self.config = read_yaml(config_filepath)

    def get_article_details_config(self) -> articleLoaderConfig:
        """Build the article settings from the ``article_details`` section."""
        section = self.config.article_details
        return articleLoaderConfig(
            file_path=section.file_path,
            summary_save_dir=section.summary_save_dir,
        )

    def get_article_search_params(self) -> ArticleSearchConfig:
        """Build the search settings from the ``article_search_details`` section."""
        section = self.config.article_search_details
        return ArticleSearchConfig(
            search_terms=section.search_terms,
            num_results=section.num_results,
            date_range=SubmittedDateConfig(
                start_date=section.date_range.start_date,
                end_date=section.date_range.end_date,
            ),
            sort_by=section.sort_by,
            sort_order=section.sort_order,
        )

    def get_planner_config(self) -> PlannerConfig:
        """Return the model settings from the ``planner`` section."""
        return PlannerConfig(model_name=self.config.planner.model_name)

    def get_qa_tool_config(self) -> QaToolConfig:
        """Return the model settings from the ``qa_tool`` section."""
        return QaToolConfig(model_name=self.config.qa_tool.model_name)

    def get_solver_config(self) -> SolverConfig:
        """Return the model settings from the ``solver`` section."""
        return SolverConfig(model_name=self.config.solver.model_name)

    def get_planner_parser_config(self) -> PlannerParserConfig:
        """Return the three model names from the ``planner_parser`` section."""
        section = self.config.planner_parser
        return PlannerParserConfig(
            tool_list_model=section.tool_list_model,
            argument_list_model=section.argument_list_model,
            dependency_list_model=section.dependency_list_model,
        )
|
src/research_assistant/constants/__init__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
# Relative path of the YAML configuration file.
CONFIG_FILE_PATH = Path("config/config.yaml")
# Numbered list of questions that the solver prompt asks the model to answer.
HEILMEIER_CATECHISM = (
    "1. What are they trying to do? Articulate the objectives using absolutely no jargon.\n"
    "2. How was it done before this article, and what are the limitations of those practices?\n"
    "3. What is new in their approach, and why do they think it will be successful?\n"
    "4. Who cares? If they are successful, what difference will it make?\n"
    "5. What experiment do they design to show their approach works? What dataset or question set did they use? What LLMs or other AI systems did they work? How did they measure effectiveness?\n"
    "6. What were the results? What do they show? Again, articulate this using absolutely no jargon.\n"
)
# URL of the arXiv API query endpoint.
ARXIV_API_ACCESS_POINT = "http://export.arxiv.org/api/query"
|
src/research_assistant/entity/__init__.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import List
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass(frozen=True)
class articleLoaderConfig:
    """Immutable settings describing the article to summarize and where to save the summary."""

    # Path of the article file that is summarized.
    file_path: Path
    # Directory in which the generated summary file is written.
    summary_save_dir: Path
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass(frozen=True)
class SubmittedDateConfig:
    """Immutable start/end bounds of the date range used by the article search settings."""

    # Both bounds are kept as strings; the expected date format is not visible here - TODO confirm.
    start_date: str
    end_date: str
|
16 |
+
|
17 |
+
|
18 |
+
@dataclass(frozen=True)
class ArticleSearchConfig:
    """Immutable settings for the article search pipeline."""

    # Terms to search for.
    search_terms: List[str]
    # Number of results; presumably an upper bound on returned articles - TODO confirm.
    num_results: int
    # Date range restricting the search.
    date_range: SubmittedDateConfig
    # Sort criterion and direction; accepted values are not visible here - TODO confirm.
    sort_by: str
    sort_order: str
|
25 |
+
|
26 |
+
|
27 |
+
@dataclass(frozen=True)
class PlannerConfig:
    """Immutable settings for the planner component."""

    # Name of the model used by the planner.
    model_name: str
|
30 |
+
|
31 |
+
|
32 |
+
@dataclass(frozen=True)
class QaToolConfig:
    """Immutable settings for the question-answering tool."""

    # Name of the model used by the QA tool.
    model_name: str
|
35 |
+
|
36 |
+
|
37 |
+
@dataclass(frozen=True)
class SolverConfig:
    """Immutable settings for the solver component."""

    # Name of the model used by the solver.
    model_name: str
|
40 |
+
|
41 |
+
|
42 |
+
@dataclass(frozen=True)
class PlannerParserConfig:
    """Immutable settings naming the model used for each plan-parsing task."""

    # Model that extracts the per-step tool list.
    tool_list_model: str
    # Model that extracts the per-step argument list.
    argument_list_model: str
    # Model that extracts the step dependency mapping.
    dependency_list_model: str
|
src/research_assistant/main.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
|
6 |
+
from research_assistant.app_logging import app_logger
|
7 |
+
from research_assistant.config.configuration import ConfigurationManager
|
8 |
+
from research_assistant.pipeline.articleSearch import ArticleSearchPipeline
|
9 |
+
from research_assistant.pipeline.articleSummarization import ArticleSummarization
|
10 |
+
from research_assistant.utils.common import write_summary_to_file
|
11 |
+
|
12 |
+
|
13 |
+
def article_summarization():
    """Summarize the configured article and write the summary to a file."""
    app_logger.info("Starting the Summarization Pipeline")
    # Load environment variables from the local .env file before any model is created.
    load_dotenv(Path(".env"))

    loader_config = ConfigurationManager().get_article_details_config()
    source_path = loader_config.file_path
    app_logger.info(f"Processing file: {source_path}")

    summarizer = ArticleSummarization(source_path)
    summary_text = summarizer.get_summary()
    app_logger.info(f"Summary: {summary_text}")
    app_logger.info("Completed Summarizing the article")

    write_summary_to_file(loader_config, summary_text)
    app_logger.info("Summarization Pipeline completed successfully")
|
24 |
+
|
25 |
+
|
26 |
+
def article_search():
    """Run the article search with the configured parameters and log the hit count."""
    app_logger.info("Starting the article search pipeline")
    search_params = ConfigurationManager().get_article_search_params()
    found_articles = ArticleSearchPipeline(search_params).get_article_list()
    hit_count = len(found_articles)
    app_logger.info(
        f"Completed searching for articles. We found a total of {hit_count} articles"
    )
|
34 |
+
|
35 |
+
|
36 |
+
def main():
    """Parse the command line and run the requested pipeline.

    Summarization takes precedence when both flags are given; with no flag
    only a hint is logged.
    """
    cli = argparse.ArgumentParser(description="Research Assistant CLI")
    flag_help = (
        ("--summarize_article", "Runs the article summmarization pipeline"),
        ("--search_articles", "Run the article search pipeline"),
    )
    for flag, help_text in flag_help:
        cli.add_argument(flag, action="store_true", help=help_text)
    options = cli.parse_args()

    if options.summarize_article:
        article_summarization()
        return
    if options.search_articles:
        article_search()
        return
    app_logger.info("No valid arguments provided. Use --help for options.")
|
55 |
+
|
56 |
+
|
57 |
+
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
src/research_assistant/pipeline/__init__.py
ADDED
File without changes
|
src/research_assistant/pipeline/articleSearch.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from research_assistant.app_logging import app_logger
|
2 |
+
from research_assistant.components.arxiv_search_api import ArxivApiWrap
|
3 |
+
from research_assistant.entity import ArticleSearchConfig
|
4 |
+
|
5 |
+
|
6 |
+
class ArticleSearchPipeline:
    """Thin pipeline that runs an arXiv search for a given configuration."""

    def __init__(self, config: ArticleSearchConfig):
        """Keep the search configuration for later use."""
        self.config = config

    def get_article_list(self):
        """Return the search result, logging a note when it is empty."""
        found = ArxivApiWrap(self.config).get_article_search_result()
        if found:
            return found
        app_logger.info("No articles were found for the given parameters.")
        return found
|
src/research_assistant/pipeline/articleSummarization.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_core.messages import HumanMessage
|
2 |
+
from langgraph.graph import END, START, StateGraph
|
3 |
+
|
4 |
+
from research_assistant.app_logging import app_logger
|
5 |
+
from research_assistant.components.agent import Agent
|
6 |
+
from research_assistant.components.agent_tools import get_arxiv_tool, get_qa_tool
|
7 |
+
from research_assistant.components.pdfParser import pdf_parser
|
8 |
+
from research_assistant.components.planner import get_planner
|
9 |
+
from research_assistant.components.solver import get_solver
|
10 |
+
from research_assistant.components.state import ResearchSummary
|
11 |
+
from research_assistant.config.configuration import ConfigurationManager
|
12 |
+
from research_assistant.utils.state_utils import SummaryStateUtils
|
13 |
+
|
14 |
+
|
15 |
+
class ArticleSummarization:
    """Summarize one article with a plan -> tool -> solve graph.

    The plan node produces a step-by-step plan, the tool node executes one
    plan step per visit, and the solve node turns the collected results into
    the final answer.
    """

    def __init__(self, file_path):
        """Remember the article path and create the configuration and state helpers.

        Args:
            file_path: Location of the article; handed to ``pdf_parser``.
        """
        self.article_path = file_path
        self.config = ConfigurationManager()
        self.summary_utils = SummaryStateUtils()

    def get_model(self, component: str):
        """Return the model configured for a workflow component.

        Args:
            component: ``"planner"``, ``"qa_tool"`` or ``"solver"``.

        Raises:
            ValueError: If ``component`` is none of the names above.
        """
        if component == "planner":
            config = self.config.get_planner_config()
        elif component == "qa_tool":
            config = self.config.get_qa_tool_config()
        elif component == "solver":
            config = self.config.get_solver_config()
        else:
            raise ValueError("Invalid component name for getting the Model")
        agent = Agent(config.model_name)
        return agent.get_model()

    def get_plan(self, state: ResearchSummary):
        """Plan node: generate the plan for the article in ``state``.

        Returns:
            State update with ``plan_string``, ``dependencies``,
            ``arguments`` and ``tools``.

        Raises:
            ValueError: If the parsed plan has a different number of tools
                and arguments.
        """
        response = get_planner(llm=self.get_model("planner")).invoke(
            {"article_text": state["article_text"]}
        )
        if len(response.tools) != len(response.arguments):
            raise ValueError("The Plan string is not parsed properly")
        app_logger.info(f"The plan produced is: {response.plan_str}")
        return {
            "plan_string": response.plan_str,
            "dependencies": response.dependencies,
            "arguments": response.arguments,
            "tools": response.tools,
        }

    def tool_execution(self, state: ResearchSummary):
        """Tool node: execute the next pending plan step.

        Returns:
            State update whose ``results`` mapping additionally holds the
            stringified result of the executed step under its step number.

        Raises:
            ValueError: If the step names a tool other than ``"Arxiv"`` or
                ``"LLM"``.
        """
        current_step = self.summary_utils.get_current_task(state)
        arg, tools = state["arguments"], state["tools"]
        # Work on a copy so the incoming state is not mutated in place.
        results_dict = dict(state.get("results") or {})
        # Steps are 1-based, the lists are 0-based.
        tool_name = tools[current_step - 1]
        if tool_name == "Arxiv":
            result = get_arxiv_tool().run(arg[current_step - 1])
        elif tool_name == "LLM":
            result = get_qa_tool(llm=self.get_model("qa_tool")).invoke(
                {
                    "question": arg[current_step - 1],
                    "context": self.summary_utils.get_current_dependencies(
                        state, current_step
                    ),
                }
            )
        else:
            raise ValueError(
                f"Unsupported tool {tool_name!r} in step {current_step}; "
                "expected 'Arxiv' or 'LLM'"
            )
        # Store the result in the results dictionary with the step number as key.
        results_dict[current_step] = str(result)
        return {"results": results_dict}

    def solve(self, state: ResearchSummary):
        """Solve node: produce the final answer from the collected step results."""
        return {
            "result": get_solver(llm=self.get_model("solver"))
            .invoke(self.summary_utils.get_plan_results(state))
            .answer
        }

    def get_graph(self):
        """Build and compile the execution graph of the summarization workflow.

        The graph runs ``plan`` first, then ``tool``; after every ``tool``
        visit ``summary_utils.route`` picks the next node, and ``solve``
        leads to the end.
        """
        graph = StateGraph(ResearchSummary)
        graph.add_node("plan", self.get_plan)
        graph.add_node("tool", self.tool_execution)
        graph.add_node("solve", self.solve)
        graph.add_edge("plan", "tool")
        graph.add_edge("solve", END)
        graph.add_conditional_edges("tool", self.summary_utils.route)
        graph.add_edge(START, "plan")
        return graph.compile()

    def get_summary(self):
        """Run the workflow on the article and return the final summary.

        Raises:
            RuntimeError: If the graph stream yields no output at all.
        """
        app = self.get_graph()
        final_output = None
        # Only the last streamed update is needed; the result is read from
        # its "solve" entry.
        for final_output in app.stream({"article_text": pdf_parser(self.article_path)}):
            pass
        if final_output is None:
            raise RuntimeError("The summarization workflow did not produce any output")
        return final_output["solve"]["result"]
|
src/research_assistant/utils/__init__.py
ADDED
File without changes
|
src/research_assistant/utils/common.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
import yaml
|
4 |
+
from box import ConfigBox
|
5 |
+
from box.exceptions import BoxValueError
|
6 |
+
|
7 |
+
from research_assistant.app_logging import app_logger
|
8 |
+
from research_assistant.entity import articleLoaderConfig
|
9 |
+
|
10 |
+
|
11 |
+
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its content as a ConfigBox.

    Args:
        path_to_yaml: Path of the YAML file to read.

    Raises:
        ValueError: If the parsed content cannot be wrapped in a ConfigBox
            (``BoxValueError``); this is reported as an empty file.

    Returns:
        The parsed content wrapped in a ConfigBox.
    """
    try:
        # Explicit encoding so the result does not depend on the locale.
        with open(path_to_yaml, encoding="utf-8") as yaml_file:
            content = ConfigBox(yaml.safe_load(yaml_file))
    except BoxValueError as e:
        raise ValueError("yaml file is empty") from e
    # Report success only after the file has actually been parsed.
    app_logger.info(f"yaml file: {path_to_yaml} loaded successfully")
    return content
|
27 |
+
|
28 |
+
|
29 |
+
def create_directories(path_to_directories: list, verbose=True):
    """Ensure that every directory in the given list exists.

    Missing parent directories are created as needed and directories that
    already exist are left untouched.

    Args:
        path_to_directories (list): directory paths to create.
        verbose (bool, optional): log each directory after creating it.
            Defaults to True.
    """
    for entry in path_to_directories:
        target = Path(entry)
        target.mkdir(parents=True, exist_ok=True)
        if not verbose:
            continue
        # Log the path exactly as the caller supplied it.
        app_logger.info(f"created directory at: {entry}")
|
39 |
+
|
40 |
+
|
41 |
+
def write_to_file(filename, text):
    """Write text to a file, replacing any existing content.

    Args:
        filename (str | Path): path of the file to write.
        text (str): text to write.
    """
    # The text may contain non-ASCII characters; an explicit UTF-8 encoding
    # keeps the write from failing where the locale default is narrower.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(text)
    app_logger.info(f"wrote text to file: {filename}")
|
50 |
+
|
51 |
+
|
52 |
+
def write_summary_to_file(config: articleLoaderConfig, text: str):
    """Save a summary as ``summary_<article stem>.md`` in the summary directory.

    Args:
        config (articleLoaderConfig): supplies ``summary_save_dir`` (created
            if missing) and ``file_path`` (its stem names the output file).
        text (str): summary text to write.
    """
    save_dir = config.summary_save_dir
    create_directories([save_dir])
    summary_name = f"summary_{Path(config.file_path).stem}.md"
    write_to_file(Path(save_dir) / summary_name, text)
|
src/research_assistant/utils/state_utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from research_assistant.components.state import ResearchSummary
|
2 |
+
|
3 |
+
|
4 |
+
class SummaryStateUtils:
    """Helpers for reading progress and routing from a ResearchSummary state."""

    def get_current_task(self, state: ResearchSummary):
        """Return the number of the next step to run, or None when all are done.

        Steps are counted from 1. The next step is one past the number of
        stored results; None is returned once there are as many results as
        plan arguments.
        """
        completed = state.get("results")
        if not completed:
            return 1
        if len(completed) == len(state["arguments"]):
            return None
        return len(completed) + 1

    def get_current_dependencies(self, state: ResearchSummary, step: int):
        """Return the inputs for a step.

        These are the stored results of the steps it depends on ("" for any
        that are missing), or the article text alone when the step has no
        dependencies.
        """
        dependency_outputs = []
        for dependency in state["dependencies"].get(step, []):
            dependency_outputs.append(state["results"].get(dependency, ""))
        if dependency_outputs:
            return dependency_outputs
        return [state["article_text"]]

    def route(self, state: ResearchSummary):
        """Return "tool" while steps remain to run, otherwise "solve"."""
        next_task = self.get_current_task(state)
        return "tool" if next_task is not None else "solve"

    def get_plan_results(self, state: ResearchSummary) -> str:
        """Render every plan entry with its answer, one pair per entry.

        Answers are looked up by 1-based position; entries without a stored
        result get an empty answer.
        """
        answers = state.get("results", {})
        return "\n".join(
            f"Plan: {plan}\n Answer = {answers.get(number, '')}"
            for number, plan in enumerate(state["arguments"], start=1)
        )
|
src/research_assistant/web/app.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from fpdf import FPDF
|
3 |
+
from mistletoe import markdown
|
4 |
+
|
5 |
+
from research_assistant.app_logging import app_logger
|
6 |
+
from research_assistant.pipeline.articleSummarization import ArticleSummarization
|
7 |
+
|
8 |
+
|
9 |
+
def process_file(file):
    """Summarize the uploaded article and count the words in the summary.

    Args:
        file: the uploaded file, passed as-is to ArticleSummarization.

    Returns:
        tuple: ``(summary, word_count)``. On failure the summary is an
        "An error occurred: ..." message and ``word_count`` is 0.
    """
    try:
        app_logger.info(f"Processing file: {file}")
        summary_pipeline = ArticleSummarization(file)
        summary = summary_pipeline.get_summary()
        word_count = len(summary.split())
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing,
        # and log it so the error is not visible only in the browser.
        app_logger.error(f"Error processing file: {e}")
        summary = f"An error occurred: {e}"
        word_count = 0
    return summary, word_count
|
19 |
+
|
20 |
+
|
21 |
+
def generate_pdf(summary):
    """Render the markdown summary into ``summary.pdf`` and return that path.

    The summary is converted from markdown to HTML and written into a
    one-page-start PDF document. If that rendering fails, the error is logged
    and a short error notice is written into the PDF instead.

    Args:
        summary: markdown text to render.

    Returns:
        str: the path of the written PDF file.
    """
    output_path = "summary.pdf"

    document = FPDF()
    document.add_page()
    document.set_auto_page_break(auto=True, margin=15)
    document.set_font("Helvetica", size=12)

    try:
        document.write_html(markdown(summary))
    except Exception as e:
        app_logger.error(f"Error generating PDF: {e}")
        document.write(5, "Error generating PDF content.")

    document.output(name=output_path)
    return output_path
|
36 |
+
|
37 |
+
|
38 |
+
def process_and_generate_pdf(file):
    """Summarize the uploaded file and render the summary to a PDF.

    Args:
        file: the uploaded file to summarize.

    Returns:
        tuple: ``(summary, wordcount, pdf_output_path)``.
    """
    summary, wordcount = process_file(file)
    return summary, wordcount, generate_pdf(summary)
|
42 |
+
|
43 |
+
|
44 |
+
# Gradio UI: a single file upload goes in; the summary text, its word count and
# a downloadable PDF of the summary come out.
iface = gr.Interface(
    fn=process_and_generate_pdf,
    # type="filepath": per Gradio's File component docs, the callback receives
    # the upload as a path on disk.
    inputs=gr.File(label="Upload PDF", type="filepath"),
    # Order matches the (summary, wordcount, pdf_output_path) tuple returned by fn.
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Number(label="Word Count"),
        gr.File(label="Download PDF"),
    ],
    title="Research Assistant Summarizer",
    description="Summarize your research paper.",
    theme=gr.themes.Default(),
)

# Start the web server only when this module is run as a script, not on import.
if __name__ == "__main__":
    # share=True asks Gradio for a public share link as well as the local server.
    iface.launch(share=True)
|
tests.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
def test_dummy():
    """Placeholder test that always passes."""
    return None
|