Hasan Iqbal
committed on
Commit
•
cbfd993
1
Parent(s):
cff683c
Added LLM Response Evaluation frontend
Browse files- error_output.txt +10 -0
- requirements.txt +171 -173
- scripts/app.sh +1 -1
- src/openfactcheck/app/app.py +36 -2
- src/openfactcheck/app/evaluate_response.py +154 -0
- src/openfactcheck/app/utils.py +35 -0
- src/openfactcheck/core/base.py +183 -34
- src/openfactcheck/solvers/rarr_solvers/rarr_agreement_gate.py +1 -1
- src/openfactcheck/solvers/rarr_solvers/rarr_editor.py +2 -2
- src/openfactcheck/solvers/rarr_solvers/rarr_llm_retriever.py +1 -1
- src/openfactcheck/solvers/rarr_solvers/rarr_question_generator.py +1 -1
- src/openfactcheck/solvers/tutorial_solvers/utils/api.py +1 -1
- src/openfactcheck/solvers/webservice/rarr_rtv.py +1 -1
- src/openfactcheck/solvers/webservice/rarr_vfr.py +1 -1
error_output.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
```python
|
2 |
+
["This is a sample LLM response."]
|
3 |
+
```
|
4 |
+
```python
|
5 |
+
["{'text': 'This is a sample LLM response.'}"]
|
6 |
+
```
|
7 |
+
def process(input: str):
    """
    Split a (possibly dict-formatted) response string into sentences.

    The wrapper produced by stringifying ``{'text': '...'}`` is stripped, the
    remaining text is split on ". " and every sentence is returned with
    terminal punctuation.

    Args:
        input: Raw text, optionally wrapped as ``{'text': '...'}``. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        A list of sentences; empty when the input holds no text.
    """
    text = input.replace("{'text': '", "").replace("'}", "")

    processed_sentences = []
    for fragment in text.split(". "):
        fragment = fragment.strip()
        if not fragment:
            continue
        # Only the separators consumed by split() lost their period; the last
        # sentence usually still carries its own punctuation.
        if not fragment.endswith((".", "!", "?")):
            fragment += "."
        processed_sentences.append(fragment)
    return processed_sentences
|
requirements.txt
CHANGED
@@ -1,174 +1,172 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
dnspython
|
34 |
-
docutils
|
35 |
-
email_validator
|
36 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
fonttools
|
46 |
-
free-proxy
|
47 |
-
frozenlist
|
48 |
-
fsspec
|
49 |
-
gitdb
|
50 |
-
GitPython
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
pyarrow
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
python-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
sphinxcontrib-
|
132 |
-
sphinxcontrib-
|
133 |
-
sphinxcontrib-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
streamlit
|
141 |
-
streamlit-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
|
|
|
1 |
+
aiohttp==3.8.4
|
2 |
+
aiosignal==1.3.1
|
3 |
+
alabaster==0.7.16
|
4 |
+
altair==5.3.0
|
5 |
+
annotated-types==0.7.0
|
6 |
+
anyio==4.4.0
|
7 |
+
arrow==1.3.0
|
8 |
+
async-timeout==4.0.3
|
9 |
+
asyncio==3.4.3
|
10 |
+
attrs==23.2.0
|
11 |
+
Babel==2.15.0
|
12 |
+
backoff==2.2.1
|
13 |
+
beautifulsoup4==4.12.3
|
14 |
+
bibtexparser==1.4.1
|
15 |
+
blinker==1.8.2
|
16 |
+
blis==0.7.11
|
17 |
+
bs4==0.0.2
|
18 |
+
cachetools==5.4.0
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.7.4
|
21 |
+
charset-normalizer==3.3.2
|
22 |
+
click==8.1.7
|
23 |
+
cloudpathlib==0.18.1
|
24 |
+
confection==0.1.5
|
25 |
+
contourpy==1.2.1
|
26 |
+
cycler==0.12.1
|
27 |
+
cymem==2.0.8
|
28 |
+
datasets==2.20.0
|
29 |
+
Deprecated==1.2.14
|
30 |
+
dill==0.3.8
|
31 |
+
distro==1.9.0
|
32 |
+
docutils==0.20.1
|
|
|
|
|
|
|
33 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
34 |
+
entrypoints==0.4
|
35 |
+
evaluate==0.4.2
|
36 |
+
factool==0.1.3
|
37 |
+
fake-useragent==1.5.1
|
38 |
+
Faker==26.0.0
|
39 |
+
fastapi==0.96.0
|
40 |
+
favicon==0.7.0
|
41 |
+
filelock==3.15.4
|
42 |
+
fonttools==4.53.1
|
43 |
+
free-proxy==1.1.1
|
44 |
+
frozenlist==1.4.1
|
45 |
+
fsspec==2024.5.0
|
46 |
+
gitdb==4.0.11
|
47 |
+
GitPython==3.1.43
|
48 |
+
h11==0.14.0
|
49 |
+
htbuilder==0.6.2
|
50 |
+
httpcore==1.0.5
|
51 |
+
httpx==0.27.0
|
52 |
+
huggingface-hub==0.24.2
|
53 |
+
idna==3.7
|
54 |
+
imagesize==1.4.1
|
55 |
+
Jinja2==3.1.4
|
56 |
+
joblib==1.4.2
|
57 |
+
jsonlines==4.0.0
|
58 |
+
jsonschema==4.23.0
|
59 |
+
jsonschema-specifications==2023.12.1
|
60 |
+
kiwisolver==1.4.5
|
61 |
+
langcodes==3.4.0
|
62 |
+
language_data==1.2.0
|
63 |
+
lxml==5.2.2
|
64 |
+
marisa-trie==1.2.0
|
65 |
+
Markdown==3.6
|
66 |
+
markdown-it-py==3.0.0
|
67 |
+
markdownlit==0.0.7
|
68 |
+
MarkupSafe==2.1.5
|
69 |
+
matplotlib==3.9.1
|
70 |
+
mdurl==0.1.2
|
71 |
+
more-itertools==10.3.0
|
72 |
+
mpmath==1.3.0
|
73 |
+
multidict==6.0.5
|
74 |
+
multiprocess==0.70.16
|
75 |
+
murmurhash==1.0.10
|
76 |
+
networkx==3.3
|
77 |
+
nltk==3.8.1
|
78 |
+
numpy==1.26.4
|
79 |
+
openai==1.37.0
|
80 |
+
outcome==1.3.0.post0
|
81 |
+
packaging==24.1
|
82 |
+
pandas==2.2.2
|
83 |
+
pillow==10.4.0
|
84 |
+
preshed==3.0.9
|
85 |
+
prometheus_client==0.20.0
|
86 |
+
protobuf==5.27.2
|
87 |
+
pyarrow==17.0.0
|
88 |
+
pyarrow-hotfix==0.6
|
89 |
+
pydantic==1.10.9
|
90 |
+
pydantic_core==2.20.1
|
91 |
+
pydeck==0.9.1
|
92 |
+
Pygments==2.18.0
|
93 |
+
pymdown-extensions==10.8.1
|
94 |
+
pyparsing==3.1.2
|
95 |
+
PySocks==1.7.1
|
96 |
+
python-dateutil==2.9.0.post0
|
97 |
+
python-dotenv==1.0.1
|
98 |
+
pytz==2024.1
|
99 |
+
PyYAML==6.0
|
100 |
+
referencing==0.35.1
|
101 |
+
regex==2024.7.24
|
102 |
+
requests==2.32.3
|
103 |
+
rich==13.7.1
|
104 |
+
rpds-py==0.19.1
|
105 |
+
safetensors==0.4.3
|
106 |
+
scholarly==1.7.11
|
107 |
+
scikit-learn==1.5.1
|
108 |
+
scipy==1.14.0
|
109 |
+
selenium==4.23.1
|
110 |
+
sentence-transformers==3.0.1
|
111 |
+
shellingham==1.5.4
|
112 |
+
six==1.16.0
|
113 |
+
smart-open==7.0.4
|
114 |
+
smmap==5.0.1
|
115 |
+
sniffio==1.3.1
|
116 |
+
snowballstemmer==2.2.0
|
117 |
+
sortedcontainers==2.4.0
|
118 |
+
soupsieve==2.5
|
119 |
+
spacy==3.7.5
|
120 |
+
spacy-legacy==3.0.12
|
121 |
+
spacy-loggers==1.0.5
|
122 |
+
Sphinx==7.4.7
|
123 |
+
sphinx-rtd-theme==2.0.0
|
124 |
+
sphinxcontrib-applehelp==1.0.8
|
125 |
+
sphinxcontrib-devhelp==1.0.6
|
126 |
+
sphinxcontrib-htmlhelp==2.0.6
|
127 |
+
sphinxcontrib-jquery==4.1
|
128 |
+
sphinxcontrib-jsmath==1.0.1
|
129 |
+
sphinxcontrib-qthelp==1.0.8
|
130 |
+
sphinxcontrib-serializinghtml==1.1.10
|
131 |
+
srsly==2.4.8
|
132 |
+
st-annotated-text==4.0.1
|
133 |
+
st-theme==1.2.3
|
134 |
+
starlette==0.27.0
|
135 |
+
streamlit==1.36.0
|
136 |
+
streamlit-camera-input-live==0.2.0
|
137 |
+
streamlit-card==1.0.2
|
138 |
+
streamlit-embedcode==0.1.2
|
139 |
+
streamlit-extras==0.4.3
|
140 |
+
streamlit-faker==0.0.3
|
141 |
+
streamlit-image-coordinates==0.1.9
|
142 |
+
streamlit-keyup==0.2.4
|
143 |
+
streamlit-option-menu==0.3.13
|
144 |
+
streamlit-toggle-switch==1.0.2
|
145 |
+
streamlit-vertical-slider==2.5.5
|
146 |
+
sympy==1.13.1
|
147 |
+
tenacity==8.5.0
|
148 |
+
thinc==8.2.5
|
149 |
+
threadpoolctl==3.5.0
|
150 |
+
tokenizers==0.19.1
|
151 |
+
toml==0.10.2
|
152 |
+
toolz==0.12.1
|
153 |
+
torch==2.4.0
|
154 |
+
tornado==6.4.1
|
155 |
+
tqdm==4.66.4
|
156 |
+
transformers==4.43.2
|
157 |
+
trio==0.26.0
|
158 |
+
trio-websocket==0.11.1
|
159 |
+
typer==0.12.3
|
160 |
+
types-python-dateutil==2.9.0.20240316
|
161 |
+
typing_extensions==4.12.2
|
162 |
+
tzdata==2024.1
|
163 |
+
urllib3==2.2.2
|
164 |
+
uvicorn==0.22.0
|
165 |
+
validators==0.33.0
|
166 |
+
wasabi==1.1.3
|
167 |
+
weasel==0.4.1
|
168 |
+
websocket-client==1.8.0
|
169 |
+
wrapt==1.16.0
|
170 |
+
wsproto==1.2.0
|
171 |
+
xxhash==3.4.1
|
172 |
+
yarl==1.9.4
|
scripts/app.sh
CHANGED
@@ -11,4 +11,4 @@ source "${BASH_SOURCE%/*}/common.sh"
|
|
11 |
|
12 |
# Executing Python script
|
13 |
export PYTHONPATH="$PYTHONPATH:src/"
|
14 |
-
|
|
|
11 |
|
12 |
# Executing Python script
|
13 |
export PYTHONPATH="$PYTHONPATH:src/"
|
14 |
+
streamlit run src/openfactcheck/app/app.py -- "$@"
|
src/openfactcheck/app/app.py
CHANGED
@@ -1,11 +1,31 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
from streamlit_option_menu import option_menu
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
class App:
|
5 |
def __init__(self):
|
6 |
pass
|
7 |
|
8 |
-
def run(self):
|
|
|
|
|
|
|
9 |
|
10 |
# Set up Dashboard
|
11 |
st.set_page_config(page_title="OpenFactCheck Dashboard",
|
@@ -24,6 +44,20 @@ class App:
|
|
24 |
orientation="horizontal"
|
25 |
)
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
if __name__ == "__main__":
|
|
|
|
|
28 |
app = App()
|
29 |
-
app.run()
|
|
|
1 |
+
import argparse
|
2 |
import streamlit as st
|
3 |
from streamlit_option_menu import option_menu
|
4 |
|
5 |
+
from openfactcheck.core.base import OpenFactCheck, OpenFactCheckConfig
|
6 |
+
from openfactcheck.app.evaluate_response import evaluate_response
|
7 |
+
|
8 |
+
def parse_args(argv=None):
    """
    Parse the command-line options of the OpenFactCheck dashboard.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to ``sys.argv[1:]``, so existing callers are
            unaffected.

    Returns:
        argparse.Namespace with a ``config_path`` attribute
        (default ``"config.json"``).
    """
    parser = argparse.ArgumentParser(description='Initialize OpenFactCheck with custom configuration.')

    parser.add_argument("--config-path",
                        type=str,
                        help="Config File Path",
                        default="config.json")

    return parser.parse_args(argv)
|
20 |
+
|
21 |
class App:
|
22 |
def __init__(self):
|
23 |
pass
|
24 |
|
25 |
+
def run(self, config_path: str = "config.json"):
|
26 |
+
# Initialize OpenFactCheck
|
27 |
+
config = OpenFactCheckConfig(config_path)
|
28 |
+
ofc = OpenFactCheck(config)
|
29 |
|
30 |
# Set up Dashboard
|
31 |
st.set_page_config(page_title="OpenFactCheck Dashboard",
|
|
|
44 |
orientation="horizontal"
|
45 |
)
|
46 |
|
47 |
+
# Load the selected page
|
48 |
+
if selected == "Evaluate LLM Response":
|
49 |
+
evaluate_response(ofc)
|
50 |
+
# elif selected == "Evaluate LLM":
|
51 |
+
# evaluate_llm()
|
52 |
+
# elif selected == "Evaluate FactChecker":
|
53 |
+
# evaluate_factchecker()
|
54 |
+
# elif selected == "Leaderboards":
|
55 |
+
# leaderboards()
|
56 |
+
# else:
|
57 |
+
# about()
|
58 |
+
|
59 |
if __name__ == "__main__":
|
60 |
+
args = parse_args()
|
61 |
+
|
62 |
app = App()
|
63 |
+
app.run(args.config_path)
|
src/openfactcheck/app/evaluate_response.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import time
|
3 |
+
import pandas as pd
|
4 |
+
import streamlit as st
|
5 |
+
|
6 |
+
from openfactcheck.core.base import OpenFactCheck
|
7 |
+
from openfactcheck.app.utils import style_metric_cards
|
8 |
+
|
9 |
+
# Create a function to check a LLM response
|
10 |
+
def evaluate_response(ofc: OpenFactCheck):
    """
    This function creates a Streamlit app to evaluate the factuality of a LLM response.

    The user picks one claim processor, one retriever and one verifier from
    the solvers listed by ``ofc``, the pipeline is run on the entered text in
    streaming mode, and each solver's result is rendered as it arrives.

    Parameters
    ----------
    ofc : OpenFactCheck
        Instance used to list the solvers, configure the pipeline
        (``init_pipeline_manually``) and evaluate the text (``ofc(...)``).
    """
    if 'response' not in st.session_state:
        st.session_state.response = None

    # Initialize the solvers
    claimprocessors = ofc.list_claimprocessors()
    retrievers = ofc.list_retrievers()
    verifiers = ofc.list_verifiers()

    st.write("This is where you can check factuality of a LLM response.")

    # Customize FactChecker
    st.write("Customize FactChecker")

    # Dropdown in three columns
    col1, col2, col3 = st.columns(3)
    with col1:
        claimprocessor = st.selectbox("Select Claim Processor", list(claimprocessors))
    with col2:
        retriever = st.selectbox("Select Retriever", list(retrievers))
    with col3:
        verifier = st.selectbox("Select Verifier", list(verifiers))

    # Input
    # NOTE(review): the text is wrapped in a dict although OpenFactCheck.__call__
    # annotates `response` as str; extract_text() below copes with the
    # stringified form -- confirm whether the plain string should be passed.
    input_text = {"text": st.text_area("Enter LLM response here", "This is a sample LLM response.")}

    def extract_text(claim):
        """
        Extracts text from a claim that might be a string formatted as a dictionary.
        """
        if not isinstance(claim, str):
            # re.search needs a string; a dict claim stringifies to the
            # "{'text': '...'}" form handled just below.
            claim = str(claim)
        match = re.search(r"'text': '([^']+)'", claim)
        if match:
            return match.group(1)
        return claim  # Return as is if no dictionary format detected

    def stream_words(text):
        """
        Yield the text word by word, sleeping briefly to simulate typing.
        """
        for word in text.split(" "):
            yield word + " "
            time.sleep(0.01)

    # Button to check factuality
    if st.button("Check Factuality"):
        with st.status("Checking factuality...", expanded=True) as status:
            # Configure the pipeline
            st.write("Configuring pipeline...")
            ofc.init_pipeline_manually([claimprocessor, retriever, verifier])
            st.write("Pipeline configured...")

            # Evaluate the response
            st.write("Evaluating response...")

            response = ofc(input_text, stream=True)
            st.write("Response evaluated...")

            status.update(label="Factuality checked...", state="complete", expanded=False)

        # Display pipeline configuration
        pipeline_str = "   ┈➤   ".join([claimprocessor, retriever, verifier])
        st.info(f"""**Pipeline**:    \n{pipeline_str}""")

        # Store the final response in the session state
        st.session_state.final_response = None

        col1, col2 = st.columns([3, 1])
        with col1:
            def process_stream(responses):
                """
                Process each response from the stream as a simulated chat output.
                This function yields each word from the formatted text of the response,
                adding a slight delay to simulate typing in a chat.
                """

                for response in responses:
                    if "claimprocessor" in response["solver_name"]:
                        # Extract response details
                        output_text = response["output"]

                        # Get the detected claims
                        detected_claims = output_text.get("claims", [])

                        # Generate formatted text with enumerated claims in Markdown format
                        formatted_text = "#### Detected Claims\n" + "\n".join(f"{i}. {extract_text(claim)}" for i, claim in enumerate(detected_claims, start=1)) + "\n"

                        with col2:
                            st.metric(label="Detected Claims", value=len(detected_claims))
                            style_metric_cards(background_color="#F0F0F0", border_color="#F0F0F0", border_radius_px=0)

                        yield from stream_words(formatted_text)

                        st.session_state.claimprocessor_flag = True

                    elif "retriever" in response["solver_name"]:
                        # Extract response details
                        output_text = response["output"]

                        # The second element of every evidence entry is what gets displayed
                        evidences = []
                        for claim_with_evidences in output_text.get("claims_with_evidences", {}).values():
                            for evidence in claim_with_evidences:
                                evidences.append(evidence[1])

                        # Generate formatted text with enumerated evidences in Markdown format
                        formatted_text = "#### Retrieved Evidences\n" + "\n".join(f"{i}. {evidence}" for i, evidence in enumerate(evidences, start=1))

                        with col2:
                            st.metric(label="Retrieved Evidences", value=len(evidences))
                            style_metric_cards(background_color="#F0F0F0", border_color="#F0F0F0", border_radius_px=0)

                        yield from stream_words(formatted_text)

                    elif "verifier" in response["solver_name"]:
                        # Store the final response in the session state. Nothing
                        # is streamed here: no verifier-specific text exists,
                        # and re-emitting the previous solver's text would
                        # print the evidence list twice (or fail when unset).
                        st.session_state.final_response = response["output"]

            st.write_stream(process_stream(response))

        # Process the final response
        final_response = st.session_state.final_response
        if final_response is not None:
            # Remember the result so the Reset button below becomes reachable
            st.session_state.response = final_response

            overall_factuality = final_response.get("label", "Unknown")
            with col2:
                # Explicit comparisons: the "Unknown" fallback is truthy, so a
                # plain truthiness test would mislabel it as factual.
                if overall_factuality == True:  # noqa: E712
                    st.metric(label="Overall Factuality", value="True")
                    style_metric_cards(background_color="#D4EDDA", border_color="#D4EDDA", border_radius_px=0, border_left_color="#28A745")
                elif overall_factuality == False:  # noqa: E712
                    st.metric(label="Overall Factuality", value="False")
                    style_metric_cards(background_color="#F8D7DA", border_color="#F8D7DA", border_radius_px=0, border_left_color="#DC3545")

    # Button to reset
    if st.session_state.response is not None:
        if st.button("Reset"):
            st.session_state.response = None
            st.rerun()
|
src/openfactcheck/app/utils.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def style_metric_cards(
    background_color: str = "#FFF",
    border_size_px: int = 1,
    border_color: str = "#CCC",
    border_radius_px: int = 5,
    border_left_color: str = "#9AD8E1",
) -> None:
    """
    Applies a custom style to st.metrics in the page

    The style is injected as a ``<style>`` element targeting the
    ``stMetric`` / ``metric-container`` test ids, so it affects every metric
    card matched by those selectors, not just the most recent one.

    Args:
        background_color (str, optional): Background color. Defaults to "#FFF".
        border_size_px (int, optional): Border size in pixels. Defaults to 1.
        border_color (str, optional): Border color. Defaults to "#CCC".
        border_radius_px (int, optional): Border radius in pixels. Defaults to 5.
        border_left_color (str, optional): Border left color. Defaults to "#9AD8E1".
    """
    st.markdown(
        f"""
        <style>
            div[data-testid="stMetric"],
            div[data-testid="metric-container"] {{
                background-color: {background_color};
                border: {border_size_px}px solid {border_color};
                padding: 5% 5% 5% 10%;
                border-radius: {border_radius_px}px;
                border-left: 0.5rem solid {border_left_color} !important;
            }}
        </style>
        """,
        unsafe_allow_html=True,
    )
|
src/openfactcheck/core/base.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
import os
|
2 |
import sys
|
|
|
3 |
import tqdm
|
4 |
-
import yaml
|
5 |
import json
|
6 |
import traceback
|
7 |
from pathlib import Path
|
|
|
8 |
|
9 |
from openfactcheck.lib.logger import logger
|
10 |
from openfactcheck.lib.config import OpenFactCheckConfig
|
@@ -12,6 +13,66 @@ from openfactcheck.core.solver import SOLVER_REGISTRY, Solver
|
|
12 |
from openfactcheck.core.state import FactCheckerState
|
13 |
|
14 |
class OpenFactCheck:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def __init__(self, config: OpenFactCheckConfig):
|
16 |
"""
|
17 |
Initialize OpenFactCheck with the given configuration.
|
@@ -61,6 +122,45 @@ class OpenFactCheck:
|
|
61 |
"""
|
62 |
return SOLVER_REGISTRY
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def init_solver(self, solver_name, args):
|
65 |
"""
|
66 |
Initialize a solver with the given configuration
|
@@ -130,36 +230,85 @@ class OpenFactCheck:
|
|
130 |
with open(os.path.join(self.output_path, f'{sample_name}.jsonl'), 'a', encoding="utf-8") as f:
|
131 |
f.write(json.dumps(result, ensure_ascii=False) + '\n')
|
132 |
|
133 |
-
def
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
logger.info(f"
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
+
import uuid
|
4 |
import tqdm
|
|
|
5 |
import json
|
6 |
import traceback
|
7 |
from pathlib import Path
|
8 |
+
from typing import Callable
|
9 |
|
10 |
from openfactcheck.lib.logger import logger
|
11 |
from openfactcheck.lib.config import OpenFactCheckConfig
|
|
|
13 |
from openfactcheck.core.state import FactCheckerState
|
14 |
|
15 |
class OpenFactCheck:
|
16 |
+
"""
|
17 |
+
OpenFactCheck class to evaluate the factuality of a response using a pipeline of solvers.
|
18 |
+
|
19 |
+
Parameters
|
20 |
+
----------
|
21 |
+
config : OpenFactCheckConfig
|
22 |
+
An instance of OpenFactCheckConfig containing the configuration
|
23 |
+
settings for OpenFactCheck.
|
24 |
+
|
25 |
+
Attributes
|
26 |
+
----------
|
27 |
+
logger : Logger
|
28 |
+
An instance of the logger to log messages.
|
29 |
+
config : OpenFactCheckConfig
|
30 |
+
An instance of OpenFactCheckConfig containing the configuration
|
31 |
+
settings for OpenFactCheck.
|
32 |
+
solver_configs : dict
|
33 |
+
A dictionary containing the configuration settings for the solvers.
|
34 |
+
pipeline : list
|
35 |
+
A list of solvers to be included in the pipeline.
|
36 |
+
output_path : str
|
37 |
+
The path to the output directory where the results will be stored.
|
38 |
+
|
39 |
+
Methods
|
40 |
+
-------
|
41 |
+
load_solvers(solver_paths)
|
42 |
+
Load solvers from the given paths.
|
43 |
+
list_solvers()
|
44 |
+
List all registered solvers.
|
45 |
+
list_claimprocessors()
|
46 |
+
List all registered claim processors.
|
47 |
+
list_retrievers()
|
48 |
+
List all registered retrievers.
|
49 |
+
list_verifiers()
|
50 |
+
List all registered verifiers.
|
51 |
+
init_solver(solver_name, args)
|
52 |
+
Initialize a solver with the given configuration.
|
53 |
+
init_solvers()
|
54 |
+
Initialize all registered solvers.
|
55 |
+
init_pipeline()
|
56 |
+
Initialize the pipeline with the given configuration.
|
57 |
+
init_pipeline_manually(pipeline)
|
58 |
+
Initialize the pipeline from an explicit list of solver names.
|
59 |
+
persist_output(state, idx, solver_name, cont, sample_name=0)
|
60 |
+
Persist the output of the solver.
|
61 |
+
read_output(sample_name)
|
62 |
+
Read the output file for the given sample.
|
63 |
+
remove_output(sample_name)
|
64 |
+
Remove the output file for the given sample.
|
65 |
+
__call__(response, question, stream, callback, **kwargs)
|
66 |
+
Evaluate the response using the pipeline.
|
67 |
+
|
68 |
+
Examples
|
69 |
+
--------
|
70 |
+
>>> config = OpenFactCheckConfig("config.json")
|
71 |
+
>>> ofc = OpenFactCheck(config)
|
72 |
+
>>> response, sample_name = ofc("This is a sample response.")
|
73 |
+
>>> output = ofc.read_output(sample_name)
|
74 |
+
>>> ofc.remove_output(sample_name)
|
75 |
+
"""
|
76 |
def __init__(self, config: OpenFactCheckConfig):
|
77 |
"""
|
78 |
Initialize OpenFactCheck with the given configuration.
|
|
|
122 |
"""
|
123 |
return SOLVER_REGISTRY
|
124 |
|
125 |
+
@staticmethod
def list_claimprocessors():
    """
    List all registered claim processors

    Returns a new dict holding every SOLVER_REGISTRY entry whose name
    contains the substring "claimprocessor".
    """
    return {
        solver_name: entry
        for solver_name, entry in SOLVER_REGISTRY.items()
        if "claimprocessor" in solver_name
    }
|
137 |
+
|
138 |
+
@staticmethod
def list_retrievers():
    """
    List all registered retrievers

    Returns a new dict holding every SOLVER_REGISTRY entry whose name
    contains the substring "retriever".
    """
    return {
        solver_name: entry
        for solver_name, entry in SOLVER_REGISTRY.items()
        if "retriever" in solver_name
    }
|
150 |
+
|
151 |
+
@staticmethod
def list_verifiers():
    """
    List all registered verifiers

    Returns a new dict holding every SOLVER_REGISTRY entry whose name
    contains the substring "verifier".
    """
    return {
        solver_name: entry
        for solver_name, entry in SOLVER_REGISTRY.items()
        if "verifier" in solver_name
    }
|
163 |
+
|
164 |
def init_solver(self, solver_name, args):
|
165 |
"""
|
166 |
Initialize a solver with the given configuration
|
|
|
230 |
with open(os.path.join(self.output_path, f'{sample_name}.jsonl'), 'a', encoding="utf-8") as f:
|
231 |
f.write(json.dumps(result, ensure_ascii=False) + '\n')
|
232 |
|
233 |
+
def read_output(self, sample_name):
    """
    Read the output file for the given sample

    Opens ``<output_path>/<sample_name>.jsonl`` and decodes one JSON document
    per line. Blank lines (for example a trailing newline added by hand) are
    skipped instead of failing the whole read.

    Returns a list with one decoded object per non-empty line; raises
    FileNotFoundError when the sample has no output file.
    """
    output_file = os.path.join(self.output_path, f'{sample_name}.jsonl')
    with open(output_file, 'r', encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
|
239 |
+
|
240 |
+
def remove_output(self, sample_name):
    """
    Remove the output file for the given sample

    Deletes ``<output_path>/<sample_name>.jsonl``; the error raised by
    ``os.remove`` propagates when the file does not exist.
    """
    output_file = os.path.join(self.output_path, f'{sample_name}.jsonl')
    os.remove(output_file)
|
245 |
+
|
246 |
+
def __call__(self, response: str, question: str = None, stream: bool = False, callback: Callable = None, **kwargs):
    """
    Evaluate the response using the pipeline

    Every solver in ``self.pipeline`` is invoked in order on the state
    produced by the previous one. The run stops early when a solver returns
    a falsy continue flag or raises.

    Parameters
    ----------
    response : str
        The response to evaluate; stored in the initial FactCheckerState.
    question : str, optional
        The question the response answers; stored in the initial state.
    stream : bool, optional
        When True, a generator is returned that yields one dict per executed
        solver (keys: index, solver_name, input_name, output_name, input,
        output, continue_run). When False, the pipeline is run to completion
        and the value stored under the last output name is returned.
    callback : Callable, optional
        Called after each successful solver run with the keyword arguments
        index, sample_name, solver_name, input_name, output_name, input,
        output and continue_run.
    **kwargs
        Forwarded to every solver. ``sample_name`` (if present) names the
        persisted output; otherwise a random 6-character hex name is used.
    """

    def evaluate_response():
        # Check if sample_name is provided in kwargs else generate a random one
        sample_name = kwargs.get("sample_name", str(uuid.uuid4().hex[:6]))

        # Initialize the state
        solver_output = FactCheckerState(question=question, response=response)

        # Initialize the output name
        output_name = "response"
        for idx, (name, (solver, input_name, output_name)) in tqdm.tqdm(enumerate(self.pipeline.items()),
                                                                       total=len(self.pipeline)):
            logger.info(f"Invoking solver: {idx}-{name}")
            logger.info(f"State content: {solver_output}")

            step = None
            try:
                # Solver input is the output of the previous solver
                solver_input = solver_output

                # Run the solver
                cont, solver_output = solver(solver_input, **kwargs)

                logger.debug(f"Latest result: {solver_output}")
                if callback:
                    callback(
                        index=idx,
                        sample_name=sample_name,
                        solver_name=name,
                        input_name=input_name,
                        output_name=output_name,
                        input=solver_input.__dict__,
                        output=solver_output.__dict__,
                        continue_run=cont
                    )

                step = {
                    "index": idx,
                    "solver_name": name,
                    "input_name": input_name,
                    "output_name": output_name,
                    "input": solver_input.__dict__,
                    "output": solver_output.__dict__,
                    "continue_run": cont
                }

                # Persist before handing control to the consumer, so the
                # result is saved even if the stream is abandoned early.
                self.persist_output(solver_output, idx, name, cont, sample_name=sample_name)

            except Exception:
                # `Exception` rather than a bare except, so GeneratorExit and
                # KeyboardInterrupt are not swallowed.
                logger.error(f"Error at {traceback.format_exc()}")
                cont = False
                output_name = input_name

            # Yield outside the try block so closing the generator is never
            # mistaken for a solver failure.
            if step is not None:
                yield step

            # Break if the solver returns False
            if not cont:
                logger.info(f"Break at {name}")
                break

        return solver_output.get(output_name)

    steps = evaluate_response()

    # Hand the generator to the caller when streaming was requested
    if stream:
        return steps

    # Otherwise drain it here: a generator's `return` value travels on
    # StopIteration, so the caller gets the final result, not a generator.
    while True:
        try:
            next(steps)
        except StopIteration as finished:
            return finished.value
|
src/openfactcheck/solvers/rarr_solvers/rarr_agreement_gate.py
CHANGED
@@ -14,7 +14,7 @@ class RARRAgreementGate(StandardTaskSolver):
|
|
14 |
def __init__(self, args):
|
15 |
super().__init__(args)
|
16 |
self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
|
17 |
-
self.model = self.global_config.get("model", "
|
18 |
|
19 |
def __call__(self, state: FactCheckerState, *args, **kwargs):
|
20 |
claims = state.get(self.input_name)
|
|
|
14 |
def __init__(self, args):
|
15 |
super().__init__(args)
|
16 |
self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
|
17 |
+
self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
|
18 |
|
19 |
def __call__(self, state: FactCheckerState, *args, **kwargs):
|
20 |
claims = state.get(self.input_name)
|
src/openfactcheck/solvers/rarr_solvers/rarr_editor.py
CHANGED
@@ -14,8 +14,8 @@ import Levenshtein
|
|
14 |
class RARREditor(StandardTaskSolver):
|
15 |
def __init__(self, args):
|
16 |
super().__init__(args)
|
17 |
-
self.model = self.global_config.get("model", "
|
18 |
-
# self.model = args.get("model", "
|
19 |
self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
|
20 |
self.max_edit_ratio = args.get("max_edit_ratio", 100)
|
21 |
self.output_claim_only = args.get("output_claim_only", False)
|
|
|
14 |
class RARREditor(StandardTaskSolver):
|
15 |
def __init__(self, args):
|
16 |
super().__init__(args)
|
17 |
+
self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
|
18 |
+
# self.model = args.get("model", "gpt-3.5-turbo-instruct")
|
19 |
self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
|
20 |
self.max_edit_ratio = args.get("max_edit_ratio", 100)
|
21 |
self.output_claim_only = args.get("output_claim_only", False)
|
src/openfactcheck/solvers/rarr_solvers/rarr_llm_retriever.py
CHANGED
@@ -13,7 +13,7 @@ from .prompts.hallucination_prompts import EVIDENCE_HALLUCINATION
|
|
13 |
class LLMRetriever(StandardTaskSolver):
|
14 |
def __init__(self, args):
|
15 |
super().__init__(args)
|
16 |
-
self.model = self.global_config.get("model", "
|
17 |
|
18 |
def __call__(self, state: FactCheckerState, *args, **kwargs):
|
19 |
claims = state.get(self.input_name)
|
|
|
13 |
class LLMRetriever(StandardTaskSolver):
|
14 |
def __init__(self, args):
|
15 |
super().__init__(args)
|
16 |
+
self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
|
17 |
|
18 |
def __call__(self, state: FactCheckerState, *args, **kwargs):
|
19 |
claims = state.get(self.input_name)
|
src/openfactcheck/solvers/rarr_solvers/rarr_question_generator.py
CHANGED
@@ -17,7 +17,7 @@ from .prompts import rarr_prompts
|
|
17 |
class RARRQuestionGenerator(StandardTaskSolver):
|
18 |
def __init__(self, args):
|
19 |
super().__init__(args)
|
20 |
-
self.model = self.global_config.get("model", "
|
21 |
self.temperature_qgen = args.get("temperature_qgen", 0.7)
|
22 |
self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
|
23 |
|
|
|
17 |
class RARRQuestionGenerator(StandardTaskSolver):
|
18 |
def __init__(self, args):
|
19 |
super().__init__(args)
|
20 |
+
self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
|
21 |
self.temperature_qgen = args.get("temperature_qgen", 0.7)
|
22 |
self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
|
23 |
|
src/openfactcheck/solvers/tutorial_solvers/utils/api.py
CHANGED
@@ -34,7 +34,7 @@ def chatgpt(user_input):
|
|
34 |
|
35 |
def davinci(prompt):
|
36 |
# Set up the model and prompt
|
37 |
-
model_engine = "
|
38 |
|
39 |
# Generate a response
|
40 |
completion = client.completions.create(
|
|
|
34 |
|
35 |
def davinci(prompt):
|
36 |
# Set up the model and prompt
|
37 |
+
model_engine = "gpt-3.5-turbo-instruct"
|
38 |
|
39 |
# Generate a response
|
40 |
completion = client.completions.create(
|
src/openfactcheck/solvers/webservice/rarr_rtv.py
CHANGED
@@ -10,7 +10,7 @@ from .rarr_utils import search
|
|
10 |
class RARRRetriever(StandardTaskSolver):
|
11 |
def __init__(self, args):
|
12 |
super().__init__(args)
|
13 |
-
self.model = self.global_config.get("rarr_model", "
|
14 |
self.temperature_qgen = args.get("temperature_qgen", 0.7)
|
15 |
self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
|
16 |
self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
|
|
|
10 |
class RARRRetriever(StandardTaskSolver):
|
11 |
def __init__(self, args):
|
12 |
super().__init__(args)
|
13 |
+
self.model = self.global_config.get("rarr_model", "gpt-3.5-turbo-instruct")
|
14 |
self.temperature_qgen = args.get("temperature_qgen", 0.7)
|
15 |
self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
|
16 |
self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
|
src/openfactcheck/solvers/webservice/rarr_vfr.py
CHANGED
@@ -10,7 +10,7 @@ class RARRAgreementGate(StandardTaskSolver):
|
|
10 |
def __init__(self, args):
|
11 |
super().__init__(args)
|
12 |
self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
|
13 |
-
self.model = self.global_config.get("rarr_model", "
|
14 |
|
15 |
def __call__(self, state: FactCheckerState, *args, **kwargs):
|
16 |
claims_with_evidences = state.get(self.input_name)
|
|
|
10 |
def __init__(self, args):
|
11 |
super().__init__(args)
|
12 |
self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
|
13 |
+
self.model = self.global_config.get("rarr_model", "gpt-3.5-turbo-instruct")
|
14 |
|
15 |
def __call__(self, state: FactCheckerState, *args, **kwargs):
|
16 |
claims_with_evidences = state.get(self.input_name)
|