diff --git a/.gitignore b/.gitignore
index 541e7257efd19e4d4c846ad7a60a7cc06f8521c8..0a34294cee7a76b57d39af04800c5ba0673de2b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,9 @@ tmp/
.venv/
__pycache__/
*.pyc
+
+# Logs and Output
+*.log
+passage.json
+questions.txt
+verification_error.txt
\ No newline at end of file
diff --git a/configs/devhasaniqbal.json b/configs/devhasaniqbal.json
deleted file mode 100644
index d02fab0312019fd1aefff7cb15a802ef96d82a56..0000000000000000000000000000000000000000
--- a/configs/devhasaniqbal.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "hello": "world"
-}
\ No newline at end of file
diff --git a/configs/solvers/dummy.yaml b/configs/solvers/dummy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..37bc1428a7d2b5a652614c59ce52eacaee9454c3
--- /dev/null
+++ b/configs/solvers/dummy.yaml
@@ -0,0 +1,15 @@
+fake_claim_extractor:
+ input_name: response
+ output_name: claims
+ max_claims: 3
+ min_claims: 2
+ignorant_search_engine_retriever:
+ input_name: claims
+ output_name: claims_with_evidences
+ max_num_documents: 5
+confused_claim_examiner:
+ input_name: claims_with_evidences
+ output_name: claims_with_tags
+useless_response_regenerator:
+ input_name: claims_with_tags
+ output_name: output
\ No newline at end of file
diff --git a/configs/solvers/factcheckgpt.yaml b/configs/solvers/factcheckgpt.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..361dc5e0201fb9d22e1d937f30004be9cb2e4b17
--- /dev/null
+++ b/configs/solvers/factcheckgpt.yaml
@@ -0,0 +1,24 @@
+factcheckgpt_model: gpt-3.5-turbo
+factcheckgpt_claimprocessor:
+ input_name: response
+ output_name: claims
+ mode: independent_sentences
+ rule_based_method: spacy
+ spacy_model: en_core_web_sm
+factcheckgpt_retriever:
+ input_name: claims
+ output_name: claims_with_evidences
+ n_questions: 1
+ question_gen_round: 1
+ qgen_temp: 0.7
+ search_timeout: 10
+ max_search_results_per_query: 2
+ max_passages_per_search_result_to_return: 3
+ sentences_per_passage: 5
+ max_passages_per_question: 5
+ max_aggregated_evidences: 5
+factcheckgpt_verifier:
+ input_name: claims_with_evidences
+ output_name: label
+ stance_model: gpt-3.5-turbo
+ verify_retries: 3
\ No newline at end of file
diff --git a/configs/solvers/factool.yaml b/configs/solvers/factool.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3ffb7ff8edc80772bb45f96695c43e6daf8cc2d6
--- /dev/null
+++ b/configs/solvers/factool.yaml
@@ -0,0 +1,11 @@
+facttool_model: gpt-3.5-turbo
+factool_claimprocessor:
+ input_name: response
+ output_name: claims
+factool_retriever:
+ input_name: claims
+ output_name: claims_with_evidences
+ snippet_cnt: 10
+factool_verifier:
+ input_name: claims_with_evidences
+ output_name: label
\ No newline at end of file
diff --git a/configs/solvers/rarr.yaml b/configs/solvers/rarr.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..45b2fe646f74b67ba4e4cdeac9ed9f2fa1b6a4ea
--- /dev/null
+++ b/configs/solvers/rarr.yaml
@@ -0,0 +1,12 @@
+rarr_model: gpt-3.5-turbo-instruct
+rarr_retriever:
+ input_name: claims
+ output_name: claims_with_evidences
+ max_search_results_per_query: 5
+ max_sentences_per_passage: 4
+ sliding_distance: 1
+ max_passages_per_search_result: 1
+rarr_verifier:
+ input_name: claims_with_evidences
+ output_name: label
+ max_evidences_per_question: 1
\ No newline at end of file
diff --git a/src/openfactcheck/__init__.py b/src/openfactcheck/__init__.py
index 3c30745ed872e26c5dffe0221cea35913deff234..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/src/openfactcheck/__init__.py
+++ b/src/openfactcheck/__init__.py
@@ -1,2 +0,0 @@
-from .lib.config import OpenFactCheckConfig
-from .core.base import OpenFactCheck
\ No newline at end of file
diff --git a/src/openfactcheck/core/__init__.py b/src/openfactcheck/core/__init__.py
index fa0f283d7c88c70395682788e0e2843ef04ff02c..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/src/openfactcheck/core/__init__.py
+++ b/src/openfactcheck/core/__init__.py
@@ -1 +0,0 @@
-from .base import OpenFactCheck
\ No newline at end of file
diff --git a/src/openfactcheck/core/base.py b/src/openfactcheck/core/base.py
index a81cb280869ae8f5ff92827348f9c7ebf169e0f8..eaa045c6aa01e457fac26437c81ec96a4705c005 100644
--- a/src/openfactcheck/core/base.py
+++ b/src/openfactcheck/core/base.py
@@ -1,13 +1,165 @@
+import os
+import sys
+import tqdm
+import yaml
+import json
+import traceback
+from pathlib import Path
+
from openfactcheck.lib.logger import logger
from openfactcheck.lib.config import OpenFactCheckConfig
+from openfactcheck.core.solver import SOLVER_REGISTRY, Solver
+from openfactcheck.core.state import FactCheckerState
class OpenFactCheck:
def __init__(self, config: OpenFactCheckConfig):
+ """
+ Initialize OpenFactCheck with the given configuration.
+
+ Reads the solver configs, pipeline and output path from the config,
+ loads the solver modules, builds the pipeline and logs its layout.
+
+ Parameters
+ ----------
+ config : OpenFactCheckConfig
+ An instance of OpenFactCheckConfig containing the configuration
+ settings for OpenFactCheck.
+ """
self.logger = logger
- self.config = config
+ self.config = config
+
+ # Initialize attributes
+ self.solver_configs = self.config.solver_configs
+ # NOTE: holds the raw pipeline config only until init_pipeline() below
+ # replaces it with the dictionary of initialized solvers.
+ self.pipeline = self.config.pipeline
+ self.output_path = os.path.abspath(self.config.output_path)
+
+ # Load and register solvers
+ self.load_solvers(self.config.solver_paths)
+ self.logger.info(f"Loaded solvers: {list(self.list_solvers().keys())}")
+
+ # Initialize the pipeline
+ self.pipeline = self.init_pipeline()
+
+ # Each pipeline entry maps a solver name to (solver, input name, output name)
+ self.logger.info("-------------- OpenFactCheck Initialized ----------------")
+ self.logger.info("Pipeline:")
+ for idx, (name, (solver, iname, oname)) in enumerate(self.pipeline.items()):
+ self.logger.info(f"{idx}-{name} ({iname} -> {oname})")
+ self.logger.info("---------------------------------------------------------")
+
+    @staticmethod
+    def load_solvers(solver_paths):
+        """
+        Import every solver module found under the given directories.
+
+        Parameters
+        ----------
+        solver_paths : list
+            Paths to solver package directories. ``None`` or an empty
+            list loads nothing. Paths that are not directories are
+            skipped with a warning.
+        """
+        for solver_path in solver_paths or []:
+            abs_path = Path(solver_path).resolve()
+            if not abs_path.is_dir():
+                logger.warning(f"Solver path is not a directory, skipping: {abs_path}")
+                continue
+
+            # The directory name is used as the import namespace, so its
+            # parent has to be on sys.path; avoid adding it twice.
+            parent_dir = str(abs_path.parent)
+            if parent_dir not in sys.path:
+                sys.path.append(parent_dir)
+            Solver.load(str(abs_path), abs_path.name)
+
+ @staticmethod
+ def list_solvers():
+ """
+ List all registered solvers
+
+ Returns the module-level SOLVER_REGISTRY object itself, not a copy.
+ """
+ return SOLVER_REGISTRY
+
+    def init_solver(self, solver_name, args):
+        """
+        Initialize a solver with the given configuration
+
+        Parameters
+        ----------
+        solver_name : str
+            Name under which the solver class was registered.
+        args : dict
+            Solver configuration. Optional ``input_name`` / ``output_name``
+            entries override the names currently set on the solver class.
+
+        Returns
+        -------
+        tuple
+            ``(solver, input_name, output_name)``.
+
+        Raises
+        ------
+        RuntimeError
+            If ``solver_name`` is not in ``SOLVER_REGISTRY``.
+        """
+
+        # Check if the solver is registered
+        if solver_name not in SOLVER_REGISTRY:
+            logger.error(f"{solver_name} not in SOLVER_REGISTRY")
+            raise RuntimeError(f"{solver_name} not in SOLVER_REGISTRY")
+
+        # The name overrides are written to the class itself, so they apply
+        # to every instance of this solver class.
+        solver_cls = SOLVER_REGISTRY[solver_name]
+        solver_cls.input_name = args.get("input_name", solver_cls.input_name)
+        solver_cls.output_name = args.get("output_name", solver_cls.output_name)
+
+        # Build the solver once and reuse the instance for the log message
+        solver = solver_cls(args)
+        logger.info(f"Solver {solver} initialized")
+
+        return solver, solver_cls.input_name, solver_cls.output_name
+
+    def init_solvers(self):
+        """
+        Initialize every solver listed in the solver configs.
+
+        Returns
+        -------
+        dict
+            Maps each solver name to ``(solver, input_name, output_name)``.
+        """
+        return {
+            solver_name: tuple(self.init_solver(solver_name, solver_args))
+            for solver_name, solver_args in self.solver_configs.items()
+        }
+
+    def init_pipeline(self):
+        """
+        Build the pipeline from the solvers named in ``self.config.pipeline``.
+
+        Returns
+        -------
+        dict
+            Maps each solver name to ``(solver, input_name, output_name)``,
+            in pipeline order.
+
+        Raises
+        ------
+        RuntimeError
+            If a pipeline solver has no entry in the solver configs.
+        """
+        built = {}
+        for solver_name in self.config.pipeline:
+            if solver_name in self.solver_configs:
+                solver_args = self.solver_configs[solver_name]
+                built[solver_name] = tuple(self.init_solver(solver_name, solver_args))
+                continue
+
+            # Unknown solver: report it and abort
+            logger.error(f"{solver_name} not in solvers config")
+            raise RuntimeError(f"{solver_name} not in solvers config")
+
+        return built
+
+    def init_pipeline_manually(self, pipeline: list):
+        """
+        Replace the current pipeline with the given list of solvers.
+
+        The new pipeline is assembled completely before ``self.pipeline``
+        is replaced, so a failure leaves the existing pipeline untouched.
+
+        Parameters
+        ----------
+        pipeline : list
+            A list of solvers to be included in the pipeline
+
+        Raises
+        ------
+        RuntimeError
+            If a requested solver has no entry in the solver configs.
+        """
+        new_pipeline = {}
+        for required_solver in pipeline:
+            if required_solver not in self.solver_configs:
+                # Log before raising, as init_pipeline does
+                logger.error(f"{required_solver} not in solvers config")
+                raise RuntimeError(f"{required_solver} not in solvers config")
+            solver, input_name, output_name = self.init_solver(required_solver, self.solver_configs[required_solver])
+            new_pipeline[required_solver] = (solver, input_name, output_name)
+
+        self.pipeline = new_pipeline
+
+    def persist_output(self, state: FactCheckerState, idx, solver_name, cont, sample_name=0):
+        """
+        Append one solver step as a JSON line to ``<output_path>/<sample_name>.jsonl``.
+
+        Parameters
+        ----------
+        state : FactCheckerState
+            State to serialize through ``state.to_dict()``.
+        idx
+            Stored under ``"idx"``.
+        solver_name
+            Stored under ``"solver"``.
+        cont
+            Stored under ``"continue"``.
+        sample_name
+            Base name of the ``.jsonl`` file; defaults to 0.
+        """
+        record = {"idx": idx, "solver": solver_name, "continue": cont, "state": state.to_dict()}
+        target = os.path.join(self.output_path, f'{sample_name}.jsonl')
+        with open(target, 'a', encoding="utf-8") as out_file:
+            line = json.dumps(record, ensure_ascii=False)
+            out_file.write(line + '\n')
- self.logger.info("OpenFactCheck initialized")
+    def __call__(self, response: str, question: str = None, callback_fun=None, **kwargs):
+        """
+        Run the pipeline on a response and return the final output.
+
+        Solvers run in pipeline order on a shared state. The run stops at
+        the first solver that returns a falsy "continue" flag or raises.
+
+        Parameters
+        ----------
+        response : str
+            Stored as ``response`` in the initial state.
+        question : str, optional
+            Stored as ``question`` in the initial state.
+        callback_fun : callable, optional
+            Called after each solver step with the keyword arguments
+            ``index``, ``sample_name``, ``solver_name``, ``input_name``,
+            ``output_name``, ``input``, ``output`` and ``continue_run``.
+        **kwargs
+            Forwarded to every solver. ``sample_name`` (default 0) also
+            names the ``.jsonl`` file the steps are persisted to.
+
+        Returns
+        -------
+        The state value stored under the output name of the last solver
+        that ran, or under that solver's input name if the step raised.
+        With an empty pipeline this is the ``response`` entry.
+        """
+        sample_name = kwargs.get("sample_name", 0)
+        solver_output = FactCheckerState(question=question, response=response)
+        oname = "response"
+        for idx, (name, (solver, iname, oname)) in tqdm.tqdm(enumerate(self.pipeline.items()),
+                                                             total=len(self.pipeline)):
+            logger.info(f"Invoking solver: {idx}-{name}")
+            logger.debug(f"State content: {solver_output}")
+            try:
+                solver_input = solver_output
+                cont, solver_output = solver(solver_input, **kwargs)
+                logger.debug(f"Latest result: {solver_output}")
+                if callback_fun:
+                    callback_fun(
+                        index=idx,
+                        sample_name=sample_name,
+                        solver_name=name,
+                        input_name=iname,
+                        output_name=oname,
+                        input=solver_input.__dict__,
+                        output=solver_output.__dict__,
+                        continue_run=cont
+                    )
+                self.persist_output(solver_output, idx, name, cont, sample_name=sample_name)
+            except Exception:
+                # Best effort: a failing step ends the run but still returns a
+                # value. `except Exception` lets KeyboardInterrupt/SystemExit
+                # propagate, and the traceback goes to the project logger.
+                logger.exception(f"Solver {idx}-{name} failed")
+                cont = False
+                # Fall back to the input of the failed step
+                oname = iname
+            if not cont:
+                logger.info(f"Break at {name}")
+                break
-if __name__ == "__main__":
- config = OpenFactCheckConfig()
- ofc = OpenFactCheck(config)
\ No newline at end of file
+        return solver_output.get(oname)
diff --git a/src/openfactcheck/core/cli.py b/src/openfactcheck/core/cli.py
index 1c8ab4fdce9290b8ece0909e34b7d2735d86bc69..464f85654ca429579f1d222fc00a09c68c103a0a 100644
--- a/src/openfactcheck/core/cli.py
+++ b/src/openfactcheck/core/cli.py
@@ -21,4 +21,8 @@ if __name__ == "__main__":
ofc = OpenFactCheck(OpenFactCheckConfig(args.config_path))
+ #result = ofc("Pakistan is a country in Asia")
+
+ #print(result)
+
\ No newline at end of file
diff --git a/src/openfactcheck/core/solver.py b/src/openfactcheck/core/solver.py
new file mode 100644
index 0000000000000000000000000000000000000000..3413d4216b88edd82588e6ff61ea1eb751f6cef5
--- /dev/null
+++ b/src/openfactcheck/core/solver.py
@@ -0,0 +1,134 @@
+import os
+import importlib
+
+from openfactcheck.lib.logger import logger
+from openfactcheck.core.state import FactCheckerState
+
+# Global solver registry
+SOLVER_REGISTRY = {}
+
+class StandardTaskSolver:
+ """
+ A class to represent a standard task solver. A standard task solver is a
+ class that implements a specific task in a fact-checking system. It
+ receives a FactCheckerState object as input and returns a new
+ FactCheckerState object as output.
+
+ Parameters
+ ----------
+ args : dict
+ A dictionary containing the arguments to be passed to the solver.
+ """
+
+ # Class-level defaults. NOTE: the `input_name` / `output_name` properties
+ # defined further down reuse these two names, so on this base class the
+ # names end up bound to the properties. Solver.register assigns plain
+ # values to `name`, `input_name` and `output_name` on each registered
+ # subclass.
+ name: str = None
+ input_name: str = None
+ output_name: str = None
+ # One dict object created when the class is defined; it is shared by all
+ # subclasses and instances unless one of them rebinds the attribute.
+ global_config: dict = dict()
+
+ def __init__(self, args: dict):
+ """Store the solver arguments and log them at debug level."""
+ self.logger = logger
+ self.args = args
+
+ logger.debug(self.args)
+
+ def __call__(self, state: FactCheckerState, **kwargs) -> tuple[
+ bool, FactCheckerState]:
+ """
+ Run the solver on a state. Must be overridden by subclasses.
+
+ The annotated return value is a (bool, FactCheckerState) pair; the
+ base implementation always raises NotImplementedError.
+ """
+ raise NotImplementedError
+
+ @classmethod
+ def build_solver(cls, args):
+ """Alternative constructor hook; not implemented on the base class."""
+ raise NotImplementedError
+
+ @property
+ def input_name(self):
+ """Return the `input_name` attribute looked up on the instance's class."""
+ return self.__class__.input_name
+
+ @property
+ def output_name(self):
+ """Return the `output_name` attribute looked up on the instance's class."""
+ return self.__class__.output_name
+
+ def __str__(self):
+ """Return a bracketed summary of the class-level name, input and output."""
+ return f'[name:"{self.__class__.name}", input: "{self.__class__.input_name}": output: "{self.__class__.output_name}"]'
+
+class Solver:
+    """
+    Class to handle the registration and loading of solvers
+    """
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def register(name, input_name=None, output_name=None):
+        """
+        Build a class decorator that records a solver in ``SOLVER_REGISTRY``.
+
+        Parameters
+        ----------
+        name : str
+            Registry key; also stored on the class as ``name``.
+        input_name : str, optional
+            Stored on the class as ``input_name``.
+        output_name : str, optional
+            Stored on the class as ``output_name``.
+
+        Returns
+        -------
+        callable
+            Decorator returning the registered class.
+
+        Raises
+        ------
+        ValueError
+            Raised by the decorator if the decorated class does not extend
+            ``StandardTaskSolver``.
+        """
+        def decorator(cls):
+            """
+            Decorator to register a solver class
+            """
+
+            # A name that is already registered keeps its first class; that
+            # class is returned instead of `cls`.
+            if name in SOLVER_REGISTRY:
+                return SOLVER_REGISTRY[name]
+
+            # Check if the solver class extends StandardTaskSolver
+            if not issubclass(cls, StandardTaskSolver):
+                logger.error(f"Solver '{name}' must extend StandardTaskSolver, got {cls.__name__}.")
+                raise ValueError(f"Solver '{name}' must extend StandardTaskSolver, got {cls.__name__}.")
+
+            # Register the solver
+            SOLVER_REGISTRY[name] = cls
+            cls.name = name
+            cls.input_name = input_name
+            cls.output_name = output_name
+
+            logger.info(f"Solver '{name}' registered")
+            return cls
+
+        return decorator
+
+    @staticmethod
+    def load_from_directory(directory, namespace):
+        """
+        Load solvers from a directory, recursing into sub-directories
+
+        Parameters
+        ----------
+        directory : str
+            Directory to scan.
+        namespace : str
+            Dotted module prefix corresponding to ``directory``.
+        """
+
+        # Visit sub-directories before plain files
+        for item in sorted(os.listdir(directory),
+                           key=lambda x: os.path.isdir(os.path.join(directory, x)),
+                           reverse=True):
+
+            # Skip entries whose name starts with '_' or '.'
+            if item.startswith('_') or item.startswith('.'):
+                continue
+
+            # Get the full path of the item
+            full_path = os.path.join(directory, item)
+
+            # Load the item
+            if os.path.isdir(full_path):
+                Solver.load_from_directory(full_path, namespace + '.' + item)
+            else:
+                Solver.load_from_file(full_path, namespace)
+
+    @staticmethod
+    def load_from_file(file_path, namespace):
+        """
+        Import the module behind a ``.py`` file; other files are ignored
+
+        Parameters
+        ----------
+        file_path : str
+            Path of the candidate file.
+        namespace : str
+            Dotted module prefix; the module is imported as
+            ``<namespace>.<file name without .py>``.
+        """
+
+        # Check if the file is a Python file
+        if file_path.endswith(".py"):
+            # Get the solver name
+            solver_name = os.path.basename(file_path)[:-3]
+
+            # Get the module name
+            module_name = namespace + "." + solver_name
+
+            # Import the module
+            logger.debug(f"Importing {module_name}")
+            importlib.import_module(module_name)
+
+    @staticmethod
+    def load(path, namespace):
+        """
+        Load solvers from ``path``, which may be a directory or a single file
+        """
+        if os.path.isdir(path):
+            Solver.load_from_directory(path, namespace)
+        else:
+            Solver.load_from_file(path, namespace)
diff --git a/src/openfactcheck/core/standard_task_solver.py b/src/openfactcheck/core/standard_task_solver.py
deleted file mode 100644
index 01614f2ff1f6f9b5a9265f0b198b81d242575e74..0000000000000000000000000000000000000000
--- a/src/openfactcheck/core/standard_task_solver.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import logging
-from typing import Tuple
-
-from fact_check_state import FactCheckerState
-
-class StandardTaskSolver:
- """
- A class to represent a standard task solver. A standard task solver is a
- class that implements a specific task in a fact-checking system. It
- receives a FactCheckerState object as input and returns a new
- FactCheckerState object as output.
-
- Parameters
- ----------
- args : dict
- A dictionary containing the arguments to be passed to the solver.
- """
-
- name: str = None
- input_name: str = None
- output_name: str = None
- global_config: dict = dict()
-
- def __init__(self, args: dict):
- self.args = args
- logging.debug(self.args)
-
- def __call__(self, state: FactCheckerState, **kwargs) -> Tuple[
- bool, FactCheckerState]:
- raise NotImplementedError
-
- @classmethod
- def build_solver(cls, args):
- raise NotImplementedError
-
- @property
- def input_name(self):
- return self.__class__.input_name
-
- @property
- def output_name(self):
- return self.__class__.output_name
-
- def __str__(self):
- return f'[name:"{self.__class__.name}", input: "{self.__class__.input_name}": output: "{self.__class__.output_name}"]'
diff --git a/src/openfactcheck/core/fact_check_state.py b/src/openfactcheck/core/state.py
similarity index 100%
rename from src/openfactcheck/core/fact_check_state.py
rename to src/openfactcheck/core/state.py
diff --git a/src/openfactcheck/lib/__init__.py b/src/openfactcheck/lib/__init__.py
index 28c741a639f1f9a1f0f36776cd426186dd11884a..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/src/openfactcheck/lib/__init__.py
+++ b/src/openfactcheck/lib/__init__.py
@@ -1,2 +0,0 @@
-from .config import OpenFactCheckConfig
-from .logger import logger
\ No newline at end of file
diff --git a/src/openfactcheck/lib/config.py b/src/openfactcheck/lib/config.py
index 95d66f6c943c0ae8f8f94e37fcd918b9b9dd9576..1ffa69cb0538c6ff87ff0b9cd286d090cf9537cb 100644
--- a/src/openfactcheck/lib/config.py
+++ b/src/openfactcheck/lib/config.py
@@ -1,20 +1,115 @@
+import os
import json
+import yaml
+import datasets
+import transformers
from pathlib import Path
from typing import Union
+from collections import namedtuple
-from .logger import logger
+from .logger import logger, set_logger_level
class OpenFactCheckConfig:
+ """
+ Class to load the OpenFactCheck configuration from a JSON or YAML file.
+ """
def __init__(self, filename: Union[str, Path] = "config.json"):
# Setup Logger
self.logger = logger
-
self.filename = filename
+ # Define namedtuple structures
+ Secrets = namedtuple("Secrets", ["openai_api_key",
+ "serper_api_key",
+ "azure_search_key"])
+ Pipeline = namedtuple("Pipeline", ["claimprocessor",
+ "retriever",
+ "verifier"])
+
+ # Define Attributes
+ self.retries = 0
+ self.pipeline = None
+ self.solver_configs = None
+ self.solver_paths = None
+ self.output_path = None
+ self.secrets = None
+ self.verbose = ""
+
try:
# Loading Config File
with open(self.filename, encoding="utf-8") as file:
- self.filename = json.load(file)
+ config = json.load(file)
+
+ # Initialize Retries
+ if 'retries' in config:
+ self.retries = config['retries']
+ else:
+ self.logger.warning("Retries config missing or incomplete in the configuration file.")
+ self.retries = 0
+
+ # Initialize Solver Configs
+ if 'solver_configs' in config:
+ self.solver_configs = SolversConfig(config['solver_configs'])()
+ else:
+ self.logger.warning("Solver configs missing or incomplete in the configuration file.")
+ self.solver_configs = None
+
+ # Initialize Solver Paths
+ if 'solver_paths' in config:
+ self.solver_paths = config['solver_paths']
+ else:
+ self.logger.warning("Solver paths missing or incomplete in the configuration file.")
+ self.solver_paths = None
+
+ # Initialize Output Path
+ if 'output_path' in config:
+ self.output_path = config['output_path']
+ os.makedirs(self.output_path, exist_ok=True)
+ else:
+ self.logger.warning("Output path missing or incomplete in the configuration file. Using default path.")
+ self.output_path = "tmp/output"
+ os.makedirs(self.output_path, exist_ok=True)
+
+ # Initialize Pipeline config
+ if 'pipeline' in config:
+ self.pipeline = Pipeline(claimprocessor=config['pipeline']['claimprocessor'],
+ retriever=config['pipeline']['retriever'],
+ verifier=config['pipeline']['verifier'])
+ else:
+ self.logger.warning("Pipeline config missing or incomplete in the configuration file.")
+ self.pipeline = Pipeline(claimprocessor=None, retriever=None, verifier=None)
+
+ self.logger.info(f"Config file loaded successfully from {self.filename}")
+
+ # Initialize Secrets config
+ if 'secrets' in config:
+ self.secrets = Secrets(openai_api_key=config['secrets']['openai_api_key'],
+ serper_api_key=config['secrets']['serper_api_key'],
+ azure_search_key=config['secrets']['azure_search_key'])
+ else:
+ self.logger.warning("Secrets config missing or incomplete in the configuration file.")
+ self.secrets = Secrets(openai_api_key=None, serper_api_key=None, azure_search_key=None)
+
+ # Initialize Environment Variables
+ if self.secrets.openai_api_key:
+ os.environ['OPENAI_API_KEY'] = self.secrets.openai_api_key
+ if self.secrets.serper_api_key:
+ os.environ['SERPER_API_KEY'] = self.secrets.serper_api_key
+ if self.secrets.azure_search_key:
+ os.environ['AZURE_SEARCH_KEY'] = self.secrets.azure_search_key
+
+ # Initialize Verbose
+ if 'verbose' in config:
+ self.verbose = config['verbose']
+ set_logger_level(self.logger, self.verbose)
+ else:
+ self.logger.warning("Verbose config missing or incomplete in the configuration file.")
+ self.verbose = ""
+ set_logger_level(self.logger, "INFO")
+
+ # Disable Transformers and Datasets logging
+ transformers.logging.set_verbosity_error()
+ datasets.logging.set_verbosity_error()
except FileNotFoundError:
self.logger.error(f"Config file not found: {self.filename}")
@@ -25,6 +120,57 @@ class OpenFactCheckConfig:
raise ValueError(f"Invalid JSON in config file: {self.filename}")
except Exception as e:
- self.logger.error(f"Error loading config file: {e}")
- raise Exception(f"Error loading config file: {e}")
-
\ No newline at end of file
+ self.logger.error(f"Unexpected error loading config file: {e}")
+ raise Exception(f"Unexpected error loading config file: {e}")
+
+class SolversConfig:
+    """
+    Class to load the solvers configuration from one or more JSON or YAML files.
+    Merges all configurations into a single dictionary; keys from later files
+    overwrite the same keys from earlier files.
+
+    Parameters
+    ----------
+    filename(s): str, Path, list
+        The path to the solvers configuration or a list of paths to multiple solvers configurations.
+
+    Raises
+    ------
+    FileNotFoundError
+        If a configuration file does not exist.
+    ValueError
+        If a JSON configuration file cannot be decoded.
+    Exception
+        For any other error raised while loading.
+    """
+    def __init__(self, filename_s: Union[str, Path, list]):
+        self.logger = logger
+        self.filename_s = filename_s
+        self.solvers = {}
+
+        try:
+            if isinstance(self.filename_s, (str, Path)):
+                self.load_file(self.filename_s)
+            elif isinstance(self.filename_s, list):
+                for filename in self.filename_s:
+                    self.load_file(filename)
+            else:
+                self.logger.error(f"Invalid filename type: {type(self.filename_s)}")
+                raise ValueError(f"Invalid filename type: {type(self.filename_s)}")
+
+        except FileNotFoundError as e:
+            self.logger.error(f"Solvers file not found: {self.filename_s}")
+            raise FileNotFoundError(f"Solvers file not found: {self.filename_s}") from e
+        except json.JSONDecodeError as e:
+            self.logger.error(f"Invalid JSON in solvers file: {self.filename_s}")
+            raise ValueError(f"Invalid JSON in solvers file: {self.filename_s}") from e
+        except Exception as e:
+            self.logger.error(f"Unexpected error loading solvers file: {e}")
+            raise Exception(f"Unexpected error loading solvers file: {e}") from e
+
+    def load_file(self, filename: Union[str, Path]):
+        """
+        Load one ``.yaml`` / ``.yml`` / ``.json`` file and merge it into ``self.solvers``.
+
+        Raises
+        ------
+        ValueError
+            If the file extension is not supported.
+        """
+        # Path objects have no endswith(); compare on the string form
+        lowered_name = str(filename).lower()
+        with open(filename, encoding="utf-8") as file:
+            if lowered_name.endswith((".yaml", ".yml")):
+                # NOTE(review): FullLoader is kept for compatibility; consider
+                # yaml.safe_load if these files can come from untrusted sources.
+                file_data = yaml.load(file, Loader=yaml.FullLoader)
+            elif lowered_name.endswith(".json"):
+                file_data = json.load(file)
+            else:
+                self.logger.error(f"Invalid file format: {filename}")
+                raise ValueError(f"Invalid file format: {filename}")
+
+        # An empty YAML document loads as None; treat it as "no solvers"
+        if file_data is None:
+            file_data = {}
+
+        # Merge current file data into existing solvers dictionary
+        self.solvers.update(file_data)
+        self.logger.info(f"Solvers file loaded and merged successfully from {filename}")
+
+    def __call__(self):
+        """Return the merged solvers dictionary."""
+        return self.solvers
\ No newline at end of file
diff --git a/src/openfactcheck/lib/errors.py b/src/openfactcheck/lib/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..1563687a66c1bef9b98ef3cad3299d45578c4849
--- /dev/null
+++ b/src/openfactcheck/lib/errors.py
@@ -0,0 +1,5 @@
+class Error(Exception):
+ """Base class for the other exceptions defined in this module."""
+
+class ConfigError(Error):
+ """Raised when there is an error with the configurations."""
\ No newline at end of file
diff --git a/src/openfactcheck/lib/logger.py b/src/openfactcheck/lib/logger.py
index ca25b92b7c5690f4cc9b8108a51e525baf177bde..275fa54425ab5863ede5643762073d31ff8d7ae9 100644
--- a/src/openfactcheck/lib/logger.py
+++ b/src/openfactcheck/lib/logger.py
@@ -1,38 +1,100 @@
import logging
-def get_logger():
+class CustomStreamLoggingFormatter(logging.Formatter):
"""
- This function returns a logger object that can be used to log messages
- to the console and a file.
+ Custom log formatter class to colorize log messages based on their level.
"""
- # Console Logger
- console_formatter = logging.Formatter('%(levelname)s -- %(message)s')
- console_handler = logging.StreamHandler()
- console_handler.setLevel(logging.DEBUG)
- console_handler.setFormatter(console_formatter)
+ # Define the color codes
+ grey = "\x1b[38;20m"
+ green = "\x1b[32;20m"
+ yellow = "\x1b[33;20m"
+ red = "\x1b[31;20m"
+ bold_red = "\x1b[31;1m"
+ reset = "\x1b[0m"
+ log_format = "%(levelname)s"
+
+ # Define the log message formats for different log levels
+ FORMATS = {
+ logging.DEBUG: grey + log_format + reset,
+ logging.INFO: green + log_format + reset,
+ logging.WARNING: yellow + log_format + reset,
+ logging.ERROR: red + log_format + reset,
+ logging.CRITICAL: bold_red + log_format + reset
+ }
+
+    def format(self, record):
+        """
+        Format ``record`` as ``<colored level name> -- <message>``.
+
+        Levels without an entry in ``FORMATS`` (custom numeric levels) fall
+        back to the plain, uncolored level name instead of raising.
+        """
+        log_fmt = self.FORMATS.get(record.levelno, self.log_format)
+        formatter = logging.Formatter(log_fmt + " -- %(message)s")
+        return formatter.format(record)
+
+class CustomFileLoggingFormatter(logging.Formatter):
+ """
+ Custom log formatter class for file logging.
+ """
- # File Logger
- # file_formatter = logging.Formatter(
- # '%(asctime)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s', datefmt='%m-%d-%Y %H:%M:%S')
- # file_handler = logging.FileHandler("lambda.log")
- # file_handler.setLevel(logging.DEBUG)
- # file_handler.setFormatter(file_formatter)
+ log_format = "%(asctime)s - %(levelname)s - %(message)s"
- # Getting the root logger
- newlogger = logging.getLogger(__name__)
+    def format(self, record):
+        """Render ``record`` with ``log_format`` (timestamp, level name, message)."""
+        return logging.Formatter(self.log_format).format(record)
+
+def get_logger(name=__name__, enable_file_logging=False, file_name="app.log"):
+ """
+ Returns a logger object configured with a console handler and optionally a file handler.
+
+ The logger level is set to DEBUG and propagation to the root logger is
+ turned off.
+
+ Parameters
+ ----------
+ name : str
+ The name of the logger.
+ enable_file_logging : bool
+ Whether to enable file logging.
+ file_name : str
+ The name of the log file.
+
+ Returns
+ -------
+ logging.Logger
+ The configured logger.
+ """
+ logger = logging.getLogger(name)
+ logger.setLevel(logging.DEBUG) # Set the logger level to DEBUG
- # Adding the handlers
- # logger.addHandler(file_handler)
- newlogger.addHandler(console_handler)
+ # NOTE: handlers are added unconditionally, so calling this function again
+ # with the same name attaches another set of handlers to the same logger.
+ # Console Handler
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(CustomStreamLoggingFormatter()) # Apply the custom formatter
+ logger.addHandler(console_handler)
- # Setting the level
- newlogger.setLevel(logging.DEBUG)
+ # Optional File Handler
+ if enable_file_logging:
+ file_handler = logging.FileHandler(file_name)
+ file_handler.setFormatter(CustomFileLoggingFormatter()) # Apply the custom formatter
+ logger.addHandler(file_handler)
- # Preventing the loggers from propagating to the root logger
- newlogger.propagate = False
+ logger.propagate = False # Prevent the logger from propagating to the root logger
- return newlogger
+ return logger
+def set_logger_level(logger, level):
+    """
+    Set the logger level from a level name.
+
+    Parameters
+    ----------
+    logger : logging.Logger
+        The logger object.
+    level : str
+        The log level string (DEBUG, INFO, WARNING, ERROR, CRITICAL),
+        matched case-insensitively. Any other value logs a warning and
+        selects INFO.
+    """
+    known_levels = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    requested = known_levels.get(level.upper())
+    if requested is None:
+        logger.warning("Invalid log level. Using default level INFO.")
+        requested = logging.INFO
+    logger.setLevel(requested)
-logger = get_logger()
+# Create a logger object
+logger = get_logger(__name__, enable_file_logging=True, file_name="app.log")
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/dummy/__init__.py b/src/openfactcheck/solvers/dummy/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/dummy/confused_claim_examiner.py b/src/openfactcheck/solvers/dummy/confused_claim_examiner.py
new file mode 100644
index 0000000000000000000000000000000000000000..c121634ea0bbde36d8b6e4cd2c4a95879168f7b8
--- /dev/null
+++ b/src/openfactcheck/solvers/dummy/confused_claim_examiner.py
@@ -0,0 +1,19 @@
+import random
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+@Solver.register("confused_claim_examiner", "claims_with_evidences", "claims_with_tags")
+class ConfusedClaimExaminer(StandardTaskSolver):
+    """Dummy examiner that tags every claim with a random boolean."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """
+        Read the mapping stored under the input name and store a mapping
+        with the same keys and random ``True``/``False`` values under the
+        output name. Always returns ``(True, state)``.
+        """
+        evidences_by_claim = state.get(self.input_name)
+
+        # The evidence values are ignored; only the keys are carried over
+        verdicts = {
+            claim_key: random.choice([True, False])
+            for claim_key, _ in evidences_by_claim.items()
+        }
+
+        state.set(self.output_name, verdicts)
+        return True, state
diff --git a/src/openfactcheck/solvers/dummy/fake_claim_extractor.py b/src/openfactcheck/solvers/dummy/fake_claim_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..60cc81e51ce8a0e8f59d36d65769cb7a6c6223e2
--- /dev/null
+++ b/src/openfactcheck/solvers/dummy/fake_claim_extractor.py
@@ -0,0 +1,27 @@
+import random
+
+from openfactcheck.core.solver import Solver
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+@Solver.register("fake_claim_extractor", "response", "claims")
+class FakeClaimExtractor(StandardTaskSolver):
+    """Dummy extractor that cuts the response into random slices."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.max_claims = args.get("max_claims", 5)
+        self.min_claims = args.get("min_claims", 2)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """
+        Slice the value stored under the input name at random cut points
+        and store the list of slices under the output name.
+
+        The target number of claims is drawn between ``min_claims`` and
+        ``max_claims``, both capped by the response length. Always returns
+        ``(True, state)``.
+        """
+        response = state.get(self.input_name)
+
+        response_len = len(response)
+        # The upper bound uses max_claims; max() keeps the range valid even
+        # if the configured minimum exceeds the maximum.
+        lower = min(self.min_claims, response_len)
+        upper = max(lower, min(self.max_claims, response_len))
+        num_claims = random.randint(lower, upper)
+
+        # num_claims + 1 cut points delimit at most num_claims slices
+        cut_points = list(range(response_len))
+        random.shuffle(cut_points)
+        cut_points = sorted(cut_points[:num_claims + 1])
+        claims = [response[start:end] for start, end in zip(cut_points, cut_points[1:])]
+
+        state.set(self.output_name, claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/dummy/ignorant_search_engine_retriever.py b/src/openfactcheck/solvers/dummy/ignorant_search_engine_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0620e760ee5d74775cb176ace96d730069ee361
--- /dev/null
+++ b/src/openfactcheck/solvers/dummy/ignorant_search_engine_retriever.py
@@ -0,0 +1,25 @@
+import random
+import string
+
+from openfactcheck.core.solver import Solver
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+@Solver.register("ignorant_search_engine_retriever", "claims", "claims_with_evidences")
+class IgnorantSearchEngineRetriever(StandardTaskSolver):
+    """Dummy retriever that attaches random single-letter documents to each claim."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.max_num_documents = args.get("max_num_documents",5)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """
+        Store, under the output name, a dict mapping ``"<index>-<claim>"``
+        to a list of ``max_num_documents`` random letters. Always returns
+        ``(True, state)``.
+        """
+        extracted_claims = state.get(self.input_name)
+
+        evidences = {}
+        for position, claim_text in enumerate(extracted_claims):
+            # Stand-in for a search engine call: one random letter per document
+            fake_documents = [
+                string.ascii_letters[random.randint(0, 25)]
+                for _ in range(self.max_num_documents)
+            ]
+            # The index and the claim are joined into one string key
+            evidences[f"{position}-{claim_text}"] = fake_documents
+
+        state.set(self.output_name, evidences)
+        return True, state
diff --git a/src/openfactcheck/solvers/dummy/useless_response_regenerator.py b/src/openfactcheck/solvers/dummy/useless_response_regenerator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8b5a6710f94fdebf78ae6c536b7cc38ddf1c434
--- /dev/null
+++ b/src/openfactcheck/solvers/dummy/useless_response_regenerator.py
@@ -0,0 +1,15 @@
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+@Solver.register("useless_response_regenerator", "claims_with_tags", "output")
+class UselessResponseRegenerator(StandardTaskSolver):
+    """Dummy regenerator that joins the claims tagged ``True`` into a new response."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    @staticmethod
+    def _claim_text(key):
+        """Return the claim part of a ``"<index>-<claim>"`` string or an ``(index, claim)`` pair."""
+        if isinstance(key, str):
+            # Split on the first '-' only, since the claim may contain dashes
+            _, separator, claim = key.partition("-")
+            return claim if separator else key
+        return key[1]
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """
+        Join the claims whose tag is exactly ``True`` with spaces and store
+        the result under the output name. Always returns ``(True, state)``.
+        """
+        claims = state.get(self.input_name)
+
+        # Indexing a string key with [1] would yield a single character,
+        # so the claim text is extracted by the helper instead.
+        true_claims = [self._claim_text(k) for k, v in claims.items() if v is True]
+        new_response = ' '.join(true_claims)
+        state.set(self.output_name, new_response)
+        return True, state
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_cp.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_cp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a43b07a55d6755898de53e1fc22464af4be3a78
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_cp.py
@@ -0,0 +1,70 @@
+import nltk
+import spacy
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .factcheckgpt_utils.openai_api import gpt
+from .factcheckgpt_utils.data_util import save_to_file
+from .factcheckgpt_utils.prompt import DOC_TO_INDEPEDENT_SENTENCES_PROMPT, SENTENCES_TO_CLAIMS_PROMPT, \
+ DOC_TO_SENTENCES_PROMPT, CHECKWORTHY_PROMPT_BOOL, SPECIFY_CHECKWORTHY_CATEGORY_PROMPT
+
+@Solver.register("factcheckgpt_claimprocessor", "response", "claims")
+class FactCheckGPTClaimProcessor(StandardTaskSolver):
+    """Split a response into claims with an LLM and keep only the check-worthy ones."""
+    def __init__(self, args):
+        super().__init__(args)
+        self.model = self.global_config.get("factcheckgpt_model", "gpt-3.5-turbo")
+        self.num_retries = self.global_config.get("num_retries", 3)
+        self.mode = args.get("mode", "independent_sentences")
+        self.decompose_system_role = "You are good at decomposing and decontextualizing text."
+        self.worthines_filter_system_role = "You are a helpful factchecker assistant."
+        # The shipped solver config names this option "rule_based_method"; "rule_based_tool" is the legacy key.
+        self.rule_based_method = args.get("rule_based_method", args.get("rule_based_tool", "spacy"))
+        self.spacy_model = args.get("spacy_model", "en_core_web_sm")
+        self.prompt = {
+            "sentences": DOC_TO_SENTENCES_PROMPT,
+            "independent_sentences": DOC_TO_INDEPEDENT_SENTENCES_PROMPT,
+            "claims": SENTENCES_TO_CLAIMS_PROMPT
+        }.get(self.mode, DOC_TO_INDEPEDENT_SENTENCES_PROMPT)
+        nlp = spacy.load(self.spacy_model)
+        splitters = {
+            "nltk": lambda x: [x.strip() for x in nltk.sent_tokenize(x) if len(x.strip()) >= 3],
+            "spacy": lambda x: [x.text.strip() for x in nlp(x).sents if len(x.text.strip()) >= 3]
+        }
+        # Unknown method names fall back to a callable splitter (previously the bare string "nltk").
+        self.rule_based_tool = splitters.get(self.rule_based_method, splitters["nltk"])
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Decompose the response into claims, then drop the ones the model marks as not check-worthy."""
+        response = state.get(self.input_name)
+        claims = [response]
+        user_input = self.prompt.format(doc=response).strip()
+        r = gpt(user_input, model=self.model, system_role=self.decompose_system_role, num_retries=self.num_retries)
+        try:
+            # SECURITY(review): eval() executes the model's reply as code; consider ast.literal_eval.
+            claims = eval(r)
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}.")
+            save_to_file(str(r))  # r is None when every gpt() attempt failed; str() keeps the logging from raising
+        if not isinstance(claims, list):
+            print(f"{self.model} output {r}. It does not output a list of sentences correctly, return rule-based split results.")
+            claims = self.rule_based_tool(response)
+
+        worthiness = [True] * len(claims)
+        user_input = CHECKWORTHY_PROMPT_BOOL.format(claims=claims)
+        response = gpt(user_input, model=self.model, system_role=self.worthines_filter_system_role, num_retries=self.num_retries)
+        # TODO refine check worthiness prompt, value returned not reasonable.
+        try:
+            worthiness = eval(response)
+            assert len(worthiness) == len(claims)
+        except AssertionError as e:
+            print(f"An unexpected error occurred: {e}")
+            print(f"There are {len(claims)} texts, while {len(worthiness)} checkworthy predictions.")
+            return False, state
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}")
+            return False, state
+        valid_claims = list(map(lambda x: x[1], filter(lambda x: x[0], zip(worthiness, claims))))
+        state.set(self.output_name, valid_claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_rtv.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_rtv.py
new file mode 100644
index 0000000000000000000000000000000000000000..99e7a70e6ef318329c78c4ad1009b19c1cfade79
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_rtv.py
@@ -0,0 +1,322 @@
+import re
+import bs4
+import torch
+import spacy
+import backoff
+import requests
+import itertools
+import numpy as np
+import concurrent.futures
+from copy import deepcopy
+from openai import RateLimitError
+from sentence_transformers import CrossEncoder
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .factcheckgpt_utils.openai_api import gpt
+from .factcheckgpt_utils.prompt import QGEN_PROMPT, QGEN_PROMPT_FMT
+from .factcheckgpt_utils.data_util import save_txt, save_json
+
+@Solver.register("factcheckgpt_retriever", "claims", "claims_with_evidences")
+class FactCheckGPTRetriever(StandardTaskSolver):
+ def __init__(self, args):  # args: this solver's config mapping; every option below has a default
+ super().__init__(args)
+ self.model = self.global_config.get("factcheckgpt_model", "gpt-3.5-turbo")  # chat model used to generate questions
+ self.num_retries = self.global_config.get("num_retries", 3)  # forwarded to gpt() as num_retries
+ self.tokenizer = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer"])  # sentence splitter for chunk_text
+ self.question_duplicate_model = CrossEncoder(  # scores question pairs in remove_duplicate_questions
+ 'navteca/quora-roberta-base',
+ device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ )
+ self.passage_ranker = CrossEncoder(  # scores (query, passage) pairs in get_relevant_snippets
+ "cross-encoder/ms-marco-MiniLM-L-6-v2",
+ max_length=512,
+ device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
+ )
+ # The system role for question generation is fixed here; it is not read from args.
+ self.qgen_system_role = "You are a student full of curiosity"
+ self.n_questions = args.get("n_questions", 5)  # value of {n} in the question-generation prompt
+ self.question_gen_round = args.get("question_gen_round", 1)  # number of generation calls per claim
+ self.qgen_temp = args.get("qgen_temp", 0.7)  # sampling temperature for question generation
+ self.search_timeout = args.get("search_timeout", 10)  # timeout passed to both the search and the page scrapes
+ self.max_search_results_per_query = args.get("max_search_results_per_query", 5)  # scraped pages chunked per query
+ self.max_passages_per_search_result_to_return = args.get("max_passages_per_search_result_to_return", 3)
+ self.sentences_per_passage = args.get("sentences_per_passage", 5)  # window size handed to chunk_text
+ self.max_passages_per_question = args.get("max_passages_per_question", 5)  # kept per question after ranking
+ self.max_aggregated_evidences = args.get("max_aggregated_evidences", 5)  # cap on the aggregated evidence list
+ self.question_persist_path = args.get("question_persist_path", 'questions.txt')  # where questions are saved
+ self.snippets_persist_path = args.get("snippets_persist_path", "passage.json")  # where snippets are saved
+
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+     """Map every claim in ``input_name`` to its (question, passage text) pairs under ``output_name``."""
+     claims_with_evidences = {}
+     for claim in state.get(self.input_name):
+         aggregated = self.get_web_evidences_for_claim(claim)['aggregated']
+         claims_with_evidences[claim] = [(question, passage['text']) for question, passage in aggregated]
+     state.set(self.output_name, claims_with_evidences)
+     return True, state
+
+ def generate_questions(self, claim, max_loop=5):
+     """Return de-duplicated search questions for ``claim``, trying generation at most ``max_loop`` times."""
+     questions, attempts = [], 0
+     while not questions and attempts < max_loop:  # bounded: the old loop never consulted max_loop
+         questions, attempts = self.run_question_generation(claim), attempts + 1
+     questions = self.remove_duplicate_questions(questions) if questions else []  # never de-duplicate an empty list
+     save_txt(questions, self.question_persist_path)
+     return questions
+
+ def retrieve_documents(self, questions):
+     """Fetch passages for each question, keep the best-scored ones, and persist the mapping as JSON."""
+     snippets = {}
+     for question in questions:
+         passages = self.get_relevant_snippets(question)
+         # Highest cross-encoder score first; the sort is stable, as sorted() was.
+         passages.sort(key=lambda passage: passage['retrieval_score'], reverse=True)
+         snippets[question] = passages[:self.max_passages_per_question]
+     # The whole mapping is written once, after every question has been processed.
+     save_json(snippets, self.snippets_persist_path)
+     return snippets
+
+ def get_web_evidences_for_claim(self, claim):
+     """Collect web evidence for ``claim``.
+     Returns a dict with "aggregated" (a list of at most ``max_aggregated_evidences``
+     (question, passage) pairs) and "question_wise" (question -> ranked passages).
+     Raises RuntimeError when no passage was retrieved at all.
+     """
+     questions = self.generate_questions(claim)
+     snippets = self.retrieve_documents(questions)
+     evidences = {"aggregated": [], "question_wise": snippets}
+     total_snippets = sum(len(passages) for passages in snippets.values())
+     if total_snippets == 0:
+         raise RuntimeError("No passages are retrieved, check your network...")
+     if total_snippets <= self.max_aggregated_evidences:
+         # Everything fits: keep all pairs, as a real list rather than a one-shot iterator.
+         evidences["aggregated"] = [(q, e) for q, es in snippets.items() for e in es]
+         return evidences
+     # Round-robin by rank; bounds-checked so uneven per-question counts cannot raise or overshoot the cap.
+     for rank in range(max(len(es) for es in snippets.values())):
+         for q, es in snippets.items():
+             if rank < len(es) and len(evidences["aggregated"]) < self.max_aggregated_evidences:
+                 evidences["aggregated"].append((q, es[rank]))
+     return evidences
+
+ @backoff.on_exception(backoff.expo, RateLimitError)  # retried with exponential backoff on RateLimitError
+ def run_question_generation(self, claim):  # returns a sorted list of the unique questions generated for `claim`
+ questions = set()
+ for _ in range(self.question_gen_round):  # one model call per round; results are unioned
+ user_input = QGEN_PROMPT_FMT.format(claim=claim, n=self.n_questions)
+ response = gpt(
+ user_input,
+ model=self.model,
+ system_role=self.qgen_system_role,
+ num_retries=self.num_retries,
+ temperature=self.qgen_temp
+ )
+ try:
+ cur_round_questions = set(eval(response))  # SECURITY(review): eval() executes the model's reply as code
+ questions.update(cur_round_questions)
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}.")  # a reply that cannot be evaluated is skipped for this round
+ questions = list(sorted(questions))
+ return questions
+
+ def remove_duplicate_questions(self, all_questions):
+     """Greedily keep each question scoring below 0.60 against all kept ones; an empty input returns []."""
+     qset = list(all_questions[:1])  # the first question, if any, is always kept (no IndexError on [])
+     for question in all_questions[1:]:
+         scores = self.question_duplicate_model.predict([(q, question) for q in qset])
+         if np.max(scores) < 0.60:
+             qset.append(question)
+     return qset
+
+ def scrape_url(self, url: str, timeout: float = 3) -> tuple[str, str]:
+     """Scrapes a URL for all text information.
+
+     Args:
+         url: URL of webpage to scrape.
+         timeout: Timeout of the requests call.
+     Returns:
+         web_text: The visible text of the scraped URL, or None if fetching or parsing failed.
+         url: URL input.
+     """
+     # Scrape the URL
+     try:
+         response = requests.get(url, timeout=timeout)
+         response.raise_for_status()
+     except requests.exceptions.RequestException as _:
+         print("URL Error", url)
+         return None, url
+
+     # Extract out all text from the tags
+     try:
+         soup = bs4.BeautifulSoup(response.text, "html.parser")
+         texts = soup.findAll(text=True)
+         # Filter out invisible text eagerly: a lazy filter() would only run (and fail) outside this try.
+         visible_text = [t for t in texts if self.is_tag_visible(t)]
+     except Exception as _:
+         print("Parsing Error", response.text)
+         return None, url
+
+     # Returns all the text concatenated as a string.
+     web_text = " ".join(t.strip() for t in visible_text).strip()
+     # Clean up spacing.
+     web_text = " ".join(web_text.split())
+     return web_text, url
+
+ def is_tag_visible(self, element: bs4.element) -> bool:
+     """Determines if an HTML element is visible.
+
+     Args:
+         element: A BeautifulSoup element to check the visibility of.
+     returns:
+         Whether the element is visible.
+     """
+     hidden_parents = [
+         "style",
+         "script",
+         "head",
+         "title",
+         "meta",
+         "[document]",
+     ]
+     is_hidden = element.parent.name in hidden_parents or isinstance(element, bs4.element.Comment)
+     return not is_hidden
+
+ def search_google(self, query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = '') -> list[str]:
+     """Searches the query using Google.
+     Args:
+         query: Search query.
+         num_web_pages: the number of web pages to request.
+         timeout: Timeout of each request sent to Google.
+         save_url: path to save returned urls, such as 'urls.txt'
+     Returns:
+         search_results: The unique URLs found in the result pages, in first-seen order.
+     """
+     from urllib.parse import quote_plus  # local import keeps this fix self-contained
+
+     # Percent-encode the query; replacing spaces alone let '&', '#', '?' etc. corrupt the request URL.
+     query = quote_plus(query)
+
+     # set headers: Google returns different web-pages according to agent device
+     # desktop user-agent
+     USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
+     # mobile user-agent
+     MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
+     headers = {'User-Agent': USER_AGENT}
+
+     # set language as en (&hl=XX is the interface language, &lr=lang_XX the result language), otherwise
+     # it will return many translation web pages to Arabic that can't be opened correctly.
+     lang = "en"
+
+     # scrape google results
+     urls = []
+     for page in range(0, num_web_pages, 10):
+         # here page is google search's bottom page meaning, click 2 -> start=10
+         url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
+         r = requests.get(url, headers=headers, timeout=timeout)
+         # collect all urls by regular expression
+         urls += re.findall('href="(https?://.*?)"', r.text)
+
+     # De-duplicate while keeping first-seen order (set() scrambled it, so the later top-k slice was arbitrary).
+     urls = list(dict.fromkeys(urls))
+
+     # save all url into a txt file
+     if not save_url == "":
+         with open(save_url, 'w') as file:
+             for url in urls:
+                 file.write(url + '\n')
+     return urls
+
+ def chunk_text(
+     self,
+     text: str,
+     tokenizer,
+     sentences_per_passage: int = 5,
+     filter_sentence_len: int = 250,
+     sliding_distance: int = 2,
+ ) -> list[str]:
+     """Splits text into overlapping sentence-window passages.
+
+     Args:
+         text: Text to chunk; only its first 500k characters are tokenized.
+         tokenizer: Callable returning a document whose ``sents`` yield spans with ``.text``.
+         sentences_per_passage: Window size, in sentences.
+         filter_sentence_len: Sentences longer than this many chars are dropped.
+         sliding_distance: Step between window starts; reset to the window size if falsy or larger.
+     Returns:
+         passages: (passage_text, first_sentence_idx, nominal_last_sentence_idx) tuples.
+     """
+     step = sliding_distance
+     if not step or step > sentences_per_passage:
+         step = sentences_per_passage
+     assert sentences_per_passage > 0 and step > 0
+
+     passages = []
+     try:
+         doc = tokenizer(text[:500000])  # Cap the input so tokenization does not break.
+         kept = []
+         for sent in doc.sents:
+             if len(sent.text) <= filter_sentence_len:  # Longer sentences are usually metadata.
+                 kept.append(sent.text.replace("\n", " "))
+         for start in range(0, len(kept), step):
+             stop = start + sentences_per_passage
+             passages.append((" ".join(kept[start:stop]), start, stop - 1))
+     except UnicodeEncodeError as _:  # Sometimes run into Unicode error when tokenizing.
+         print("Unicode error when using Spacy. Skipping text.")
+
+     return passages
+
+ def get_relevant_snippets(  # search the web for `query`; return a list of scored passage dicts
+ self,
+ query,
+ ):
+ search_results = self.search_google(query, timeout=self.search_timeout)
+
+ with concurrent.futures.ThreadPoolExecutor() as e:  # scrape all result URLs concurrently
+ scraped_results = e.map(self.scrape_url, search_results, itertools.repeat(self.search_timeout))
+ # Remove URLs if we weren't able to scrape anything or if they are a PDF.
+ scraped_results = [r for r in scraped_results if r[0] and ".pdf" not in r[1]]
+ # Only the first max_search_results_per_query successfully scraped pages are chunked and scored.
+ retrieved_passages = list()
+ for webtext, url in scraped_results[:self.max_search_results_per_query]:
+ passages = self.chunk_text(
+ text=webtext,
+ tokenizer=self.tokenizer,
+ sentences_per_passage=self.sentences_per_passage
+ )
+ if not passages:
+ continue
+
+ # Score the passages by relevance to the query using a cross-encoder.
+ scores = self.passage_ranker.predict([(query, p[0]) for p in passages]).tolist()
+ # Order this page's passages from the highest score to the lowest.
+ passage_scores = sorted(zip(passages, scores), reverse=True, key=lambda x: x[1])
+
+ relevant_items = list()  # passages already accepted for this page
+ for passage_item, score in passage_scores:  # passage_item is (text, start_idx, end_idx) from chunk_text
+ overlap = False
+ if len(relevant_items) > 0:
+ for item in relevant_items:
+ if passage_item[1] >= item[1] and passage_item[1] <= item[2]:  # start index falls inside an accepted window
+ overlap = True
+ break
+ if passage_item[2] >= item[1] and passage_item[2] <= item[2]:  # end index falls inside an accepted window
+ overlap = True
+ break
+
+ # Only consider top non-overlapping relevant passages to maximise for information
+ if not overlap:
+ relevant_items.append(deepcopy(passage_item))
+ retrieved_passages.append(
+ {
+ "text": passage_item[0],
+ "url": url,
+ "sents_per_passage": self.sentences_per_passage,
+ "retrieval_score": score, # Cross-encoder score as retr score
+ }
+ )
+ if len(relevant_items) >= self.max_passages_per_search_result_to_return:  # per-page quota reached
+ break
+ # The result holds the accepted passages of every processed page, in page order.
+ return retrieved_passages
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/__init__.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/data_util.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/data_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1099285c6e2d493da717517d3f8cf5ed20fc1acb
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/data_util.py
@@ -0,0 +1,136 @@
+import csv
+import json
+import numpy as np
+from collections import Counter
+from typing import Dict, List, Any
+
+
+def save_to_file(text, filename='error_output.txt'):
+    """Append ``text`` followed by a newline to ``filename`` (UTF-8)."""
+    with open(filename, mode='a', encoding='utf-8') as sink:
+        sink.write(text + '\n')
+
+
+def majority_vote(input_list):
+    """Return the most frequent element of ``input_list`` (the first-seen element wins ties)."""
+    # Tally the occurrences of every element.
+    tallies = Counter(input_list)
+
+    # max() walks the keys in insertion order and keeps the first
+    # one that reaches the highest tally.
+    winner = max(tallies, key=lambda item: tallies[item])
+    return winner
+
+
+def is_float(string):
+    """Return True if ``string`` consists of numeric characters with at most one decimal point."""
+    # Strip only ONE dot: stripping all of them accepted malformed input such as "1.2.3".
+    stripped = string.replace(".", "", 1)
+    return stripped.isnumeric()
+
+
+def save_json(dictionary: Dict[str, Any], save_dir: str) -> None:
+    """Write ``dictionary`` to the file ``save_dir`` as 4-space-indented JSON, keeping non-ASCII text."""
+    # Serialize before opening, so the file is only touched once the JSON text exists.
+    payload = json.dumps(dictionary, indent=4, ensure_ascii=False)
+
+    with open(save_dir, "w", encoding='utf-8') as sink:
+        sink.write(payload)
+
+
+def read_json(filepath: str) -> Dict[str, Any]:
+    """Load and return the JSON document stored at ``filepath`` (UTF-8)."""
+    with open(filepath, 'r', encoding='utf-8') as source:
+        # json.load parses straight from the open handle.
+        return json.load(source)
+
+
+def list_to_dict(data: List[Dict[str, Any]]) -> Dict[int, Any]:
+    """Map each element of ``data`` to its position, e.g. ["a", "b"] -> {0: "a", 1: "b"}."""
+    # enumerate() yields (index, item) pairs, which dict() consumes
+    # directly as key/value entries.
+    return dict(enumerate(data))
+
+
+def load_jsonl(path):
+    """Read a JSON Lines file and return its records as a list, one per line."""
+    with open(path, 'r', encoding='utf-8') as reader:
+        # Every line of the file is handed to json.loads, in order.
+        records = [json.loads(line) for line in reader]
+    return records
+
+
+# def load_jsonl(input_path) -> list:
+# """
+# Read list of objects from a JSON lines file.
+# """
+# data = []
+# with open(input_path, 'r', encoding='utf-8') as f:
+# for line in f:
+# data.append(json.loads(line.rstrip('\n|\r')))
+# print('Loaded {} records from {}'.format(len(data), input_path))
+# return data
+
+def dump_jsonl(data, output_path, append=False):
+    """Serialize each item of ``data`` as one JSON line in ``output_path``.
+
+    With ``append`` the file is opened in 'a+' mode, otherwise it is overwritten; a summary is printed.
+    """
+    with open(output_path, 'a+' if append else 'w', encoding='utf-8') as sink:
+        # One record per line, non-ASCII characters written as-is.
+        for record in data:
+            sink.write(json.dumps(record, ensure_ascii=False) + '\n')
+    print('Wrote {} records to {}'.format(len(data), output_path))
+
+
+def cosine(u, v):
+ """Return the cosine similarity of vectors ``u`` and ``v``: dot(u, v) / (norm(u) * norm(v))."""
+ return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
+
+
+def read_csv(input_file, quotechar=None):
+    """Read a tab-delimited file and return its rows as a list of lists of strings."""
+    with open(input_file, "r", encoding="utf-8") as handle:
+        # The reader is consumed eagerly, while the file is
+        # still open, so every row is materialized.
+        rows = list(csv.reader(handle, delimiter="\t", quotechar=quotechar))
+    return rows
+
+
+def save_csv(header, data, output_file):
+    """Write ``header`` followed by the rows of ``data`` to ``output_file``, tab-delimited."""
+    with open(output_file, mode='w', encoding='UTF8', newline='') as sink:
+        tsv = csv.writer(sink, delimiter='\t')
+        # The header row goes first, then every data row in order.
+        tsv.writerow(header)
+        tsv.writerows(data)
+
+
+def save_array(filename, embeddings):
+ """Write ``embeddings`` to the binary file ``filename`` with ``np.save``."""
+ with open(filename, 'wb') as f:
+ np.save(f, embeddings)
+
+
+def load_array(filename):
+    """Return whatever ``np.load`` reads from the binary file ``filename``."""
+    with open(filename, 'rb') as source:
+        return np.load(source)
+
+
+def read_txt(input_file):  # Return the UTF-8 file's lines as a list, exactly as readlines() yields them.
+ with open(input_file, "r", encoding="utf-8") as f:
+ return f.readlines()
+
+
+def save_txt(data, output_file):  # Overwrite output_file with the strings in `data`, joined by newlines.
+ with open(output_file, "w", encoding="utf-8") as writer:
+ writer.write("\n".join(data))  # no newline is written after the last item
+
+
+def clean_text(text):
+    """Delete runs of 5, 4, 3, then 2 repeated '"', '-', tab, or space characters from ``text``."""
+    # For each mark in the listed order, the longest runs are removed first.
+    for run in (mark * width for mark in ['"', '-', '\t', ' '] for width in [5, 4, 3, 2]):
+        text = text.replace(run, '')
+    return text
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/eval_utils.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/eval_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4efea3d83fe19e9de654a5dfdc71338c742d4b4
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/eval_utils.py
@@ -0,0 +1,41 @@
+# code for general evaluation
+
+import numpy as np
+import evaluate
+from sklearn.metrics import precision_recall_fscore_support, accuracy_score
+
+def evaluate_classification(preds, gold):  # Score `preds` against `gold` with the loaded report metric.
+ metric = evaluate.load("bstrai/classification_report")  # loaded anew on every call
+ return metric.compute(predictions=preds, references=gold)
+
+def eval_classification(y_true, y_pred, average="macro"):
+    """Return accuracy, precision, recall and F1 for the predictions, each rounded to 3 decimals."""
+    precision, recall, F1, _ = precision_recall_fscore_support(y_true, y_pred, average=average)
+    accuracy = accuracy_score(y_true, y_pred)
+
+    # Round the scores in the key order accuracy, precision, recall, F1.
+    scores = (("accuracy", accuracy), ("precision", precision), ("recall", recall), ("F1", F1))
+    metrics = {}
+    for name, value in scores:
+        metrics[name] = round(value, 3)
+    return metrics
+
+
+def eval_binary(y_true, y_pred, pos_label=1, average="binary"):
+    """Score binary predictions; ``pos_label`` marks the positive class.
+
+    In the original use, the positive label 1 is machine text and 0 is human text.
+    Returns accuracy, precision, recall and F1, each rounded to 3 decimals.
+    """
+    precision, recall, F1, _ = precision_recall_fscore_support(
+        y_true, y_pred, pos_label=pos_label, average=average)
+    accuracy = accuracy_score(y_true, y_pred)
+    # Round the scores in the key order
+    # accuracy, precision, recall, F1.
+    scores = (("accuracy", accuracy), ("precision", precision), ("recall", recall), ("F1", F1))
+    metrics = {}
+    for name, value in scores:
+        metrics[name] = round(value, 3)
+
+    return metrics
+
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/nli.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/nli.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7d6161ea7568f8688f2a44ea9b7a4a11e95f41c
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/nli.py
@@ -0,0 +1,44 @@
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+# Load the roberta-large-mnli tokenizer and model once, when this module is imported.
+# NOTE(review): the pipeline task below is named 'sentiment-analysis', yet the label maps and
+# nli_infer expect NLI labels (NEUTRAL / CONTRADICTION / ENTAILMENT) from it - confirm the task name.
+
+tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
+classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+
+nli_labelmap = {  # NLI label -> integer class id
+ "NEUTRAL": 3,
+ "CONTRADICTION":2,
+ "ENTAILMENT": 1
+}
+
+nli2stance = {  # NLI label -> stance value returned by nli_infer
+ "NEUTRAL": 0,
+ "CONTRADICTION": -1,
+ "ENTAILMENT": 1
+}
+
+stance_map = {  # stance name -> integer class id; both "support" variants share id 1
+ 'irrelevant': 3,
+ 'refute': 2,
+ 'partially-support': 1,
+ 'completely-support': 1
+}
+
+def nli_infer(premise, hypothesis):
+    """Classify the premise/hypothesis pair; return 1 (entailment), 0 (neutral) or -1 (contradiction)."""
+    # NOTE(review): the two texts are concatenated with no separator, as before - confirm this is intended.
+    try:
+        text = "{}{}".format(premise, hypothesis)
+        pred = classifier(text)
+    except Exception:
+        # Was a bare `except:`, which also trapped KeyboardInterrupt/SystemExit.
+        # Assumed cause: token length > 514, so retry once with the first half of the premise.
+        premise = premise[:int(len(premise) / 2)]
+        text = "{}{}".format(premise, hypothesis)
+        pred = classifier(text)
+    # pred looks like:
+    # [{'label': 'CONTRADICTION', 'score': 0.9992701411247253}]
+    return nli2stance[pred[0]['label']]
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/openai_api.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/openai_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6bea110a0af6bccd250ac6c8be7bcacfa48afbf
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/openai_api.py
@@ -0,0 +1,75 @@
+import os
+import time
+from openai import OpenAI
+import openai
+
+client = None
+
+
+def init_client():
+    """Lazily build the shared client; it stays None (with a notice printed) while no API key is configured."""
+    global client
+    if client is None and (openai.api_key is not None or 'OPENAI_API_KEY' in os.environ):
+        client = OpenAI(api_key=openai.api_key)  # a None api_key makes the client fall back to OPENAI_API_KEY
+    elif client is None:
+        print("openai_key not presented, delay to initialize.")
+
+
+def request(
+    user_inputs,
+    model,
+    system_role,
+    temperature=1.0,
+    return_all=False,
+):
+    """Send one chat-completion request and return the reply text (or the raw response if ``return_all``).
+
+    ``user_inputs`` may be a string, a list of strings (alternating user/assistant turns), or a list of
+    message dicts. Raises ValueError for any other input and RuntimeError if no API key is configured.
+    """
+    init_client()
+
+    if isinstance(user_inputs, str):
+        chat_histories = [{"role": "user", "content": user_inputs}]
+    elif isinstance(user_inputs, list) and all(isinstance(x, str) for x in user_inputs):
+        chat_histories = [
+            {"role": "user" if i % 2 == 0 else "assistant", "content": x}
+            for i, x in enumerate(user_inputs)
+        ]
+    elif isinstance(user_inputs, list) and all(isinstance(x, dict) for x in user_inputs):
+        chat_histories = user_inputs
+    else:
+        raise ValueError("Invalid input for OpenAI API calling")
+
+    if client is None:
+        # Fail with a clear message instead of an AttributeError on ``None.chat``.
+        raise RuntimeError("OpenAI client is not initialized: set OPENAI_API_KEY or openai.api_key.")
+    messages = [{"role": "system", "content": system_role}] + chat_histories
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        temperature=temperature
+    )
+    if return_all:
+        return response
+    # A choice's content may be None; treat it as empty text instead of failing the concatenation.
+    return ''.join(choice.message.content or '' for choice in response.choices)
+
+
+def gpt(
+    user_inputs,
+    model,
+    system_role,
+    temperature=1.0,
+    num_retries=3,
+    waiting=1
+):
+    """Call ``request`` up to ``num_retries`` times; return its reply, or None if every try raised OpenAIError."""
+    for _ in range(num_retries):
+        try:
+            return request(user_inputs, model, system_role, temperature=temperature)
+        except openai.OpenAIError as exception:
+            print(f"{exception}. Retrying...")
+            time.sleep(waiting)
+    # Every attempt failed, or num_retries allowed no attempt at all.
+    return None
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/prompt.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c796792186111d827e6a9e9a8d2336af35eaebf
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_utils/prompt.py
@@ -0,0 +1,566 @@
+"""All prompts used for fact-checking subtasks prompting."""
+
+# Updates in Dec are dominated by function-based or code-based prompts,
+# to avoid having to parse free-form LLM results
+# ------------------------------------------------------------------------
+# Dec 2023: decompose and decontextualise, return a list
+# ------------------------------------------------------------------------
+DOC_TO_INDEPEDENT_SENTENCES_PROMPT = """
+Your task is to perform sentence segmentation and de-contextualization.
+Let's define a function named process(input:str).
+The return value should be a list of strings, where each string should be a decontextualized sentence.
+For example, if a user call process("Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.").
+You should return a python list without any other words,
+["Mary is a five-year old girl.", "Mary likes playing piano.", "Mary doesn't like cookies."]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+process("{doc}")
+"""
+
+SENTENCES_TO_CLAIMS_PROMPT = """
+Your task is to decompose the text into atomic claims.
+Let's define a function named decompose(input:str).
+The returned value should be a list of strings, where each string should be a context-independent claim, representing one fact.
+For example, if a user call decompose("Mary is a five-year old girl, she likes playing piano and she doesn't like cookies.").
+You should return a python list without any other words:
+["Mary is a five-year old girl.", "Mary likes playing piano.", "Mary doesn't like cookies."]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+decompose("{doc}")
+"""
+
+# Sentence segmentation only: sentences are returned verbatim, without decontextualization.
+DOC_TO_SENTENCES_PROMPT = """
+Your task is to perform sentence segmentation.
+Let's define a function named split(input:str).
+The return value should be a list of strings, where each string should be a sentence.
+For example, if a user call split("Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.").
+You should return a python list without any other words,
+["Mary is a five-year old girl.", "She likes playing piano.", "She doesn't like cookies."]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+split("{doc}")
+"""
+
+# ------------------------------------------------------------------------
+# Dec 2023: identify checkworthy
+# ------------------------------------------------------------------------
+CHECKWORTHY_PROMPT = """
+Your task is to identify whether texts are checkworthy in the context of fact-checking.
+Let's define a function named checkworthy(input: List[str]).
+The return value should be a list of strings, where each string selects from ["Yes", "No"].
+"Yes" means the text is a factual checkworthy statement.
+"No" means that the text is not checkworthy, it might be an opinion, a question, or others.
+For example, if a user call checkworthy(["I think Apple is a good company.", "Friends is a great TV series.", "Are you sure Preslav is a professor in MBZUAI?", "The Stanford Prison Experiment was conducted in the basement of Encina Hall.", "As a language model, I can't provide these info."])
+You should return a python list without any other words,
+["No", "Yes", "No", "Yes", "No"]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+
+checkworthy({texts})
+"""
+
+CHECKWORTHY_PROMPT_BOOL = """
+Your task is to identify whether texts are checkworthy in the context of fact-checking.
+Let's define a function named checkworthy(input: List[str]).
+The return value should be a list of bool values: [True, False].
+True means the text is a factual checkworthy statement.
+False means that the text is not checkworthy, it might be an opinion, a question, or others.
+For example, if a user call checkworthy(["I think Apple is a good company.", "Friends is a great TV series.", "Are you sure Preslav is a professor in MBZUAI?", "The Stanford Prison Experiment was conducted in the basement of Encina Hall.", "As a language model, I can't provide these info."])
+You should return a python list without any other words,
+[False, True, False, True, False]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+
+checkworthy({claims})
+"""
+
+SPECIFY_CHECKWORTHY_CATEGORY_PROMPT = """
+You are a factchecker assistant with task to identify a sentence, whether it is 1. a factual claim; 2. an opinion; 3. not a claim (like a question or a imperative sentence); 4. other categories.
+Let's define a function named checkworthy(input: str).
+The return value should be a python int without any other words, representing index label, where index selects from [1, 2, 3, 4].
+
+For example, if a user call checkworthy("I think Apple is a good company.")
+You should return 2
+If a user call checkworthy("Friends is a great TV series.")
+You should return 1
+If a user call checkworthy("Are you sure Preslav is a professor in MBZUAI?")
+You should return 3
+If a user call checkworthy("As a language model, I can't provide these info.")
+You should return 4
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+checkworthy("{sentence}")
+"""
+
+# ------------------------------------------------------------------------
+# Dec 2023: Verify
+# ------------------------------------------------------------------------
+IDENTIFY_STANCE_PROMPT = """You are given a claim and an evidence text, and you need to decide whether the evidence supports, refutes, or is irrelevant to the claim. Choose from the following three options.
+A. The evidence supports the claim.
+B. The evidence refutes the claim.
+C. The evidence is irrelevant to the claim.
+
+For example, you are give Claim: "Preslav is a professor.", Evidence: "Preslav Nakov is a Professor in MBZUAI NLP group, and also the department chair." You should return A
+Pick the correct option A, B, C without other words.
+
+Claim: {claim}
+Evidence: {evidence}"""
+
+IDENTIFY_STANCE_PROMPT_FUNC = """
+Let's define a function named verify(claim:str, evidence:str) -> {{-1,0,1}}
+You are given a claim and an evidence text as input, and you need to decide whether the evidence supports, refutes, or is irrelevant to the claim. Choose from the following three options as the return value.
+1: The evidence supports the claim.
+-1: The evidence refutes the claim.
+0: The evidence is irrelevant to the claim.
+
+For example, when the user call verify(claim="Preslav is a professor.", evidence="Preslav Nakov is a Professor in MBZUAI NLP group, and also the department chair.")
+You should return 1
+Pick the correct option -1, 0, 1 without other words.
+
+verify(claim="{claim}",evidence="{evidence}")"""  # filled via str.format, hence the doubled literal braces in the signature line
+
+
+# , which correspond to the reasoning, whether the given text is factual or not (Boolean - True or False), the factual error present in the text, and the corrected text.
+
+VERIFY_PROMPT = """
+You are given a piece of text. Your task is to identify whether there are any factual errors within the text.
+When you are judging the factuality of the given text, you could reference the provided evidences if needed. The provided evidences may be helpful. Some evidences may contradict to each other. You must be careful when using the evidences to judge the factuality of the given text.
+The response should be a Python dict with four keys - "reasoning", "error", "correction" and "factuality".
+The following is the given text:
+[text]: {claim}
+The following is the provided evidences:
+[evidences]: {evidence}
+You should only respond in format as described below. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '{{'.
+[response format]:
+{{
+ "reasoning": "Why is the given text factual or non-factual? Be careful when you said something is non-factual. When you said something is non-factual, you must provide multiple evidences to support your decision.",
+ "error": "None if the text is factual; otherwise, describe the error in string.",
+ "correction": "A string, the corrected text if there is an error.",
+ "factuality": "An int value, 1 stands for the given text is factual, -1 is for non-factual, and 0 for irrelevant."
+}}
+"""
+# ------------------------------------------
+# Oct 2023
+# ------------------------------------------
+zero_shot_sentence_checkworthiness = """You are a factchecker assistant with task to identify sentences that are checkworthy. Sentence is checkworthy only if it contains factual claims.
+Classify the check-worthiness of these sentences, and output the label yes or no:
+{sentence}
+output:
+"""
+
+zero_shot_claim_checkworthiness = """You are a factchecker assistant with task to identify a sentence, whether it is 1. a factual claim; 2. an opinion; 3. not a claim (like a question or a imperative sentence); 4. other categories. \n
+Output the label index only: \n
+{claim} \n
+output:
+"""
+
+# We found it hard for the model to distinguish complete support from partial support, so the two are merged into one label: support
+zero_shot_claim_evidence_three_stance_prompt = "### Instruction: You are given a claim and an evidence text, and you need to decide whether the evidence supports, refutes, or is irrelevant to the claim.\n\n### Input:\n\nClaim: {claim}\n\nEvidence: {evidence}\n\nOptions are as follows:\n A) The evidence supports the claim.\n\n B) The evidence refutes the claim.\n C) The evidence is irrelevant to the claim.\n\n Pick the correct option. \n\n### Final Answer: "
+
+zero_shot_claim_evidence_stance = """Given the evidence \n {evidence}, determine if the following statement is completely supported, partially supported, refuted or is irrelevant: {claim}, choose from four labels: 1. completely support, 2. partially support, 3. refute and 4. irrelevant.
+Return the label index only.
+Label index:
+"""
+
+zero_shot_nli = """Given the premise sentence {}, determine if the following statement is entailed or contradicted or neutral: {}, by three labels: entailment, contradiction, neutral.
+Label:
+"""
+
+zero_shot_edit_response = """Given a document containing factual errors, please correct the errors in the document depending on a corresponding list of factually true claims. Note that preserve the linguistic features and style of the original document, just correct factual errors.
+
+document: {response}
+
+true claims: {claims}
+
+revised document: """
+
+zero_shot_edit_response_given_question = """Given a question, and an answer containing factual errors, please correct the errors in the document depending on a corresponding list of factually true claims. Note that preserve the linguistic features and style of the original document, just correct factual errors.
+
+question: {prompt}
+
+document: {response}
+
+true claims: {claims}
+
+revised document: """
+
+# -------------------------------------------------------------------
+# July 2023: decompose and decontextualise into atomic claims
+# -------------------------------------------------------------------
+# ZERO_SHOT_SENTENCE_TO_ATOMIC_CLAIMS = """Depending the context: {}, please breakdown the following sentence into independent facts and replace pronouns such as it, they, those, these, this, that, with specific entities or events.
+# The sentence is: {}
+# Atomic facts for this sentence are: """
+
+ZERO_SHOT_SENTENCE_TO_ATOMIC_CLAIMS = """Depending the context: {}, please breakdown the following sentence into independent facts.
+The sentence is: {}
+Atomic facts for this sentence are: """
+
+FEW_SHOT_SENTENCE_TO_ATOMIC_CLAIMS = """Depending the context, please breakdown the following sentence into independent facts.
+
+Context: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021. Obama was the first black president in the history of the United States. He was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya. Trump was the second black president. He was born in New York City and previously served as a businessman and reality television personality.
+
+The sentence is: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021.
+Atomic facts for this sentence are:
+[
+ "The United States has had two black presidents: Barack Obama and Donald Trump.",
+ "Black president Barack Obama served two terms from 2009 to 2017.",
+ "Black president Donald Trump served one term from 2017 to 2021."
+]
+
+The sentence is: Obama was the first black president in the history of the United States.
+Atomic facts for this sentence are:
+[
+ "Obama was the first black president in the history of the United States."
+]
+
+The sentence is: He was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya.
+Atomic facts for this sentence are:
+[
+ "Barack Obama was born in Honolulu, Hawaii.",
+ "Barack Obama mother was from Kansas.",
+ "Barack Obama father was from Kenya."
+]
+
+The sentence is: Trump was the second black president.
+Atomic facts for this sentence are:
+[
+ "Trump was the second black president."
+]
+
+The sentence is: He was born in New York City and previously served as a businessman and reality television personality.
+Atomic facts for this sentence are:
+[
+ "Donald Trump was born in New York City.",
+ "Donald Trump previously served as a businessman",
+ "Donald Trump previously served as a reality television personality."
+]
+
+
+Context: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas. He was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975. Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+The sentence is: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas.
+Atomic facts for this sentence are:
+[
+ "In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas."
+]
+
+The sentence is: He was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975.
+Atomic facts for this sentence are:
+[
+ "Justice William O. Douglas was born on October 16, 1898."
+ "Justice William O. Douglas served on the Supreme Court from 1939 until his retirement in 1975."
+]
+
+The sentence is: Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+Atomic facts for this sentence are:
+[
+ "Therefore, in 1980, Justice Douglas was still alive."
+ "Justice William O. Douglas would have been the oldest serving justice on the Court in 1980."
+]
+
+
+Context: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group. The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+The sentence is: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group.
+Atomic facts for this sentence are:
+[
+ "There have been only four female presidents of the United States in the country's history.",
+ "It is difficult to determine an average height for four female presidents of the United States."
+]
+
+The sentence is: The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+Atomic facts for this sentence are:
+[
+ "Abigail Adams (1797-1801) is a female president of the United States.",
+ "Marilyn Carlson Nelson (2009-2013) is a female president of the United States.",
+ "Luci Baines Johnson (1973-1977) is a female president of the United States.",
+ "Hillary Clinton (2017-2021) is a female president of the United States."
+]
+
+
+Context: {}
+The sentence is: {}
+Atomic facts for this sentence are:
+"""
+
+# This prompt aims to break the document into decontextualised sentences, and then atomic claims
+# Though it cannot decontextualize sentences, it breaks up all sentences better than the prompt above
+# combined with using system_role = "You are good at document decomposition and decontextualization."
+# date: 22/10/2023
+FEW_SHOT_DECONTEXTUALIZE_SENTENCE_ATOMIC_CLAIMS = """Depending the context, please break it down into independent sentences, and breakdown the sentence into independent facts.
+Context: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021. Obama was the first black president in the history of the United States. He was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya. Trump was the second black president. He was born in New York City and previously served as a businessman and reality television personality.
+
+The sentence is: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021.
+Atomic facts for this sentence are:
+[
+ "The United States has had two black presidents: Barack Obama and Donald Trump.",
+ "Black president Barack Obama served two terms from 2009 to 2017.",
+ "Black president Donald Trump served one term from 2017 to 2021."
+]
+
+The sentence is: Obama was the first black president in the history of the United States.
+Atomic facts for this sentence are:
+[
+ "Obama was the first black president in the history of the United States."
+]
+
+The sentence is: Barack Obama was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya.
+Atomic facts for this sentence are:
+[
+ "Barack Obama was born in Honolulu, Hawaii.",
+ "Barack Obama mother was from Kansas.",
+ "Barack Obama father was from Kenya."
+]
+
+The sentence is: Trump was the second black president.
+Atomic facts for this sentence are:
+[
+ "Trump was the second black president."
+]
+
+The sentence is: Donald Trump was born in New York City and previously served as a businessman and reality television personality.
+Atomic facts for this sentence are:
+[
+ "Donald Trump was born in New York City.",
+ "Donald Trump previously served as a businessman",
+ "Donald Trump previously served as a reality television personality."
+]
+
+
+Context: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas. He was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975. Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+The sentence is: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas.
+Atomic facts for this sentence are:
+[
+ "In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas."
+]
+
+The sentence is: Justice William O. Douglas was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975.
+Atomic facts for this sentence are:
+[
+ "Justice William O. Douglas was born on October 16, 1898."
+ "Justice William O. Douglas served on the Supreme Court from 1939 until his retirement in 1975."
+]
+
+The sentence is: Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+Atomic facts for this sentence are:
+[
+ "Therefore, in 1980, Justice Douglas was still alive."
+ "Justice William O. Douglas would have been the oldest serving justice on the Court in 1980."
+]
+
+
+Context: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group. The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+The sentence is: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group.
+Atomic facts for this sentence are:
+[
+ "There have been only four female presidents of the United States in the country's history.",
+ "It is difficult to determine an average height for four female presidents of the United States."
+]
+
+The sentence is: The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+Atomic facts for this sentence are:
+[
+ "Abigail Adams (1797-1801) is a female president of the United States.",
+ "Marilyn Carlson Nelson (2009-2013) is a female president of the United States.",
+ "Luci Baines Johnson (1973-1977) is a female president of the United States.",
+ "Hillary Clinton (2017-2021) is a female president of the United States."
+]
+
+
+Context: {}
+The sentence is: {}
+Atomic facts for this sentence are:
+"""
+
+# -------------------------------------------------------------------
+# April 2023: overall simple pipeline prompts
+# -------------------------------------------------------------------
+DECONTEXTILISATION_PROMPT = """Decompose and decontextualise a document into independently meaningful sentences. This process will make each sentence stand alone that can be verified independently.
+
+Input: Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.
+Output:
+Mary is a five-year old girl.
+Mary likes playing piano.
+Mary doesn't like cookies.
+
+Input: Google began as an online search firm, but it now offers more than 50 Internet services and products, from e-mail and online document creation to software for mobile phones and tablet computers. In addition, its 2012 acquisition of Motorola Mobility put it in the position to sell hardware in the form of mobile phones.
+Output:
+Google began as an online search firm.
+Google now offers more than 50 Internet services and products.
+Google offers from e-mail and online document creation to software for mobile phones and tablet computers.
+Google 2012 acquisition of Motorola Mobility put it in the position to sell hardware in the form of mobile phones.
+
+Input: """  # few-shot prefix ending in an open "Input: " slot; presumably the caller appends the document
+
+CHECK_WORTHINESS_LABEL_ONLY_PROMPT = """Identify whether this claim is an opinion or factual, and whether it is checkworthy or not in the context of fact-checking. Just return two labels without explanation.
+I think Apple is a good company.
+opinion, not checkworthy
+Preslav is a professor in MBZUAI.
+factual, checkworthy
+Friends is a great TV series.
+opinion, not checkworthy
+The Stanford Prison Experiment was conducted in the basement of Encina Hall.
+factual, checkworthy
+"""  # few-shot examples only, no placeholder; presumably the caller appends the claim to classify
+
+ENTITY_EXTRACTION_PROMPT = """Extract all entities of a claim.
+Input: Google now offers more than 50 Internet services and products.
+Output: Google, Internet services, product
+Input: Donald John Trump is an American politician, media personality, and businessman.
+Output: Donald John Trump, American politician, media personality, businessman
+Input: """
+
+QGEN_PROMPT_DEP = """Give a list of queries using for searching related information for a claim.
+Input: Google now offers more than 50 Internet services and products.
+Output: What does Google offers now?
+How many service and product does Google offer?
+Google, more than 50 Internet services, products
+Input: Donald John Trump is an American politician, media personality, and businessman.
+Output: Who is Donald John Trump?
+Give information of Donald John Trump.
+Donald John Trump, American politician
+Donald John Trump, media personality
+Donald John Trump, businessman
+Input: """
+
+QGEN_PROMPT = """I will check things you said and ask questions.
+
+You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+To verify it,
+1. I googled: Does your nose switch between nostrils?
+2. I googled: How often does your nostrils switch?
+3. I googled: Why does your nostril switch?
+4. I googled: What is nasal cycle?
+
+You said: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+To verify it,
+1. I googled: Where was Stanford Prison Experiment was conducted?
+
+You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+To verify it,
+1. I googled: What does Havel-Hakimi algorithm do?
+2. I googled: Who are Havel-Hakimi algorithm named after?
+
+You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+To verify it,
+1. I googled: Who sings the song "Time of My Life"?
+2. I googled: Which film is the song "Time of My Life" from?
+3. I googled: Who produced the song "Time of My Life"?
+
+You said: Kelvin Hopins was suspended from the Labor Party due to his membership in the Conservative Party.
+To verify it,
+1. I googled: Why was Kelvin Hopins suspended from Labor Party?
+
+You said: Social work is a profession that is based in the philosophical tradition of humanism. It is an intellectual discipline that has its roots in the 1800s.
+To verify it,
+1. I googled: What philosophical tradition is social work based on?
+2. I googled: What year does social work have its root in?
+
+You said: {claim}
+To verify it,
+""".strip()
+
+QGEN_PROMPT_FMT = '''
+You need to ask N questions based on the provided claim.
+Here are some examples:
+- Claim:
+Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+- N=4
+- Questions you may response:
+["Does your nose switch between nostrils?", "How often does your nostrils switch?", "Why does your nostril switch?", "What is nasal cycle?"]
+
+- Claim:
+The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+- N=1
+- Questions you may response:
+["Where was Stanford Prison Experiment was conducted?"]
+
+- Claim:
+The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+- N=2
+- Questions you may response:
+["What does Havel-Hakimi algorithm do?", "Who are Havel-Hakimi algorithm named after?"]
+
+Remember, you need to put your questions into a python list so that I will search them with the search engine API, so DON'T RETURN ANY OTHER IRRELEVANT WORDS!
+- Claim:
+{claim}
+- N={n}
+'''.strip()  # template fields: {claim} and {n}; every example's questions must belong to its own claim
+
+STANCE_DETECTION_PROMPT = """Determine whether the evidence support the claim or not. Choose label from [support, partial support, refute, other] and explain why.
+Support means we can entail the claim by the evidence.
+Partial support means: part of the information presented in the claim appear in the evidence.
+Refute means that the evidence mention the same event as the claim, but a clear opposite fact. It should be highlighed that under refute, the evidence mentions the fact in the claim, they are closely relevant, but opposite meaning or stance.
+Other means the evidence does not mention anything about the fact described in the claim, such that it neither supports nor refutes the claim.
+
+Claim: Elon Musk is the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: support, statement 'he is also the founder, CEO and chief engineer of SpaceX' in evidence above supports the claim.
+
+Claim: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the founder, CEO and chief engineer of SpaceX.
+Stance: partial support.
+
+Claim: Steve Jobs is the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: refute.
+
+Claim: Elon Musk is a professor in The Stanford University.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: other, according to the evidence, I cannot judge whether the claim is true or not, not enough information, the evidence neither supports nor refutes.
+
+Claim: On January 6, 2021, a mob of supporters of former President Donald Trump stormed the U.S. Capitol in an attempt to overturn the 2020 presidential election.
+Evidence: On January 6, 2021, following the defeat of U.S. President Donald Trump in the 2020 presidential election, a mob of his supporters attacked the United States Capitol Building in Washington, D.C. The mob sought to keep Trump in power by preventing a joint session of Congress from counting the electoral college votes to formalize the victory of President-elect Joe Biden.
+Stance: support.
+
+Claim: The 2021 Capitol Hill riots resulted in the deaths of five people, including a Capitol police officer.
+Evidence: Five people died either shortly before, during, or following the riot: one was shot by Capitol Police, another died of a drug overdose, and three died of natural causes.
+Stance: partial support, the evidence supports that fact that five deaths, but not sure whether they include a Capitol police officer or not.
+
+Claim: More than 300 people have been charged with crimes related to the riots.
+Evidence: As of November 10, 2022, over 940 people had been charged in the Capitol breach.
+Stance: refute, evidence and claim are describing the same thing, the number of people who was charged is over 940, while more than 300 in the claim, so the evidence refutes the claim.
+
+Claim: More than 300 people have been charged with crimes related to the riots.
+Evidence: The laptop computer taken from Pelosi's office was taken by 22-year-old Capitol rioter Riley Williams. Williams was arrested and indicted on eight counts, including theft of government property, obstructing an official proceeding, and assaulting or resisting police.
+Stance: other, the evidence demonstrates something relevent to the fact in the claim, but it does not support or refute any information of it.
+
+Claim: {}
+Evidence: {}
+Stance: """
+
+EDITOR_PROMPT = """Fix the claim according to the evidence.
+
+Claim: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+Evidence: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+This suggests 45 minutes switch time in your statement is wrong.
+Fix: Your nose switches back and forth between nostrils. When you sleep, you switch about every 2 hours. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+
+Claim: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+Evidence: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+This suggests General Thomas Hall in your statement is wrong.
+Fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+Claim: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+Evidence: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+This suggests Encina Hall in your statement is wrong.
+Fix: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+Claim: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+Evidence: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+Fix: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. It is named after Vaclav Havel and Samih Hakimi.
+
+Claim: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Phil Ramone.
+Evidence: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+This suggests "Time of My Life" producer name in your statement is wrong.
+Fix: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+
+Claim: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+Evidence: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+This suggests the 1.4 million square feet of built-up space in your statment is wrong.
+Fix: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+Claim: {claim}
+Evidence: {evidence}
+This suggests
+""".strip()
diff --git a/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_vfr.py b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_vfr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f0255db785f700c8454fe5995506a91874f219
--- /dev/null
+++ b/src/openfactcheck/solvers/factcheckgpt/factcheckgpt_vfr.py
@@ -0,0 +1,119 @@
+import json
+from typing import Any
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .factcheckgpt_utils.prompt import VERIFY_PROMPT
+from .factcheckgpt_utils.openai_api import gpt
+from .factcheckgpt_utils.data_util import save_to_file
+from .factcheckgpt_utils.prompt import IDENTIFY_STANCE_PROMPT, IDENTIFY_STANCE_PROMPT_FUNC
+from .factcheckgpt_utils.nli import nli_infer
+
+@Solver.register("factcheckgpt_verifier", "claims_with_evidences", "label")
+class FactCheckGPTVerifier(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+ self.stance_model = args.get("stance_model", "gpt-3.5-turbo")
+ self.num_retries = self.global_config.get("num_retries", 3)
+ # self.system_role = args.get("system_role", "You are a helpful factchecker assistant.")
+ self.system_role = "You are a helpful factchecker assistant."
+ self.verify_retries = args.get("verify_retries", 3)
+ self.stance_map = {
+ 1: "support",
+ -1: "refute",
+ 0: "irrelevant"
+ }
+
+ def verify_by_stance(
+ self, claim: str,
+ evidences: list[str],
+ ) -> Any:
+ labels = []
+ for evidence in evidences:
+ labels.append(self.stance(evidence, claim))
+
+ # based on stances of evidence, determine the true/false claim by rules
+ # if there is one evidence supports, we assume it is correct
+ if 1 in labels:
+ return 1
+ # if there isn't support, but refute and irrelevant, we regard as false
+ elif -1 in labels:
+ return -1
+ else:
+ # all irrelevant
+ return 0
+
+ def identify_stance_gpt(self, evidence, claim):
+ user_input = IDENTIFY_STANCE_PROMPT_FUNC.format(claim=claim, evidence=evidence)
+ r = gpt(
+ user_input,
+ model=self.stance_model,
+ system_role=self.system_role,
+ num_retries=self.num_retries
+ )
+ label = 0
+ try:
+ label = eval(r)
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}.")
+ return label
+
+ def stance(self, evidence, claim, model="gpt-3.5-turbo"):
+ """input: a claim and an evidence
+ output: label in [support, refute, irrelevant]"""
+ label = 0
+ if self.stance_model == "nli":
+ label = nli_infer(premise=evidence, hypothesis=claim)
+ elif "gpt" in self.stance_model:
+ label = self.identify_stance_gpt(evidence, claim)
+ else:
+ print("Check the model argument, choose either gpt or nli model")
+ return label
+
+    def verify_claim(self, claim: str, evidences: list[str]) -> dict[str, Any]:
+        """Ask the model to verify `claim`; fall back to stance aggregation if no dict can be parsed."""
+        results = None
+        user_input = VERIFY_PROMPT.format(claim=claim, evidence=evidences)
+        r = ''
+        for _ in range(self.verify_retries):
+            r = gpt(
+                user_input,
+                model=self.stance_model,
+                system_role=self.system_role,
+                num_retries=self.num_retries,
+            )
+            try:
+                results = eval(r)  # NOTE(review): eval on raw model output is unsafe; consider ast.literal_eval
+                break
+            except Exception as e:
+                print(f"An unexpected error occurred to eval {r}: {e}.")
+            try:
+                results = json.loads(r)  # second chance: the reply may be JSON rather than a Python literal
+                break  # parsed successfully, so do not spend another model call
+            except Exception as e:
+                print(f"An unexpected error occurred to parse json {r}: {e}.")
+                save_to_file(r, "verification_error.txt")
+
+        if isinstance(results, dict):
+            return results
+        print(f"Error output {r}. It does not output a dict, return factual label by stance aggregation.")
+        factual_label = self.verify_by_stance(claim, evidences)  # 1 support, -1 refute, 0 irrelevant
+        return {
+            "reasoning": "",
+            "error": "",
+            "correction": "",
+            "factuality": factual_label
+        }
+
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+ claims_with_evidences = state.get(self.input_name)
+ results = []
+ for claim, evidences in claims_with_evidences.items():
+ result = self.verify_claim(claim, [x[1] for x in evidences])
+ result["claim"] = claim
+ result["evidences"] = evidences
+ results.append(result)
+ state.set(self.output_name, all([x['factuality'] > 0 for x in results]))
+ state.set("detail", results)
+ return True, state
diff --git a/src/openfactcheck/solvers/factool/README.factool.md b/src/openfactcheck/solvers/factool/README.factool.md
new file mode 100644
index 0000000000000000000000000000000000000000..4294e7a6fd4d0556f8c9c7fd20a30bfc27f14384
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/README.factool.md
@@ -0,0 +1,75 @@
+# Factool integration into the LLM Fact Checker DEMO
+
+## Advanced Usage
+
+The **Factool** integration follows the guidelines and architecture of the **llm-fact-checker**. The idea behind the current implementation is to maximize compatibility between the newly developed solvers and the ones already present in the **llm_fact_checker** code. Default *evidence* JSON files are produced in the same format and with the same default names. The I/O interfaces of the solvers, in both the monolith (blackbox) and micro-service implementations, are 100% compatible with those of their **GPT** integration (default **llm_fact_checker**) counterparts. The only difference is the *path_save_analysis* parameter of the **factool_blackbox_post_editor** solver, which sets the JSON file to which the **Factool** output of the blackbox (monolith) solver is saved.
+An example pipeline is provided in ```factool_config.yaml```. The **Factool** blackbox (monolith) integration is configured by ```factool_blackbox_config.yaml```.
+A pipeline with the micro-service **Factool** setting:
+```yaml
+openai_key:
+serper_key:
+scraper_key:
+solvers:
+ all_pass_abstain_detector:
+ input_name: response
+ output_name: response
+ factool_decontextualizer:
+ llm_in_use: gpt-4
+ input_name: response
+ output_name: claims
+ factool_evidence_retriever:
+ llm_in_use: gpt-4
+ input_name: claims
+ output_name: evidences
+ factool_claim_examiner:
+ llm_in_use: gpt-4
+ input_name: evidences
+ output_name: claim_info
+ factool_post_editor:
+ input_name: claim_info
+ output_name: claim_info
+ concat_response_generator:
+ input_name: claim_info
+ output_name: output
+```
+Here, the **[OpenAI](https://beta.openai.com/)**, **[Serper](https://serper.dev/)** and **[Scraper](https://www.scraperapi.com/)** API keys are mandatory for the proper functioning of the **Factool** class. The solvers are identical to the well-known solvers from the **GPT** integration. The *llm_in_use* parameter names the **OpenAI** LLM employed by the **Factool** components.
+The pipeline for the blackbox (monolith) **Factool** is similar, but with less inherent dynamics, employing the **Factool** *class* instead of its logically separated components:
+```yaml
+openai_key:
+serper_key:
+scraper_key:
+solvers:
+ all_pass_abstain_detector:
+ input_name: response
+ output_name: response
+ factool_blackbox:
+ llm_in_use: gpt-4
+ input_prompt: question
+ input_name: response
+ output_name: claim_info
+ factool_blackbox_post_editor:
+ path_save_analysis: factool_evidence_analysis.json
+ input_name: claim_info
+ output_name: claim_info
+ concat_response_generator:
+ input_name: claim_info
+ output_name: output
+```
+
+## Example
+
+The following example code runs the Factool micro-services pipeline:
+```python
+from pipeline import Pipeline
+from argparse import Namespace
+
+args = Namespace(
+ user_src='../src/solvers',
+ config='../config/factool_config.yaml',
+ output='./truth'
+)
+p = Pipeline(args)
+question = "Who is Alan Turing?"
+response = "Alan Turing used to be Serbian authoritarian leader, mathematician and computer scientist. He used to be a leader of the French Resistance."
+print(p(question=question, response=response))
+```
diff --git a/src/openfactcheck/solvers/factool/__init__.py b/src/openfactcheck/solvers/factool/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/factool/all_pass_abstain_detector.py b/src/openfactcheck/solvers/factool/all_pass_abstain_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbc03360bdbd99efea8f646c201c1682fb5b6790
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/all_pass_abstain_detector.py
@@ -0,0 +1,12 @@
+import logging
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+@Solver.register("all_pass_abstain_detector", "response", "response")
+class AllPassAbstainDetector(StandardTaskSolver):  # abstain detector that lets every response through
+    def __init__(self, args):
+        super().__init__(args)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        return True, state  # never abstains: report success and hand the state back untouched
diff --git a/src/openfactcheck/solvers/factool/concat_response_regenerator.py b/src/openfactcheck/solvers/factool/concat_response_regenerator.py
new file mode 100644
index 0000000000000000000000000000000000000000..57cbf5d04855928ce1e595ce751e3cb2176892ab
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/concat_response_regenerator.py
@@ -0,0 +1,18 @@
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+
+
+@register_solver("concat_response_generator", "claim_info", "output")
+class ConcatResponseRegenerator(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+ claim_info = state.get(self.input_name)
+
+ edited_claims = [v["edited_claims"] for _, v in claim_info.items()]
+ revised_document = " ".join(edited_claims).strip()
+ # print(revised_document)
+ state.set(self.output_name, revised_document)
+ return True, state
diff --git a/src/openfactcheck/solvers/factool/factool_blackbox.py b/src/openfactcheck/solvers/factool/factool_blackbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..eecae3e2eff4d5a9b55068f843be7a378f15d44e
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_blackbox.py
@@ -0,0 +1,41 @@
+from core import register_solver, FactCheckerState, StandardTaskSolver
+from factool import Factool
+import os
+
+
+##
+#
+# Factool Solver
+#
+# Notes:
+# Factool requires 3 input parameters: prompt, response, and category.
+# Category is always set to 'kbqa' (Knowledge Base Question Answering) for the purposes of this project.
+# Because of employing a pipeline of its own, with specific search engine and analysis tools, Factool requires several API keys to be set as environment variables.
+# That is:
+# openai_key - OpenAI API key (https://beta.openai.com/)
+# serper_key - Serper API key (https://serper.dev/)
+#   scraper_key - Scraper API key (https://www.scraperapi.com/)
+# Additional parameters:
+# llm_in_use - The OpenAI LLM in use (e.g. gpt-4)
+#
+##
+@register_solver("factool_blackbox", "response", "claim_info")
+class FactoolBlackboxSolver(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+ self.input_prompt = args.get("input_prompt", None)
+ self.gpt_model = self.global_config.get("llm_in_use", "gpt-4")
+ # self.input_prompt = args["input_prompt"] if "input_prompt" in args else None
+ # self.gpt_model = args["llm_in_use"] if "llm_in_use" in args else "gpt-4"
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Run Factool on the prompt/response pair and store its report under the configured output name."""
+        prompt = state.get(self.input_prompt)
+        response = state.get(self.input_name)
+
+        factool_instance = Factool(self.gpt_model)
+        # Factool takes a batch of inputs; the category is fixed to 'kbqa' for this project.
+        inputs = [{"prompt": prompt, "response": response, "category": "kbqa"}]
+        claim_info = factool_instance.run(inputs)
+        state.set(self.output_name, claim_info)  # honour output_name instead of a hard-coded key
+        return True, state
diff --git a/src/openfactcheck/solvers/factool/factool_blackbox_post_editor.py b/src/openfactcheck/solvers/factool/factool_blackbox_post_editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53e87f13d4b66dbfaf4d39e0ce6c7f13b17cb36
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_blackbox_post_editor.py
@@ -0,0 +1,82 @@
+import json
+from core import register_solver, FactCheckerState, StandardTaskSolver
+
+# Factool example.py category=kbqa response protocol
+'''
+{
+ 'average_claim_level_factuality': 0.0,
+ 'average_response_level_factuality': 0.0,
+ 'detailed_information': [
+ {
+ 'prompt': 'Introduce Graham Neubig',
+ 'response': 'Graham Neubig is a professor at MIT',
+ 'category': 'kbqa',
+ 'claims': [
+ {
+ 'claim': 'Graham Neubig is a professor at MIT'
+ }
+ ],
+ 'queries': [
+ [ 'Is Graham Neubig a professor at MIT?', 'Graham Neubig professorship' ]
+ ],
+ 'evidences': [
+ {
+ 'evidence': [ 'I am an Associate Professor at the Carnegie Mellon University Language Technology Institute in the School of Computer Science, and work with a bunch of great ...', 'Missing: MIT? | Show results with:MIT?', 'EI Seminar - Graham Neubig - Learning to Explain and ...', 'Duration: 57:54', 'Posted: Feb 17, 2023', 'I am an Associate Professor at the Carnegie Mellon University Language Technology Institute in the School of Computer Science, and work with a bunch of great ...', 'My research is concerned with language and its role in human communication. In particular, my long-term research goal is to break down barriers in human-human ...', 'Graham Neubig. Associate Professor. Research Interests: Machine Translation · Natural Language Processing · Spoken Language Processing · Machine Learning. My ...', "I am an Associate Professor of Computer Science at Carnegie Mellon University and CEO of… | Learn more about Graham Neubig's work experience, education, ...", 'Graham Neubig received the B.E. degree from the University of Illinois, Urbana ... He is currently an Assistant Professor with Carnegie Mellon University ...' ],
+ 'source': [ 'http://www.phontron.com/', 'http://www.phontron.com/', 'https://youtube.com/watch?v=CtcP5bvODzY', 'https://youtube.com/watch?v=CtcP5bvODzY', 'https://youtube.com/watch?v=CtcP5bvODzY', 'http://www.phontron.com/', 'https://www.phontron.com/research.php', 'https://lti.cs.cmu.edu/people/222217661/graham-neubig', 'https://www.linkedin.com/in/graham-neubig-10b41616b', 'https://ieeexplore.ieee.org/author/37591106000' ]
+ }
+ ],
+ 'claim_level_factuality': [
+ {
+ 'reasoning': 'The given text is non-factual. Multiple pieces of evidence indicate that Graham Neubig is an Associate Professor at the Carnegie Mellon University Language Technology Institute in the School of Computer Science, not at MIT.',
+ 'error': 'Graham Neubig is not a professor at MIT.',
+ 'correction': 'Graham Neubig is a professor at Carnegie Mellon University.',
+ 'factuality': False,
+ 'claim': 'Graham Neubig is a professor at MIT'
+ }
+ ],
+ 'response_level_factuality': False
+ }
+ ]
+}
+'''
+
+##
+#
+# Factool Data Post-Editor
+#
+# Notes:
+#   Factool response post-processor. Used to present the results in human-readable format and to save the analysis in a JSON file.
+#
+##
+@register_solver("factool_blackbox_post_editor", "claim_info", "claim_info")
+class FactoolBlackboxPostEditor(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+ self.path_save_analysis = args.get("path_save_analysis","factool_evidence_analysis.json")
+
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+ claim_info = state.get(self.input_name)
+
+ # Restructure some of the output for concatenation (corrected claims)
+ edited_claims = ''
+ for clf in claim_info['detailed_information'][0]['claim_level_factuality']:
+ edited_claims += 'Claim: "' + clf['claim'] + '" => '
+ edited_claims += ('' if (clf['error'] == 'None' or len(clf['error']) == 0) else (clf['error'] + ' '))
+ edited_claims += ('' if (clf['reasoning'] == 'None' or len(clf['reasoning']) == 0) else clf['reasoning'])
+ edited_claims += ((' ' + clf['claim']) if (clf['correction'] == 'None' or len(clf['correction']) == 0) else (' ' + clf['correction']))
+ edited_claims += '\n'
+ edited_claims = edited_claims[:-1]
+ new_claim_info = {}
+ new_claim_info[claim_info['detailed_information'][0]['response']] = {
+ "edited_claims": edited_claims
+ }
+
+ # Serializing json
+ json_object = json.dumps(claim_info, indent=4)
+
+ # Writing to sample.json
+ with open(self.path_save_analysis, "w") as outfile:
+ outfile.write(json_object)
+
+ state.set(self.output_name, new_claim_info)
+ return True, state
diff --git a/src/openfactcheck/solvers/factool/factool_claim_examiner.py b/src/openfactcheck/solvers/factool/factool_claim_examiner.py
new file mode 100644
index 0000000000000000000000000000000000000000..39ac40aa24ae552a3415639c2d2e9a3328405a32
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_claim_examiner.py
@@ -0,0 +1,131 @@
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+from .ftool_utils.chat_api import OpenAIChat
+import yaml
+import os
+import json
+
+
+##
+#
+# Factool Claim Examiner
+#
+# Notes:
+# - This solver is used to examine the claims in a response.
+#
+##
+@register_solver("factool_claim_examiner", "evidences", "claim_info")
+class FactoolClaimExaminer(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+ self.gpt_model = self.global_config.get("llm_in_use", "gpt-4")
+ self.path_save_stance = args.get("path_save_stance", "evidence_stance.json")
+ self.verifications = None
+ self.gpt = OpenAIChat(self.gpt_model)
+ self.verification_prompt = yaml.load(
+ open(
+ os.path.join(
+ os.path.dirname(os.path.abspath(__file__)),
+ "ftool_utils/prompts.yaml",
+ ),
+ "r",
+ ),
+ yaml.FullLoader,
+ )["verification"]
+
+ # async def coro (self, factool_instance, claims_in_response, evidences):
+ # self.verifications = await factool_instance.pipelines["kbqa_online"]._verification(claims_in_response, evidences)
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Verify each claim against its evidence and attach the resulting stance text to claim_info."""
+        claim_info = state.get(self.input_name)
+        claims_in_response = []
+        queires = []
+        search_outputs_for_claims = []
+        for key, pair in claim_info.items():
+            claim = key or pair["claim"]
+            claims_in_response.append({"claim": claim})
+            queires.append(pair["automatic_queries"])
+            search_outputs_for_claim = []
+            for evidence in pair["evidence_list"]:
+                search_outputs_for_claim.append(
+                    {
+                        "content": evidence["web_page_snippet_manual"],
+                        "source": evidence["url"],
+                    }
+                )
+            search_outputs_for_claims.append(search_outputs_for_claim)
+
+        claims_with_evidences = {k: [u['web_page_snippet_manual'] for u in claim_info[k]['evidence_list']] for k in
+                                 claim_info.keys()}
+        verifications = self._verification(claims_with_evidences)
+
+        # `verifications` holds one entry per claim, in claim_info order. An entry is
+        # None when the model reply could not be parsed into a dict (see
+        # OpenAIChat.run); otherwise it is expected to carry the "error",
+        # "reasoning" and "correction" fields read below.
+
+        # Attach the verifications (stances) to the claim_info
+        for index, (key, pair) in enumerate(claim_info.items()):
+            # No usable verification: fall back to echoing the claim unchanged.
+            # Otherwise concatenate error, reasoning and correction (or the claim when
+            # there is no correction); the literal string "None" and empty strings both
+            # count as missing. The same stance text is copied onto every evidence entry.
+            stance = ""
+            if (
+                verifications[index] is None
+                or verifications[index] == "None"
+            ):
+                stance = claims_in_response[index]["claim"]
+            else:
+                stance = (
+                    ""
+                    if (
+                        verifications[index]["error"] == "None"
+                        or len(verifications[index]["error"]) == 0
+                    )
+                    else (verifications[index]["error"] + " ")
+                )
+                stance += (
+                    ""
+                    if (
+                        verifications[index]["reasoning"] == "None"
+                        or len(verifications[index]["reasoning"]) == 0
+                    )
+                    else verifications[index]["reasoning"]
+                )
+                stance += (
+                    (" " + claims_in_response[index]["claim"])
+                    if (
+                        verifications[index]["correction"] == "None"
+                        or len(verifications[index]["correction"]) == 0
+                    )
+                    else (" " + verifications[index]["correction"])
+                )
+            claim_info[key]["stances"] = [stance]
+            for j in range(len(claim_info[key]["evidence_list"])):
+                claim_info[key]["evidence_list"][j]["stance"] = stance
+
+        # write to json file
+        # Serializing json
+        json_object = json.dumps(claim_info, indent=4)
+
+        # Persist the annotated claim_info to self.path_save_stance
+        with open(self.path_save_stance, "w") as outfile:
+            outfile.write(json_object)
+
+        # print(claim_info)
+
+        state.set(self.output_name, claim_info)
+        return True, state
+
+    def _verification(self, claims_with_evidences):
+        """Ask the model for a dict verdict per claim; each evidence value is a list of snippet strings."""
+        messages_list = [
+            [
+                {"role": "system", "content": self.verification_prompt['system']},
+                {"role": "user", "content": self.verification_prompt['user'].format(claim=claim, evidence=str(evidence))},
+            ]
+            for claim, evidence in claims_with_evidences.items()
+        ]
+        return self.gpt.run(messages_list, dict)
diff --git a/src/openfactcheck/solvers/factool/factool_decontextualizer.py b/src/openfactcheck/solvers/factool/factool_decontextualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..279c286861fc65dadeb1e9350f05c12b45ebd88e
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_decontextualizer.py
@@ -0,0 +1,58 @@
+from core import register_solver, StandardTaskSolver, FactCheckerState
+import asyncio
+import nest_asyncio
+from factool import Factool
+from .ftool_utils.chat_api import OpenAIChat
+import yaml
+import os
+from typing import List
+
+
+##
+#
+# Factool Decontextualizer
+#
+# Notes:
+# - This solver is used to extract claims from a response.
+# - The response should be a string.
+#
+##
+@register_solver("factool_decontextualizer", "response", "claims")
+class FactoolDecontextualizer(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+ self.gpt_model = self.global_config.get("llm_in_use", "gpt-3.5-turbo")
+ self.gpt = OpenAIChat(self.gpt_model)
+ self.claim_prompt = yaml.load(
+ open(
+ os.path.join(
+ os.path.dirname(os.path.abspath(__file__)),
+ "ftool_utils/prompts.yaml",
+ ),
+ "r",
+ ),
+ yaml.FullLoader,
+ )["claim_extraction"]
+
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+ response = state.get(self.input_name)
+
+ claims = self._claim_extraction(responses=[response])[0]
+
+ extracted_claims = [claim["claim"] for claim in claims]
+
+ state.set(self.output_name, extracted_claims)
+ return True, state
+
+ def _claim_extraction(self, responses):
+ messages_list = [
+ [
+ {"role": "system", "content": self.claim_prompt["system"]},
+ {
+ "role": "user",
+ "content": self.claim_prompt["user"].format(input=response),
+ },
+ ]
+ for response in responses
+ ]
+ return self.gpt.run(messages_list, List)
diff --git a/src/openfactcheck/solvers/factool/factool_evidence_retriever.py b/src/openfactcheck/solvers/factool/factool_evidence_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..b86e16e44ed08650e3f1c74f796fc4d09a625df5
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_evidence_retriever.py
@@ -0,0 +1,100 @@
+from core import register_solver, StandardTaskSolver, FactCheckerState
+from typing import List, Dict, Any
+import json
+from .ftool_utils.chat_api import OpenAIChat
+from .ftool_utils.search_api import GoogleSerperAPIWrapper
+import yaml
+import os
+
+##
+#
+# Factool Evidence Retriever
+#
+# Notes:
+# - This solver is used to retrieve evidences (online content + its sources) for a list of claims.
+# - The claims should be a list of strings.
+# - The evidences are saved in a JSON file.
+#
+##
+@register_solver("factool_evidence_retriever", "claims", "evidences")
+class FactoolEvidenceRetriever(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+ self.gpt_model = self.global_config.get("llm_in_use", "gpt-4")
+ self.gpt = OpenAIChat(self.gpt_model)
+ self.path_save_evidence = args.get("path_save_evidence", "evidence.json")
+ # self.path_save_evidence = args["path_save_evidence"] if "path_save_evidence" in args else "evidence.json"
+ self.queries = None
+ self.search_outputs_for_claims = None
+
+ self.query_prompt = yaml.load(
+ open(
+ os.path.join(
+ os.path.dirname(os.path.abspath(__file__)),
+ "ftool_utils/prompts.yaml",
+ ),
+ "r",
+ ),
+ yaml.FullLoader,
+ )["query_generation"]
+
+ self.search_engine = GoogleSerperAPIWrapper(snippet_cnt=10)
+
+
+ # async def coro_queries (self, factool_instance, claims_in_response):
+ # self.queries = await factool_instance.pipelines["kbqa_online"]._query_generation(claims_in_response)
+ # async def coro_search_outputs_for_claims (self, factool_instance):
+ # self.search_outputs_for_claims = await factool_instance.pipelines["kbqa_online"].tool.run(self.queries)
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+ claims = state.get(self.input_name)
+
+ queries = self._query_generation(claims=claims)
+ search_outputs_for_claims = self.search_engine.run(queries)
+
+
+ evidences: Dict[str, Dict[str, Any]] = {}
+ for i, claim in enumerate(claims):
+ evidence_list: List[dict] = []
+ for j, search_outputs_for_claim in enumerate(
+ search_outputs_for_claims[i]
+ ):
+ evidence_list.append(
+ {
+ "evidence_id": j,
+ "web_page_snippet_manual": search_outputs_for_claim["content"],
+ "query": [queries[i]],
+ "url": search_outputs_for_claim["source"],
+ "web_text": [],
+ }
+ )
+ evidences[claim] = {
+ "claim": claim,
+ "automatic_queries": queries[i],
+ "evidence_list": evidence_list,
+ }
+
+ # write to json file
+ # Serializing json
+ json_object = json.dumps(evidences, indent=4)
+
+ # Writing to sample.json
+ with open(self.path_save_evidence, "w") as outfile:
+ outfile.write(json_object)
+
+ # print(evidences)
+
+ state.set(self.output_name, evidences)
+ return True, state
+
+ def _query_generation(self, claims):
+ messages_list = [
+ [
+ {"role": "system", "content": self.query_prompt["system"]},
+ {
+ "role": "user",
+ "content": self.query_prompt["user"].format(input=claim),
+ },
+ ]
+ for claim in claims
+ ]
+ return self.gpt.run(messages_list, List)
diff --git a/src/openfactcheck/solvers/factool/factool_example.py b/src/openfactcheck/solvers/factool/factool_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e89ced67d507e2d978b500dfabfcbd1d57e1483
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_example.py
@@ -0,0 +1,18 @@
+import os
+from pipeline import Pipeline
+from argparse import Namespace
+
+# Base directory where the script is located
+base_dir = os.path.abspath(os.path.dirname(__file__))
+
+args = Namespace(
+ user_src=os.path.join(base_dir),
+ config=os.path.join(base_dir, "../../config/factool_config.yaml"),
+ output=os.path.join(base_dir, "../../../output")
+)
+
+p = Pipeline(args)
+question = "Who is Alan Turing?"
+response = "Alan Turing was a British mathematician, logician, cryptanalyst, and computer scientist. He was highly influential in the development of theoretical computer science."
+
+print(p(question=question, response=response))
diff --git a/src/openfactcheck/solvers/factool/factool_post_editor.py b/src/openfactcheck/solvers/factool/factool_post_editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f04b8bb922db18231b01b2a87ea648a8b9470b1
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/factool_post_editor.py
@@ -0,0 +1,23 @@
+from core import register_solver, FactCheckerState, StandardTaskSolver
+
+##
+#
+# Factool Data Post-Editor
+#
+# Notes:
+#   Factool response post-processor. Copies the first stance of every claim into its 'edited_claims' field so the results can be presented in human-readable format.
+#
+##
+@register_solver("factool_post_editor", "claim_info", "claim_info")
+class FactoolPostEditor(StandardTaskSolver):
+ def __init__(self, args):
+ super().__init__(args)
+
+ def __call__(self, state: FactCheckerState, *args, **kwargs):
+ claim_info = state.get(self.input_name)
+
+ for key, pair in claim_info.items():
+ claim_info[key]['edited_claims'] = claim_info[key]['stances'][0]
+
+ state.set(self.output_name, claim_info)
+ return True, state
diff --git a/src/openfactcheck/solvers/factool/ftool_utils/__init__.py b/src/openfactcheck/solvers/factool/ftool_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/factool/ftool_utils/chat_api.py b/src/openfactcheck/solvers/factool/ftool_utils/chat_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ec67b8b8264a2c7c34bafc6b0e3cd056d808b0
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/ftool_utils/chat_api.py
@@ -0,0 +1,224 @@
+# the async version is adapted from https://gist.github.com/neubig/80de662fb3e225c18172ec218be4917a
+
+from __future__ import annotations
+
+import os
+import yaml
+import openai
+import ast
+import pdb
+import asyncio
+from typing import Any, List
+import os
+import pathlib
+import openai
+from openai import OpenAI, AsyncOpenAI
+import re
+
+
+# from factool.env_config import factool_env_config
+
+# env
+# openai.api_key = factool_env_config.openai_api_key
+
+class OpenAIChat():
+ def __init__(
+ self,
+ model_name='gpt-3.5-turbo',
+ max_tokens=2500,
+ temperature=0,
+ top_p=1,
+ request_timeout=120,
+ ):
+ if 'gpt' not in model_name:
+ openai.api_base = "http://localhost:8000/v1"
+ else:
+ #openai.api_base = "https://api.openai.com/v1"
+ openai.api_key = os.environ.get("OPENAI_API_KEY", None)
+ assert openai.api_key is not None, "Please set the OPENAI_API_KEY environment variable."
+ assert openai.api_key !='', "Please set the OPENAI_API_KEY environment variable."
+ self.client = AsyncOpenAI()
+
+ self.config = {
+ 'model_name': model_name,
+ 'max_tokens': max_tokens,
+ 'temperature': temperature,
+ 'top_p': top_p,
+ 'request_timeout': request_timeout,
+ }
+
+ def extract_list_from_string(self, input_string):
+ # pattern = r'\[.*\]'
+ # result = re.search(pattern, input_string)
+ # if result:
+ # return result.group()
+ # else:
+ # return None
+ start_index = input_string.find('[')
+ end_index = input_string.rfind(']')
+
+ if start_index != -1 and end_index != -1 and start_index < end_index:
+ return input_string[start_index:end_index + 1]
+ else:
+ return None
+
+ def extract_dict_from_string(self, input_string):
+ start_index = input_string.find('{')
+ end_index = input_string.rfind('}')
+
+ if start_index != -1 and end_index != -1 and start_index < end_index:
+ return input_string[start_index:end_index + 1]
+ else:
+ return None
+
+ def _boolean_fix(self, output):
+ return output.replace("true", "True").replace("false", "False")
+
+ def _type_check(self, output, expected_type):
+ try:
+ output_eval = ast.literal_eval(output)
+ if not isinstance(output_eval, expected_type):
+ return None
+ return output_eval
+ except:
+ '''
+ if(expected_type == List):
+ valid_output = self.extract_list_from_string(output)
+ output_eval = ast.literal_eval(valid_output)
+ if not isinstance(output_eval, expected_type):
+ return None
+ return output_eval
+ elif(expected_type == dict):
+ valid_output = self.extract_dict_from_string(output)
+ output_eval = ast.literal_eval(valid_output)
+ if not isinstance(output_eval, expected_type):
+ return None
+ return output_eval
+ '''
+ return None
+
+    async def dispatch_openai_requests(
+        self,
+        messages_list,
+    ) -> list[str]:
+        """Dispatches requests to OpenAI API asynchronously.
+
+        Args:
+            messages_list: List of messages to be sent to OpenAI ChatCompletion API.
+        Returns:
+            One entry per input, in input order: the API response, or None if every retry failed.
+        """
+        async def _request_with_retry(messages, retry=3):
+            for _ in range(retry):
+                try:
+                    response = await self.client.chat.completions.create(
+                        model=self.config['model_name'],
+                        messages=messages,
+                        max_tokens=self.config['max_tokens'],
+                        temperature=self.config['temperature'],
+                        top_p=self.config['top_p']
+                    )
+                    return response
+                except openai.RateLimitError:
+                    await asyncio.sleep(1)
+                except openai.APITimeoutError:
+                    await asyncio.sleep(1)
+                except openai.APIError:
+                    await asyncio.sleep(1)
+                # NOTE: with the openai>=1.0 client used here, openai.Timeout is a timeout
+                # configuration type rather than an exception, so naming it in an except
+                # clause raises TypeError; APITimeoutError is the exception to catch.
+                # except openai.error.RateLimitError:
+                #     print('Rate limit error, waiting for 40 second...')
+                #     await asyncio.sleep(40)
+                # except openai.error.APIError:
+                #     print('API error, waiting for 1 second...')
+                #     await asyncio.sleep(1)
+                # except openai.error.Timeout:
+                #     print('Timeout error, waiting for 1 second...')
+                #     await asyncio.sleep(1)
+                # except openai.error.ServiceUnavailableError:
+                #     print('Service unavailable error, waiting for 3 second...')
+                #     await asyncio.sleep(3)
+                # except openai.error.APIConnectionError:
+                #     print('API Connection error, waiting for 3 second...')
+                #     await asyncio.sleep(3)
+
+            return None
+
+        async_responses = [
+            _request_with_retry(messages)
+            for messages in messages_list
+        ]
+
+        return await asyncio.gather(*async_responses)
+
+ def run(self, messages_list, expected_type):
+ retry = 1
+ responses = [None for _ in range(len(messages_list))]
+ messages_list_cur_index = [i for i in range(len(messages_list))]
+
+ while retry > 0 and len(messages_list_cur_index) > 0:
+ print(f'{retry} retry left...')
+ messages_list_cur = [messages_list[i] for i in messages_list_cur_index]
+
+ predictions = asyncio.run(self.dispatch_openai_requests(
+ messages_list=messages_list_cur,
+ ))
+
+ preds = [self._type_check(self._boolean_fix(prediction.choices[0].message.content), expected_type) if prediction is not None else None for prediction in predictions]
+ finised_index = []
+ for i, pred in enumerate(preds):
+ if pred is not None:
+ responses[messages_list_cur_index[i]] = pred
+ finised_index.append(messages_list_cur_index[i])
+
+ messages_list_cur_index = [i for i in messages_list_cur_index if i not in finised_index]
+
+ retry -= 1
+
+ return responses
+
+# class OpenAIEmbed():
+# def __init__():
+# openai.api_key = os.environ.get("OPENAI_API_KEY", None)
+# assert openai.api_key is not None, "Please set the OPENAI_API_KEY environment variable."
+# assert openai.api_key != '', "Please set the OPENAI_API_KEY environment variable."
+
+# async def create_embedding(self, text, retry=3):
+# for _ in range(retry):
+# try:
+# response = await openai.Embedding.acreate(input=text, model="text-embedding-ada-002")
+# return response
+# except openai.error.RateLimitError:
+# print('Rate limit error, waiting for 1 second...')
+# await asyncio.sleep(1)
+# except openai.error.APIError:
+# print('API error, waiting for 1 second...')
+# await asyncio.sleep(1)
+# except openai.error.Timeout:
+# print('Timeout error, waiting for 1 second...')
+# await asyncio.sleep(1)
+# return None
+
+# async def process_batch(self, batch, retry=3):
+# tasks = [self.create_embedding(text, retry=retry) for text in batch]
+# return await asyncio.gather(*tasks)
+
+# if __name__ == "__main__":
+# chat = OpenAIChat(model_name='llama-2-7b-chat-hf')
+
+# predictions = asyncio.run(chat.async_run(
+# messages_list=[
+# [{"role": "user", "content": "show either 'ab' or '['a']'. Do not do anything else."}],
+# ] * 20,
+# expected_type=List,
+# ))
+
+# print(predictions)
+ # Usage
+ # embed = OpenAIEmbed()
+ # batch = ["string1", "string2", "string3", "string4", "string5", "string6", "string7", "string8", "string9", "string10"] # Your batch of strings
+ # embeddings = asyncio.run(embed.process_batch(batch, retry=3))
+ # for embedding in embeddings:
+ # print(embedding["data"][0]["embedding"])
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/factool/ftool_utils/prompts.yaml b/src/openfactcheck/solvers/factool/ftool_utils/prompts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8e4ed08d6d2aefcf7f1d473c392a502db8d64a67
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/ftool_utils/prompts.yaml
@@ -0,0 +1,67 @@
+claim_extraction:
+ system: |-
+ You are a brilliant assistant.
+ user: |-
+ You are given a piece of text that includes knowledge claims. A claim is a statement that asserts something as true or false, which can be verified by humans. Your task is to accurately identify and extract every claim stated in the provided text. Then, resolve any coreference (pronouns or other referring expressions) in the claim for clarity. Each claim should be concise (less than 15 words) and self-contained.
+ Your response MUST be a list of dictionaries. Each dictionary should contains the key "claim", which correspond to the extracted claim (with all coreferences resolved).
+ You MUST only respond in the format as described below. DO NOT RESPOND WITH ANYTHING ELSE. ADDING ANY OTHER EXTRA NOTES THAT VIOLATE THE RESPONSE FORMAT IS BANNED. START YOUR RESPONSE WITH '['.
+ [response format]:
+ [
+ {{
+ "claim": "Ensure that the claim is fewer than 15 words and conveys a complete idea. Resolve any coreference (pronouns or other referring expressions) in the claim for clarity",
+ }},
+ ...
+ ]
+
+ Here are two examples:
+ [text]: Tomas Berdych defeated Gael Monfis 6-1, 6-4 on Saturday. The sixth-seed reaches Monte Carlo Masters final for the first time . Berdych will face either Rafael Nadal or Novak Djokovic in the final.
+ [response]: [{{"claim": "Tomas Berdych defeated Gael Monfis 6-1, 6-4"}}, {{"claim": "Tomas Berdych defeated Gael Monfis 6-1, 6-4 on Saturday"}}, {{"claim": "Tomas Berdych reaches Monte Carlo Masters final"}}, {{"claim": "Tomas Berdych is the sixth-seed"}}, {{"claim": "Tomas Berdych reaches Monte Carlo Masters final for the first time"}}, {{"claim": "Berdych will face either Rafael Nadal or Novak Djokovic"}}, {{"claim": "Berdych will face either Rafael Nadal or Novak Djokovic in the final"}}]
+
+ [text]: Tinder only displays the last 34 photos - but users can easily see more. Firm also said it had improved its mutual friends feature.
+ [response]: [{{"claim": "Tinder only displays the last photos"}}, {{"claim": "Tinder only displays the last 34 photos"}}, {{"claim": "Tinder users can easily see more photos"}}, {{"claim": "Tinder said it had improved its feature"}}, {{"claim": "Tinder said it had improved its mutual friends feature"}}]
+
+ Now complete the following,ONLY RESPONSE IN A LIST FORMAT, NO OTHER WORDS!!!:
+ [text]: {input}
+ [response]:
+
+query_generation:
+ system: |-
+ You are a query generator that generates effective and concise search engine queries to verify a given claim. You only response in a python list format(NO OTHER WORDS!)
+ user: |-
+ You are a query generator designed to help users verify a given claim using search engines. Your primary task is to generate a Python list of two effective and skeptical search engine queries. These queries should assist users in critically evaluating the factuality of a provided claim using search engines.
+ You should only respond in format as described below (a Python list of queries). PLEASE STRICTLY FOLLOW THE FORMAT. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '['.
+ [response format]: ['query1', 'query2']
+
+ Here are three examples:
+ claim: The CEO of twitter is Bill Gates.
+ response: ["Who is the CEO of twitter?", "CEO Twitter"]
+
+ claim: Michael Phelps is the most decorated Olympian of all time.
+ response: ["Who is the most decorated Olympian of all time?", "Michael Phelps"]
+
+ claim: ChatGPT is created by Google.
+ response: ["Who created ChatGPT?", "ChatGPT"]
+
+ Now complete the following(ONLY RESPONSE IN A LIST FORMAT, DO NOT RETURN OTHER WORDS!!! START YOUR RESPONSE WITH '[' AND END WITH ']'):
+ claim: {input}
+ response:
+
+verification:
+ system: |-
+ You are a brilliant assistant.
+ user: |-
+ You are given a piece of text. Your task is to identify whether there are any factual errors within the text.
+ When you are judging the factuality of the given text, you could reference the provided evidences if needed. The provided evidences may be helpful. Some evidences may contradict to each other. You must be careful when using the evidences to judge the factuality of the given text.
+ The response should be a dictionary with four keys - "reasoning", "factuality", "error", and "correction", which correspond to the reasoning, whether the given text is factual or not (Boolean - True or False), the factual error present in the text, and the corrected text.
+ The following is the given text
+ [text]: {claim}
+ The following is the provided evidences
+ [evidences]: {evidence}
+ You should only respond in format as described below. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '{{'.
+ [response format]:
+ {{
+ "reasoning": "Why is the given text factual or non-factual? Be careful when you said something is non-factual. When you said something is non-factual, you must provide multiple evidences to support your decision.",
+ "error": "None if the text is factual; otherwise, describe the error.",
+ "correction": "The corrected text if there is an error.",
+ "factuality": True if the given text is factual, False otherwise.
+ }}
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/factool/ftool_utils/search_api.py b/src/openfactcheck/solvers/factool/ftool_utils/search_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a90c176312ca29937512b0c33127f28fd9b98a9e
--- /dev/null
+++ b/src/openfactcheck/solvers/factool/ftool_utils/search_api.py
@@ -0,0 +1,115 @@
+import asyncio
+import json
+import os
+import numpy as np
+import jsonlines
+import pdb
+import aiohttp
+
+
+class GoogleSerperAPIWrapper():
+ """Wrapper around the Serper.dev Google Search API.
+ You can create a free API key at https://serper.dev.
+ To use, you should have the environment variable ``SERPER_API_KEY``
+ set with your API key, or pass `serper_api_key` as a named parameter
+ to the constructor.
+ Example:
+ .. code-block:: python
+ from langchain import GoogleSerperAPIWrapper
+ google_serper = GoogleSerperAPIWrapper()
+ """
+
+ def __init__(self, snippet_cnt=10) -> None:
+ self.k = snippet_cnt
+ self.gl = "us"
+ self.hl = "en"
+ self.serper_api_key = os.environ.get("SERPER_API_KEY", None)
+ assert self.serper_api_key is not None, "Please set the SERPER_API_KEY environment variable."
+ assert self.serper_api_key != '', "Please set the SERPER_API_KEY environment variable."
+
+ async def _google_serper_search_results(self, session, search_term: str, gl: str, hl: str) -> dict:
+ headers = {
+ "X-API-KEY": self.serper_api_key or "",
+ "Content-Type": "application/json",
+ }
+ params = {"q": search_term, "gl": gl, "hl": hl}
+ async with session.post(
+ "https://google.serper.dev/search", headers=headers, params=params, raise_for_status=True
+ ) as response:
+ return await response.json()
+
+ def _parse_results(self, results):
+ snippets = []
+
+ if results.get("answerBox"):
+ answer_box = results.get("answerBox", {})
+ if answer_box.get("answer"):
+ element = {"content": answer_box.get("answer"), "source": "None"}
+ return [element]
+ elif answer_box.get("snippet"):
+ element = {"content": answer_box.get("snippet").replace("\n", " "), "source": "None"}
+ return [element]
+ elif answer_box.get("snippetHighlighted"):
+ element = {"content": answer_box.get("snippetHighlighted"), "source": "None"}
+ return [element]
+
+ if results.get("knowledgeGraph"):
+ kg = results.get("knowledgeGraph", {})
+ title = kg.get("title")
+ entity_type = kg.get("type")
+ if entity_type:
+ element = {"content": f"{title}: {entity_type}", "source": "None"}
+ snippets.append(element)
+ description = kg.get("description")
+ if description:
+ element = {"content": description, "source": "None"}
+ snippets.append(element)
+ for attribute, value in kg.get("attributes", {}).items():
+ element = {"content": f"{attribute}: {value}", "source": "None"}
+ snippets.append(element)
+
+ for result in results["organic"][: self.k]:
+ if "snippet" in result:
+ element = {"content": result["snippet"], "source": result["link"]}
+ snippets.append(element)
+ for attribute, value in result.get("attributes", {}).items():
+ element = {"content": f"{attribute}: {value}", "source": result["link"]}
+ snippets.append(element)
+
+ if len(snippets) == 0:
+ element = {"content": "No good Google Search Result was found", "source": "None"}
+ return [element]
+
+ # keep only the first k snippets
+ snippets = snippets[:int(self.k / 2)]
+
+ return snippets
+
+ async def parallel_searches(self, search_queries, gl, hl):
+ async with aiohttp.ClientSession() as session:
+ tasks = [self._google_serper_search_results(session, query, gl, hl) for query in search_queries]
+ search_results = await asyncio.gather(*tasks, return_exceptions=True)
+ return search_results
+
+ def run(self, queries):
+ """Run query through GoogleSearch and parse result."""
+ flattened_queries = []
+
+ for sublist in queries:
+ if sublist is None:
+ sublist = ['None', 'None']
+ for item in sublist:
+ flattened_queries.append(item)
+ results = asyncio.run(self.parallel_searches(flattened_queries, gl=self.gl, hl=self.hl))
+ snippets_list = []
+ for i in range(len(results)):
+ snippets_list.append(self._parse_results(results[i]))
+ snippets_split = [snippets_list[i] + snippets_list[i + 1] for i in range(0, len(snippets_list), 2)]
+ return snippets_split
+
+# class google_search():
+# def __init__(self, snippet_cnt):
+# self.serper = GoogleSerperAPIWrapper(snippet_cnt=snippet_cnt)
+
+# def run(self, queries):
+# return asyncio.run(self.serper.run(queries))
diff --git a/src/openfactcheck/solvers/rarr_solvers/__init__.py b/src/openfactcheck/solvers/rarr_solvers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/rarr_solvers/prompts/__init__.py b/src/openfactcheck/solvers/rarr_solvers/prompts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/rarr_solvers/prompts/hallucination_prompts.py b/src/openfactcheck/solvers/rarr_solvers/prompts/hallucination_prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..49acfdc91ebcb3aa268cbcf1fddf0b44113c630d
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/prompts/hallucination_prompts.py
@@ -0,0 +1,13 @@
+"""Prompts for generating hallucinations."""
+
+EVIDENCE_HALLUCINATION = """Generate a paragraph that answers the question.
+
+Question: What is New York-Style pizza?
+Text: New York-style pizza has slices that are large and wide with a thin crust that is foldable yet crispy. It is traditionally topped with tomato sauce and mozzarella cheese, with any extra toppings placed on top of the cheese.
+
+Question: When did the first McDonald's open?
+Text: The McDonald's brothers opened their first McDonald's restaurant in 1940 in San Bernardino, California. Originally, a carhop drive-in system was used to serve customers. The initial menu items were centered around barbecue and the first name the brothers called their business was "McDonald's Famous Barbecue."
+
+Question: {query}
+Text:
+""".strip()  # Placeholder: {query}. strip() drops the trailing newline, so the prompt ends with "Text:".
diff --git a/src/openfactcheck/solvers/rarr_solvers/prompts/rarr_prompts.py b/src/openfactcheck/solvers/rarr_solvers/prompts/rarr_prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a9b05ef90d032e8c8d9275256d92e03d48eaa6a
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/prompts/rarr_prompts.py
@@ -0,0 +1,323 @@
+"""All prompts used for RARR prompting."""
+
+QGEN_PROMPT = """I will check things you said and ask questions.
+
+You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+To verify it,
+1. I googled: Does your nose switch between nostrils?
+2. I googled: How often does your nostrils switch?
+3. I googled: Why does your nostril switch?
+4. I googled: What is nasal cycle?
+
+You said: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+To verify it,
+1. I googled: Where was Stanford Prison Experiment was conducted?
+
+You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+To verify it,
+1. I googled: What does Havel-Hakimi algorithm do?
+2. I googled: Who are Havel-Hakimi algorithm named after?
+
+You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+To verify it,
+1. I googled: Who sings the song "Time of My Life"?
+2. I googled: Which film is the song "Time of My Life" from?
+3. I googled: Who produced the song "Time of My Life"?
+
+You said: Kelvin Hopins was suspended from the Labor Party due to his membership in the Conservative Party.
+To verify it,
+1. I googled: Why was Kelvin Hopins suspended from Labor Party?
+
+You said: Social work is a profession that is based in the philosophical tradition of humanism. It is an intellectual discipline that has its roots in the 1800s.
+To verify it,
+1. I googled: What philosophical tradition is social work based on?
+2. I googled: What year does social work have its root in?
+
+You said: {claim}
+To verify it,
+""".strip()  # Placeholder: {claim}. strip() drops the trailing newline, so the prompt ends with "To verify it,".
+
+CONTEXTUAL_QGEN_PROMPT = """I will check things you said and ask questions.
+
+Context: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes.
+You said: This is to prevent a buildup of mucus. It's called the nasal cycle.
+To verify what you just said,
+1. I googled: Why does your nostril switch during sleep?
+2. I googled: What is nasal cycle?
+3. I googled: What is the nostril switching during sleep called?
+
+Context: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford's psychology building.
+You said: It is a psychological study to observe the behaviors of conflict and violence that happen between inmates and prisoners in real prisons.
+To verify what you just said,
+1. I googled: What type of experiment was the Stanford Prison Experiment?
+2. I googled: What was the objective of the Stanford Prison Experiment?
+
+Context: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list.
+You said: It is named after Václav Havel and Samih Hakimi.
+To verify what you just said,
+1. I googled: Who are Havel-Hakimi algorithm named after?
+
+Context: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing.
+You said: The song was produced by Michael Lloyd in the same year.
+To verify what you just said,
+1. I googled: Who produced the song "Time of My Life"?
+2. I googled: When was the song "Time of My Life" by Bill Medley produced?
+
+Context: The Late Show with Stephen Colbert is an American late-night talk show hosted by Stephen Colbert, which premiered on September 8, 2015.
+You said: Produced by Spartina Productions and CBS Television Studios, it is the second iteration of CBS' Late Show franchise.
+To verify what you just said,
+1. I googled: Who produces "The Late Show with Stephen Colbert"?
+2. I googled: What are the iterations of CBS' Late Show franchise?
+
+Context: Super Mario Sunshine was released on GameCube in 2002. In the game, Mario uses a tool strapped to his back called FLUDD, which stands for The Flash Liquidizer Ultra Dousing Device.
+You said: It can be used to spray water at objects or enemies. This allows Mario to change his movements, kill enemies, or clean up hazards on the floor.
+To verify what you just said,
+1. I googled: What is the main function of FLUDD in Super Mario Sunshine?
+2. I googled: What can FLUDD in Super Mario Sunshine be used on?
+
+Context: {context}
+You said: {claim}
+To verify what you just said,
+""".strip()  # Placeholders: {context}, {claim}. strip() drops the trailing newline after the final "To verify what you just said,".
+
+AGREEMENT_GATE_PROMPT = """I will check some things you said.
+
+1. You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+2. I checked: How often do your nostrils switch?
+3. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+4. Reasoning: The article said the nose’s switching time is about every 2 hours, and you said the nose's switching time is about every 45 minutes.
+5. Therefore: This disagrees with what you said.
+
+1. You said: The Little House books were written by Laura Ingalls Wilder. The books were published by HarperCollins.
+2. I checked: Who published the Little House books?
+3. I found this article: These are the books that started it all -- the stories that captured the hearts and imaginations of children and young adults worldwide. Written by Laura Ingalls Wilder and published by HarperCollins, these beloved books remain a favorite to this day.
+4. Reasoning: The article said the Little House books were published by HarperCollins and you said the books were published by HarperCollins.
+5. Therefore: This agrees with what you said.
+
+1. You said: Real Chance of Love was an American reality TV show. Season 2 of the show was won by Cali, who chose to be with Chance.
+2. I checked: Who won season 2 of Real Chance of Love?
+3. I found this article: Real Chance of Love 2: Back in the Saddle is the second season of the VH1 reality television dating series Real Chance of Love. Ahmad Givens (Real) and Kamal Givens (Chance), former contestants on I Love New York are the central figures.
+4. Reasoning: The article doesn't answer the question and you said that Cali won season 2 of Real Chance of Love.
+5. Therefore: This is irrelevant to what you said.
+
+1. You said: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+2. I checked: Where was Stanford Prison Experiment conducted?
+3. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+4. Reasoning: The article said the Stanford Prison Experiment was conducted in Jordan Hall and you said the Stanford Prison Experiment was conducted in Jordan Hall.
+5. Therefore: This agrees with what you said.
+
+1. You said: Social work is a profession that is based in the philosophical tradition of humanism. It is an intellectual discipline that has its roots in the 1800s.
+2. I checked: When did social work have its roots?
+3. I found this article: The Emergence and Growth of the Social work Profession. Social work’s roots were planted in the 1880s, when charity organization societies (COS) were created to organize municipal voluntary relief associations and settlement houses were established.
+4. Reasoning: The article said social work has its roots planted in the 1880s and you said social work has its root in the 1800s.
+5. Therefore: This disagrees with what you said.
+
+1. You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+2. I checked: What is the Havel-Hakimi algorithm?
+3. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+4. Reasoning: The article said the Havel-Hakimi algorithm is for constructing a special solution if a simple graph for the given degree sequence exists and you said the Havel-Hakimi algorithm is for converting the adjacency matrix of a graph.
+5. Therefore: This disagrees with what you said.
+
+1. You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+2. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+3. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+4. Reasoning: The article said that a demo was produced by Michael Lloyd and you said "Time of My Life" was produced by Michael Lloyd.
+5. Therefore: This agrees with what you said.
+
+1. You said: Tiger Woods is the only player who has won the most green jackets. He has won four times. The Green Jacket is one of the most coveted prizes in all of golf.
+2. I checked: What is the Green Jacket in golf?
+3. I found this article: The green jacket is a classic, three-button, single-breasted and single-vent, featuring the Augusta National Golf Club logo on the left chest pocket. The logo also appears on the brass buttons.
+4. Reasoning: The article said the Green Jacket is a classic three-button single-breasted and single-vent and you said the Green Jacket is one of the most coveted prizes in all of golf.
+5. Therefore: This is irrelevant to what you said.
+
+1. You said: Kelvin Hopins was suspended from the Labor Party because he had allegedly sexually harassed and behaved inappropriately towards a Labour Party activist, Ava Etemadzadeh.
+2. I checked: Why was Kelvin Hopins suspeneded from the Labor Party?
+3. I found this article: A former Labour MP has left the party before an inquiry into sexual harassment allegations against him was able to be concluded, the party has confirmed. Kelvin Hopkins was accused in 2017 of inappropriate physical contact and was suspended by the Labour party pending an investigation.
+4. Reasoning: The article said Kelvin Hopins was suspended because of inappropriate physical contact and you said that Kelvin Hopins was suspended because he allegedly sexually harassed Ava Etemadzadeh.
+5. Therefore: This agrees with what you said.
+
+1. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Smith.
+2. I checked: Who led the British side in the battle of Lexington and Concord?
+3. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+4. Reasoning: The article said the British side was led by Lieutenant Colonel Francis Smith and you said the British side was led by General Thomas Smith.
+5. Therefore: This disagrees with what you said.
+
+1. You said: {claim}
+2. I checked: {query}
+3. I found this article: {evidence}
+4. Reasoning:
+""".strip()  # Placeholders: {claim}, {query}, {evidence}. Prompt ends at "4. Reasoning:"; examples conclude "agrees", "disagrees" or "is irrelevant".
+
+CONTEXTUAL_AGREEMENT_GATE_PROMPT = """I will check some things you said.
+
+1. Context: Your nose switches back and forth between nostrils. It's called the nasal cycle. This is to prevent a buildup of mucus.
+2. You said: When you sleep, you switch about every 45 minutes.
+3. I checked: How often do your nostrils switch?
+4. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+5. Reasoning: The article said the nose’s switching time is about every 2 hours, and you said the nose's switching time is about every 45 minutes.
+6. Therefore: This disagrees with what you said.
+
+1. Context: The Little House books is a series of American children's novels.
+2. You said: The books were published by HarperCollins.
+3. I checked: Who published the Little House books?
+4. I found this article: These are the books that started it all -- the stories that captured the hearts and imaginations of children and young adults orldwide. Written by Laura Ingalls Wilder and published by HarperCollins, these beloved books remain a favorite to this day.
+5. Reasoning: The article said the Little House books were published by HarperCollins and you said the books were published by HarperCollins.
+6. Therefore: This agrees with what you said.
+
+1. Context: Real Chance of Love was an American reality TV show.
+2. You said: Season 2 of the show was won by Cali, who chose to be with Chance.
+3. I checked: Who won season 2 of Real Chance of Love?
+4. I found this article: Real Chance of Love 2: Back in the Saddle is the second season of the VH1 reality television dating series Real Chance of Love. Ahmad Givens (Real) and Kamal Givens (Chance), former contestants on I Love New York are the central figures.
+5. Reasoning: The article doesn't answer the question and you said that Cali won season 2 of Real Chance of Love.
+6. Therefore: This is irrelevant to what you said.
+
+1. Context: The Stanford Prison Experiment is a psychological study to observe the behaviors of conflict and violence that happen between inmates and prisoners in real prisons.
+2. You said: It was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+3. I checked: Where was Stanford Prison Experiment conducted?
+4. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+5. Reasoning: The article said the Stanford Prison Experiment was conducted in Jordan Hall and you said the Stanford Prison Experiment was conducted in Jordan Hall.
+6. Therefore: This agrees with what you said.
+
+1. Context: Social work is a profession that is based in the philosophical tradition of humanism.
+2. You said: It is an intellectual discipline that has its roots in the 1800s.
+3. I checked: When did social work have its roots?
+4. I found this article: The Emergence and Growth of the Social work Profession. Social work’s roots were planted in the 1880s, when charity organization societies (COS) were created to organize municipal voluntary relief associations and settlement houses were established.
+5. Reasoning: The article said social work has its roots planted in the 1880s and you said social work has its root in the 1800s.
+6. Therefore: This disagrees with what you said.
+
+1. Context: The Havel-Hakimi algorithm is named after Václav Havel and Samih Hakimi.
+2. You said: It is an algorithm for converting the adjacency matrix of a graph into its adjacency list.
+3. I checked: What is the Havel-Hakimi algorithm?
+4. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+5. Reasoning: The article said the Havel-Hakimi algorithm is for constructing a special solution if a simple graph for the given degree sequence exists and you said the Havel-Hakimi algorithm is for converting the adjacency matrix of a graph.
+6. Therefore: This disagrees with what you said.
+
+1. Context: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing.
+2. You said: The song was produced by Michael Lloyd.
+3. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+4. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+5. Reasoning: The article said that a demo was produced by Michael Lloyd and you said "Time of My Life" was produced by Michael Lloyd.
+6. Therefore: This agrees with what you said.
+
+1. Context: Tiger Woods is the only player who has won the most green jackets. He has won four times.
+2. You said: The Green Jacket is one of the most coveted prizes in all of golf.
+3. I checked: What is the Green Jacket in golf?
+4. I found this article: The green jacket is a classic, three-button, single-breasted and single-vent, featuring the Augusta National Golf Club logo on the left chest pocket. The logo also appears on the brass buttons.
+5. Reasoning: The article said the Green Jacket is a classic three-button single-breasted and single-vent and you said the Green Jacket is one of the most coveted prizes in all of golf.
+6. Therefore: This is irrelevant to what you said.
+
+1. Context: Kelvin Hopins was suspended from the Labor Party.
+2. You said: This was because he had allegedly sexually harassed and behaved inappropriately towards a Labour Party activist, Ava Etemadzadeh.
+3. I checked: Why was Kelvin Hopins suspeneded from the Labor Party?
+4. I found this article: A former Labour MP has left the party before an inquiry into sexual harassment allegations against him was able to be concluded, the party has confirmed. Kelvin Hopkins was accused in 2017 of inappropriate physical contact and was suspended by the Labour party pending an investigation.
+5. Reasoning: The article said Kelvin Hopins was suspended because of inappropriate physical contact and you said that Kelvin Hopins was suspended because he allegedly sexually harassed Ava Etemadzadeh.
+6. Therefore: This agrees with what you said.
+
+1. Context: The Battles of Lexington and Concord, fought on April 19, 1775, kicked off the American Revolutionary War (1775-83).
+2. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Smith.
+3. I checked: Who led the British side in the battle of Lexington and Concord?
+4. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+5. Reasoning: The article said the British side was led by Lieutenant Colonel Francis Smith and you said the British side was led by General Thomas Smith.
+6. Therefore: This disagrees with what you said.
+
+1. Context: {context}
+2. You said: {claim}
+3. I checked: {query}
+4. I found this article: {evidence}
+5. Reasoning:
+""".strip()  # Placeholders: {context}, {claim}, {query}, {evidence}. Prompt ends at "5. Reasoning:"; examples conclude "agrees", "disagrees" or "is irrelevant".
+
+EDITOR_PROMPT = """I will fix some things you said.
+
+1. You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+2. I checked: How often do your nostrils switch?
+3. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+4. This suggests 45 minutes switch time in your statement is wrong.
+5. My fix: Your nose switches back and forth between nostrils. When you sleep, you switch about every 2 hours. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+
+1. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+2. I checked: Who led the British side in the battle of Lexington and Concord?
+3. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+4. This suggests General Thomas Hall in your statement is wrong.
+5. My fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+1. You said: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+2. I checked: Where was Stanford Prison Experiment conducted?
+3. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+4. This suggests Encina Hall in your statement is wrong.
+5. My fix: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+1. You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+2. I checked: What is the Havel-Hakimi algorithm?
+3. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+4. This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+5. My fix: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. It is named after Vaclav Havel and Samih Hakimi.
+
+1. You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Phil Ramone.
+2. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+3. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+4. This suggests "Time of My Life" producer name in your statement is wrong.
+5. My fix: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+
+1. You said: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+2. I checked: What is the area of Phoenix Market City in Pune?
+3. I found this article: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+4. This suggests the 1.4 million square feet of built-up space in your statement is wrong.
+5. My fix: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+1. You said: {claim}
+2. I checked: {query}
+3. I found this article: {evidence}
+4. This suggests
+""".strip()
+
+CONTEXTUAL_EDITOR_PROMPT = """I will fix some things you said.
+
+1. Context: Your nose switches back and forth between nostrils. It's called the nasal cycle. This is to prevent a buildup of mucus.
+2. You said: When you sleep, you switch about every 45 minutes.
+3. I checked: How often do your nostrils switch?
+4. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+5. This suggests 45 minutes switch time in your statement is wrong.
+6. My fix: When you sleep, you switch about every 2 hours.
+
+1. Context: The Battles of Lexington and Concord, fought on April 19, 1775, kicked off the American Revolutionary War (1775-83).
+2. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+3. I checked: Who led the British side in the battle of Lexington and Concord?
+4. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+5. This suggests General Thomas Hall in your statement is wrong.
+6. My fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+1. Context: The Stanford Prison Experiment is a psychological study to observe the behaviors of conflict and violence that happen between inmates and prisoners in real prisons.
+2. You said: It was conducted in the basement of Encina Hall, Stanford’s psychology building.
+3. I checked: Where was Stanford Prison Experiment conducted?
+4. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+5. This suggests Encina Hall in your statement is wrong.
+6. My fix: It was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+1. Context: The Havel-Hakimi algorithm is named after Václav Havel and Samih Hakimi.
+2. You said: It is an algorithm for converting the adjacency matrix of a graph into its adjacency list.
+3. I checked: What is the Havel-Hakimi algorithm?
+4. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+5. This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+6. My fix: It is an algorithm for constructing a special solution if a simple graph for the given degree sequence exists, or proving that one cannot find a positive answer.
+
+1. Context: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing.
+2. You said: The song was produced by Phil Ramone.
+3. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+4. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+5. This suggests "Time of My Life" producer name in your statement is wrong.
+6. My fix: The song was produced by Michael Lloyd.
+
+1. Context: Phoenix Market City Pune is located on 21 acres of prime property in Pune.
+2. You said: Phoenix Market City is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+3. I checked: What is the area of Phoenix Market City in Pune?
+4. I found this article: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+5. This suggests the 1.4 million square feet of built-up space in your statement is wrong.
+6. My fix: Phoenix Market City is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+1. Context: {context}
+2. You said: {claim}
+3. I checked: {query}
+4. I found this article: {evidence}
+5. This suggests
+""".strip()
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_agreement_gate.py b/src/openfactcheck/solvers/rarr_solvers/rarr_agreement_gate.py
new file mode 100644
index 0000000000000000000000000000000000000000..02cca6f715692ae74b5b91aef628739affde45ea
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_agreement_gate.py
@@ -0,0 +1,49 @@
+import logging
+
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+import random
+import string
+from .rarr_utils import agreement_gate
+from .prompts import rarr_prompts
+
+
+@register_solver("rarr_agreement_gate", "claims_with_evidences", "claims_with_gates")
+class RARRAgreementGate(StandardTaskSolver):
+    """Solver that runs the RARR agreement gate on each claim's evidences."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
+        self.model = self.global_config.get("model", "text-davinci-003")
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Attach a ``gates`` list to every claim and store the claims in the state.
+
+        Only the first ``max_evidences_per_question`` evidences of a claim are
+        checked. The contextual prompt is used when the claim has a context.
+        """
+        claims = state.get(self.input_name)
+        evidence_limit = self.max_evidences_per_question
+
+        for claim_text, claim_info in claims.items():
+            claim_context = claim_info.get("context", None)
+            if claim_context:
+                gate_prompt = rarr_prompts.CONTEXTUAL_AGREEMENT_GATE_PROMPT
+            else:
+                gate_prompt = rarr_prompts.AGREEMENT_GATE_PROMPT
+            claim_info['gates'] = [
+                agreement_gate.run_agreement_gate(
+                    claim=claim_text,
+                    context=claim_context,
+                    query=item['query'],
+                    evidence=item['text'],
+                    model=self.model,
+                    prompt=gate_prompt,
+                )
+                for item in claim_info.get("evidences", [])[:evidence_limit]
+            ]
+
+        state.set(self.output_name, claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_concat_response_regenerator.py b/src/openfactcheck/solvers/rarr_solvers/rarr_concat_response_regenerator.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c24fe0fc8cb2d6db4d2d74919272dcb9ab1d4e2
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_concat_response_regenerator.py
@@ -0,0 +1,19 @@
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+
+
+@register_solver("rarr_concat_response_generator", "revised_claims", "output")
+class RARRConcatResponseRegenerator(StandardTaskSolver):
+    """Solver that joins the revised claims back into a single response string."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Concatenate the revised claim texts with spaces and store the result."""
+        revised_claims = state.get(self.input_name)
+        # The mapping's values are joined in iteration order, so they must be strings.
+        joined_text = " ".join(revised_claims.values())
+        state.set(self.output_name, joined_text.strip())
+        return True, state
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_editor.py b/src/openfactcheck/solvers/rarr_solvers/rarr_editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..613ea7fba7aa3905f11eabf94cc56b702c2de417
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_editor.py
@@ -0,0 +1,100 @@
+import logging
+
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+import random
+import string
+from .rarr_utils import agreement_gate, editor, evidence_selection
+from .prompts import rarr_prompts
+import Levenshtein
+
+
+@register_solver("rarr_editor", "claims_with_evidences", "revised_claims")
+class RARREditor(StandardTaskSolver):
+    """Solver that revises each claim against its evidences, one evidence at a time.
+
+    For every evidence the agreement gate is run first; the editor is only
+    invoked when the gate is open.
+    """
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.model = self.global_config.get("model", "text-davinci-003")
+        self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
+        self.max_edit_ratio = args.get("max_edit_ratio", 100)
+        self.output_claim_only = args.get("output_claim_only", False)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Revise every claim of the input and store the results in the state.
+
+        Returns:
+            A `(True, state)` tuple. The stored output maps each original claim to
+            its revised text when `output_claim_only` is set, otherwise to a
+            report dict with the revision steps and the selected evidences.
+        """
+        claims = state.get(self.input_name)
+        final_result = {}
+        for claim, contents in claims.items():
+            context = contents.get("context", None)
+            evidences = contents.get("evidences", [])[:self.max_evidences_per_question]
+            if context:
+                gate_prompt = rarr_prompts.CONTEXTUAL_AGREEMENT_GATE_PROMPT
+                editor_prompt = rarr_prompts.CONTEXTUAL_EDITOR_PROMPT
+            else:
+                gate_prompt = rarr_prompts.AGREEMENT_GATE_PROMPT
+                editor_prompt = rarr_prompts.EDITOR_PROMPT
+
+            agreement_gates = []
+            revision_steps = []
+            claim_for_iterative_revision = claim
+            for evidence in evidences:
+                gate = agreement_gate.run_agreement_gate(
+                    claim=claim_for_iterative_revision,
+                    context=context,
+                    query=evidence['query'],
+                    evidence=evidence['text'],
+                    model=self.model,
+                    prompt=gate_prompt,
+                )
+                agreement_gates.append(gate)
+
+                if gate['is_open']:
+                    edited_claim = editor.run_rarr_editor(
+                        claim=claim_for_iterative_revision,
+                        context=context,
+                        query=evidence['query'],
+                        evidence=evidence['text'],
+                        model=self.model,
+                        prompt=editor_prompt,
+                    )['text']
+                    # The distance is measured against the original claim, not the
+                    # previous revision; max(..., 1) guards against an empty claim.
+                    edit_ratio = Levenshtein.distance(claim, edited_claim) / max(len(claim), 1)
+                    if edit_ratio <= self.max_edit_ratio:
+                        claim_for_iterative_revision = edited_claim
+                revision_steps.append({"text": claim_for_iterative_revision})
+
+            result = {
+                "context": context,
+                "text": claim,
+                "questions": contents.get("questions", []),
+                "evidences_for_questions": evidences,
+                "revisions": [
+                    {
+                        "original_text": claim,
+                        # Equals revision_steps[-1]["text"] when there was at least
+                        # one evidence, and the untouched claim when there was none
+                        # (indexing the empty list would raise IndexError).
+                        "revised_text": claim_for_iterative_revision,
+                        "evidences": evidences,
+                        "agreement_gates": agreement_gates,
+                        "revision_steps": revision_steps,
+                    }
+                ],
+            }
+            selected_evidences = evidence_selection.select_evidences(result)
+            result['selected_evidences'] = selected_evidences
+            final_result[claim] = result['revisions'][0]['revised_text'] if self.output_claim_only else result
+        state.set(self.output_name, final_result)
+        return True, state
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_llm_retriever.py b/src/openfactcheck/solvers/rarr_solvers/rarr_llm_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..599aee6b43780b6c40712463719db6c7d5708dbc
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_llm_retriever.py
@@ -0,0 +1,35 @@
+import logging
+
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+import random
+import string
+from .rarr_utils.hallucination import run_evidence_hallucination
+from .prompts.hallucination_prompts import EVIDENCE_HALLUCINATION
+
+
+@register_solver("llm_retriever", "claims_with_questions", "claims_with_evidences")
+class LLMRetriever(StandardTaskSolver):
+    """Solver that asks the language model to generate one evidence per question."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.model = self.global_config.get("model", "text-davinci-003")
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Fill ``evidences`` for every claim from its ``questions`` list."""
+        claims = state.get(self.input_name)
+
+        for claim_info in claims.values():
+            claim_info['evidences'] = [
+                run_evidence_hallucination(
+                    query,
+                    model=self.model,
+                    prompt=EVIDENCE_HALLUCINATION,
+                )
+                for query in claim_info.get("questions", [])
+            ]
+
+        state.set(self.output_name, claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_question_generator.py b/src/openfactcheck/solvers/rarr_solvers/rarr_question_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..67eb07ace36990b2ae6012d2981f2b2de59bc0de
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_question_generator.py
@@ -0,0 +1,52 @@
+import logging
+
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+import random
+import string
+import os
+import time
+from typing import List
+import openai
+from .rarr_utils.question_generation import run_rarr_question_generation
+from .prompts import rarr_prompts
+
+
+@register_solver("rarr_question_generator", "claims_with_context", "claims_with_questions")
+class RARRQuestionGenerator(StandardTaskSolver):
+    """Solver that generates verification questions for every claim."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.model = self.global_config.get("model", "text-davinci-003")
+        self.temperature_qgen = args.get("temperature_qgen", 0.7)
+        self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Store the generated questions under each claim's ``questions`` key.
+
+        The input may be a list of claim strings or a dict mapping each claim
+        to a dict of details (optionally holding a ``context``).
+        """
+        claims = state.get(self.input_name)
+        # Normalise a plain list of claims to DICT[Str, DICT[Str, Any]].
+        # isinstance (rather than an exact type comparison) also accepts list
+        # subclasses.
+        if isinstance(claims, list):
+            claims = {c: dict() for c in claims}
+        for claim, contents in claims.items():
+            context = contents.get("context", None)
+            contents['questions'] = run_rarr_question_generation(
+                claim=claim,
+                context=context,
+                model=self.model,
+                prompt=rarr_prompts.CONTEXTUAL_QGEN_PROMPT
+                if context
+                else rarr_prompts.QGEN_PROMPT,
+                temperature=self.temperature_qgen,
+                num_rounds=self.num_rounds_qgen,
+            )
+
+        state.set(self.output_name, claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_search_engine_retriever.py b/src/openfactcheck/solvers/rarr_solvers/rarr_search_engine_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a83a50dc700dbcae9e436542718ba0aa7eafc8f
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_search_engine_retriever.py
@@ -0,0 +1,39 @@
+import logging
+
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+import random
+import string
+from .rarr_utils import search
+
+
+@register_solver("search_engine_retriever", "claims_with_questions", "claims_with_evidences")
+class SearchEngineRetriever(StandardTaskSolver):
+    """Solver that collects search-result passages as evidences for each claim."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
+        self.max_sentences_per_passage = args.get("max_sentences_per_passage", 4)
+        self.sliding_distance = args.get("sliding_distance", 1)
+        self.max_passages_per_search_result = args.get("max_passages_per_search_result", 1)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Run one search per question and store the merged results per claim."""
+        claims = state.get(self.input_name)
+        search_options = dict(
+            max_search_results_per_query=self.max_search_results_per_query,
+            max_sentences_per_passage=self.max_sentences_per_passage,
+            sliding_distance=self.sliding_distance,
+            max_passages_per_search_result_to_return=self.max_passages_per_search_result,
+        )
+
+        for claim_info in claims.values():
+            found = []
+            for query in claim_info.get("questions", []):
+                found += search.run_search(query=query, **search_options)
+            claim_info['evidences'] = found
+
+        state.set(self.output_name, claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/__init__.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/agreement_gate.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/agreement_gate.py
new file mode 100644
index 0000000000000000000000000000000000000000..90b89c0a1356edfeb30313456c68fb1cbbdd669a
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/agreement_gate.py
@@ -0,0 +1,100 @@
+"""Utils for running the agreement gate."""
+import os
+import time
+from typing import Any, Dict, Optional, Tuple
+
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def parse_api_response(api_response: str) -> Tuple[bool, str, Optional[str]]:
+    """Extract the agreement gate state and the reasoning from the GPT-3 API response.
+
+    The response is expected to hold the reasoning on its first line and the
+    "Therefore:" verdict on its second line.
+
+    Args:
+        api_response: Agreement gate response from GPT-3.
+    Returns:
+        is_open: Whether the agreement gate is open.
+        reason: The reasoning for why the agreement gate is open or closed.
+        decision: The decision of the status of the gate in string form, or None
+            when the response has fewer than two lines.
+    """
+    lines = api_response.strip().split("\n")
+    if len(lines) < 2:
+        # Unparseable answer: report the gate as closed.
+        return False, "Failed to parse.", None
+
+    reason = lines[0]
+    decision = lines[1].split("Therefore:")[-1].strip()
+    # The gate opens only when the verdict line contains "disagrees".
+    is_open = "disagrees" in lines[1]
+    return is_open, reason, decision
+
+
+def run_agreement_gate(
+    claim: str,
+    query: str,
+    evidence: str,
+    model: str,
+    prompt: str,
+    context: Optional[str] = None,
+    num_retries: int = 5,
+) -> Dict[str, Any]:
+    """Checks if a provided evidence contradicts the claim given a query.
+
+    Checks if the answer to a query using the claim contradicts the answer using the
+    evidence. If so, we open the agreement gate, which means that we allow the editor
+    to edit the claim. Otherwise the agreement gate is closed.
+
+    Args:
+        claim: Text to check the validity of.
+        query: Query to guide the validity check.
+        evidence: Evidence to judge the validity of the claim against.
+        model: Name of the OpenAI GPT-3 model to use.
+        prompt: The prompt template to query GPT-3 with.
+        context: Optional context of the claim. When truthy it is passed to the
+            prompt template as `context`.
+        num_retries: Number of times to retry OpenAI call in the event of an API failure.
+    Returns:
+        gate: A dictionary with the status of the gate and reasoning for decision.
+    Raises:
+        RuntimeError: If no API call succeeded within `num_retries` attempts.
+    """
+    if context:
+        gpt3_input = prompt.format(
+            context=context, claim=claim, query=query, evidence=evidence
+        ).strip()
+    else:
+        gpt3_input = prompt.format(claim=claim, query=query, evidence=evidence).strip()
+
+    response = None
+    last_error = None
+    for _ in range(num_retries):
+        try:
+            response = openai.Completion.create(
+                model=model,
+                prompt=gpt3_input,
+                temperature=0.0,
+                max_tokens=256,
+                stop=["\n\n"],
+                logit_bias={"50256": -100},  # Don't allow <|endoftext|> to be generated
+            )
+            break
+        except openai.error.OpenAIError as exception:
+            last_error = exception
+            print(f"{exception}. Retrying...")
+            time.sleep(2)
+
+    if response is None:
+        # Every attempt failed; without this check the code below would hit an
+        # unbound `response` and raise a confusing UnboundLocalError.
+        raise RuntimeError(
+            f"Agreement gate request failed after {num_retries} attempts."
+        ) from last_error
+
+    is_open, reason, decision = parse_api_response(response.choices[0].text)
+    gate = {"is_open": is_open, "reason": reason, "decision": decision}
+    return gate
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/editor.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f045d6fc088c80e3ed7e19a6793be262b3c6989
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/editor.py
@@ -0,0 +1,93 @@
+"""Utils for running the editor."""
+import os
+import time
+from typing import Dict, Optional, Union
+
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def parse_api_response(api_response: str) -> Optional[str]:
+    """Extract the edited claim from the GPT-3 API response.
+
+    Our prompt returns a reason for the edit and the edit in two consecutive lines.
+    Only extract out the edit from the second line.
+
+    Args:
+        api_response: Editor response from GPT-3.
+    Returns:
+        edited_claim: The edited claim, or None when the response has fewer than
+            two lines.
+    """
+    lines = api_response.strip().split("\n")
+    if len(lines) < 2:
+        print("Editor error.")
+        return None
+    edited_claim = lines[1].split("My fix:")[-1].strip()
+    return edited_claim
+
+
+def run_rarr_editor(
+    claim: str,
+    query: str,
+    evidence: str,
+    model: str,
+    prompt: str,
+    context: Optional[str] = None,
+    num_retries: int = 5,
+) -> Dict[str, str]:
+    """Runs a GPT-3 editor on the claim given a query and evidence to support the edit.
+
+    Args:
+        claim: Text to edit.
+        query: Query to guide the editing.
+        evidence: Evidence to base the edit on.
+        model: Name of the OpenAI GPT-3 model to use.
+        prompt: The prompt template to query GPT-3 with.
+        context: Optional context of the claim. When truthy it is passed to the
+            prompt template as `context`.
+        num_retries: Number of times to retry OpenAI call in the event of an API failure.
+    Returns:
+        output: A dictionary whose "text" entry is the edited claim, or the original
+            claim when no edit could be parsed from the response.
+    Raises:
+        RuntimeError: If no API call succeeded within `num_retries` attempts.
+    """
+    if context:
+        gpt3_input = prompt.format(
+            context=context, claim=claim, query=query, evidence=evidence
+        ).strip()
+    else:
+        gpt3_input = prompt.format(claim=claim, query=query, evidence=evidence).strip()
+
+    response = None
+    last_error = None
+    for _ in range(num_retries):
+        try:
+            response = openai.Completion.create(
+                model=model,
+                prompt=gpt3_input,
+                temperature=0.0,
+                max_tokens=512,
+                stop=["\n\n"],
+            )
+            break
+        except openai.error.OpenAIError as exception:
+            last_error = exception
+            print(f"{exception}. Retrying...")
+            time.sleep(2)
+
+    if response is None:
+        # Every attempt failed; without this check the code below would hit an
+        # unbound `response` and raise a confusing UnboundLocalError.
+        raise RuntimeError(
+            f"Editor request failed after {num_retries} attempts."
+        ) from last_error
+
+    edited_claim = parse_api_response(response.choices[0].text)
+    # If there was an error in GPT-3 generation, return the claim.
+    if not edited_claim:
+        edited_claim = claim
+    output = {"text": edited_claim}
+    return output
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/evidence_selection.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/evidence_selection.py
new file mode 100644
index 0000000000000000000000000000000000000000..92138b6a88cb49b3214e5e0b9ff9dee57004c40a
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/evidence_selection.py
@@ -0,0 +1,94 @@
+import itertools
+from typing import Any, Dict, List
+
+import torch
+from sentence_transformers import CrossEncoder
+
+PASSAGE_RANKER = CrossEncoder(
+    "cross-encoder/ms-marco-MiniLM-L-6-v2",
+    max_length=512,
+    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+)
+
+
+def compute_score_matrix(
+    questions: List[str], evidences: List[str]
+) -> List[List[float]]:
+    """Scores the relevance of all evidence against all questions using a CrossEncoder.
+
+    Args:
+        questions: A list of unique questions.
+        evidences: A list of unique evidences.
+    Returns:
+        score_matrix: A 2D list of question X evidence relevance scores.
+    """
+    score_matrix = []
+    for q in questions:
+        evidence_scores = PASSAGE_RANKER.predict([(q, e) for e in evidences]).tolist()
+        score_matrix.append(evidence_scores)
+    return score_matrix
+
+
+def question_coverage_objective_fn(
+    score_matrix: List[List[float]], evidence_indices: List[int]
+) -> float:
+    """Given (query, evidence) scores and a subset of evidence, return the coverage.
+
+    Given all pairwise query and evidence scores, and a subset of the evidence
+    specified by indices, return a value indicating how well this subset of evidence
+    covers (i.e., helps answer) all questions.
+
+    Args:
+        score_matrix: A 2D list of question X evidence relevance scores.
+        evidence_indices: A non-empty subset of the evidence to get the coverage
+            score of.
+    Returns:
+        total: The coverage we would get by using the subset of evidence in
+            `evidence_indices` over all questions.
+    """
+    # Compute sum_{question q} max_{selected evidence e} score(q, e).
+    # This encourages all questions to be explained by at least one evidence.
+    total = 0.0
+    for scores_for_question in score_matrix:
+        total += max(scores_for_question[j] for j in evidence_indices)
+    return total
+
+
+def select_evidences(
+    example: Dict[str, Any], max_selected: int = 5, prefer_fewer: bool = False
+) -> List[Dict[str, Any]]:
+    """Selects the set of evidence that maximizes information coverage over the claim.
+
+    Args:
+        example: The result of running the editing pipeline on one claim.
+        max_selected: Maximum number of evidences to select.
+        prefer_fewer: If True and the maximum objective value can be achieved by
+            fewer evidences than `max_selected`, prefer selecting fewer evidences.
+    Returns:
+        selected_evidences: Selected evidences that serve as the attribution report.
+            Empty when there is no evidence or `max_selected` is not positive.
+    """
+    questions = sorted(set(example["questions"]))
+    evidences = sorted(set(e["text"] for e in example["revisions"][0]["evidences"]))
+    num_evidences = len(evidences)
+    # Nothing can be selected; a zero-sized combination would also make the
+    # objective call max() on an empty sequence.
+    if not num_evidences or max_selected <= 0:
+        return []
+
+    score_matrix = compute_score_matrix(questions, evidences)
+
+    best_combo = tuple()
+    best_objective_value = float("-inf")
+    max_selected = min(max_selected, num_evidences)
+    min_selected = 1 if prefer_fewer else max_selected
+    for num_selected in range(min_selected, max_selected + 1):
+        for combo in itertools.combinations(range(num_evidences), num_selected):
+            objective_value = question_coverage_objective_fn(score_matrix, combo)
+            # Strict ">" keeps the first (and, with prefer_fewer, smallest) best combo.
+            if objective_value > best_objective_value:
+                best_combo = combo
+                best_objective_value = objective_value
+
+    selected_evidences = [{"text": evidences[idx]} for idx in best_combo]
+    return selected_evidences
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/hallucination.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/hallucination.py
new file mode 100644
index 0000000000000000000000000000000000000000..88908ec71188f281df62f55365b679a8fc12347d
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/hallucination.py
@@ -0,0 +1,58 @@
+"""Utils for generating fake evidence given a query."""
+import os
+import time
+from typing import Dict
+
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def run_evidence_hallucination(
+    query: str,
+    model: str,
+    prompt: str,
+    num_retries: int = 5,
+) -> Dict[str, str]:
+    """Generates a fake piece of evidence via LLM given the question.
+
+    Args:
+        query: Question to generate a piece of evidence for.
+        model: Name of the OpenAI GPT-3 model to use.
+        prompt: The prompt template to query GPT-3 with.
+        num_retries: Number of times to retry OpenAI call in the event of an API failure.
+    Returns:
+        output: A dictionary with the potentially inaccurate evidence under "text"
+            and the originating question under "query".
+    Raises:
+        RuntimeError: If no API call succeeded within `num_retries` attempts.
+    """
+    gpt3_input = prompt.format(query=query).strip()
+
+    response = None
+    last_error = None
+    for _ in range(num_retries):
+        try:
+            response = openai.Completion.create(
+                model=model,
+                prompt=gpt3_input,
+                temperature=0.0,
+                max_tokens=256,
+                stop=["\n", "\n\n"],
+            )
+            break
+        except openai.error.OpenAIError as exception:
+            last_error = exception
+            print(f"{exception}. Retrying...")
+            time.sleep(2)
+
+    if response is None:
+        # Every attempt failed; without this check the code below would hit an
+        # unbound `response` and raise a confusing UnboundLocalError.
+        raise RuntimeError(
+            f"Evidence hallucination request failed after {num_retries} attempts."
+        ) from last_error
+
+    hallucinated_evidence = response.choices[0].text.strip()
+    output = {"text": hallucinated_evidence, "query": query}
+    return output
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/question_generation.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/question_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f545bb937e98663380f182d00d55deb8feb07a
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/question_generation.py
@@ -0,0 +1,81 @@
+"""Utils for running question generation."""
+import os
+import time
+from typing import List, Optional
+
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def parse_api_response(api_response: str) -> List[str]:
+    """Extract questions from the GPT-3 API response.
+
+    Our prompt returns questions as a string with the format of an ordered list.
+    This function parses this response into a list of questions.
+
+    Args:
+        api_response: Question generation response from GPT-3.
+    Returns:
+        questions: A list of questions.
+    """
+    search_string = "I googled:"
+    # Keep only the lines holding the marker and take the text that follows it.
+    return [
+        line.split(search_string)[1].strip()
+        for line in api_response.split("\n")
+        if search_string in line
+    ]
+
+
+def run_rarr_question_generation(
+    claim: str,
+    model: str,
+    prompt: str,
+    temperature: float,
+    num_rounds: int,
+    context: Optional[str] = None,
+    num_retries: int = 5,
+) -> List[str]:
+    """Generates questions that interrogate the information in a claim.
+
+    Given a piece of text (claim), we use GPT-3 to generate questions that question the
+    information in the claim. We run num_rounds of sampling to get a diverse set of questions.
+
+    Args:
+        claim: Text to generate questions off of.
+        model: Name of the OpenAI GPT-3 model to use.
+        prompt: The prompt template to query GPT-3 with.
+        temperature: Temperature to use for sampling questions. 0 represents greedy decoding.
+        num_rounds: Number of times to sample questions.
+        context: Optional context of the claim. When truthy it is passed to the
+            prompt template as `context`.
+        num_retries: Number of times to retry OpenAI call in the event of an API failure.
+    Returns:
+        questions: A sorted list of the unique questions. A round whose retries
+            all fail contributes no questions.
+    """
+    if context:
+        gpt3_input = prompt.format(context=context, claim=claim).strip()
+    else:
+        gpt3_input = prompt.format(claim=claim).strip()
+
+    questions = set()
+    for _ in range(num_rounds):
+        for _ in range(num_retries):
+            try:
+                response = openai.Completion.create(
+                    model=model,
+                    prompt=gpt3_input,
+                    temperature=temperature,
+                    max_tokens=256,
+                )
+            except openai.error.OpenAIError as exception:
+                print(f"{exception}. Retrying...")
+                time.sleep(1)
+            else:
+                # Parsing is kept out of the try body; only the API call is guarded.
+                questions.update(parse_api_response(response.choices[0].text.strip()))
+                break
+
+    return sorted(questions)
diff --git a/src/openfactcheck/solvers/rarr_solvers/rarr_utils/search.py b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..401d53c2e2d1b77d0cc05a2b8fa76681526db3e6
--- /dev/null
+++ b/src/openfactcheck/solvers/rarr_solvers/rarr_utils/search.py
@@ -0,0 +1,219 @@
+"""Utils for searching a query and returning top passages from search results."""
+import concurrent.futures
+import itertools
+import os
+import random
+from typing import Any, Dict, List, Tuple
+
+import bs4
+import requests
+import spacy
+import torch
+from sentence_transformers import CrossEncoder
+
+# Module-level resources, created once when this module is imported.
+# Cross-encoder that run_search uses to score (query, passage) pairs; pinned to CPU.
+PASSAGE_RANKER = CrossEncoder(
+ "cross-encoder/ms-marco-MiniLM-L-6-v2",
+ max_length=512,
+ device="cpu",
+)
+# Bing Web Search endpoint and the API key that search_bing sends with each request.
+SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search/"
+SUBSCRIPTION_KEY = os.getenv("AZURE_SEARCH_KEY") # None when the variable is unset.
+# spaCy pipeline that chunk_text uses for sentence splitting (it reads doc.sents).
+TOKENIZER = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer"])
+
+
+def chunk_text(
+    text: str,
+    sentences_per_passage: int,
+    filter_sentence_len: int,
+    sliding_distance: int = None,
+) -> List[str]:
+    """Splits text into passages of consecutive sentences using a sliding window.
+
+    Args:
+        text: Text to chunk; only the first 500,000 characters are tokenized.
+        sentences_per_passage: Window size, in sentences.
+        filter_sentence_len: Sentences longer than this many characters are
+            dropped before windowing.
+        sliding_distance: Number of sentences the window advances per passage.
+            A falsy value, or one larger than the window, falls back to the
+            window size.
+    Returns:
+        passages: Chunked passages in document order; empty when tokenization
+            raises a UnicodeEncodeError.
+    """
+    window = sentences_per_passage
+    step = sliding_distance
+    if not step or step > window:
+        step = window
+    assert window > 0 and step > 0
+
+    passages = []
+    try:
+        # Take 500k chars to not break tokenization.
+        parsed = TOKENIZER(text[:500000])
+        # Long sents are usually metadata.
+        kept = [
+            span.text
+            for span in parsed.sents
+            if len(span.text) <= filter_sentence_len
+        ]
+        for start in range(0, len(kept), step):
+            passages.append(" ".join(kept[start : start + window]))
+    except UnicodeEncodeError:
+        # Sometimes run into Unicode error when tokenizing.
+        print("Unicode error when using Spacy. Skipping text.")
+
+    return passages
+
+
+def is_tag_visible(element: bs4.element) -> bool:
+    """Tells whether a BeautifulSoup text node counts as visible page text.
+
+    Args:
+        element: A BeautifulSoup element to check the visibility of.
+    Returns:
+        False when the element's parent is one of the non-rendered tags listed
+        below or the element is an HTML comment; True otherwise.
+    """
+    hidden_parents = ("style", "script", "head", "title", "meta", "[document]")
+    if element.parent.name in hidden_parents:
+        return False
+    if isinstance(element, bs4.element.Comment):
+        return False
+    return True
+
+
+def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+    """Scrapes a URL for all visible text.
+
+    Args:
+        url: URL of webpage to scrape.
+        timeout: Timeout of the requests call.
+    Returns:
+        web_text: The visible text of the scraped URL with whitespace collapsed,
+            or None when the page could not be fetched or parsed.
+        url: URL input.
+    """
+    # Fetch the page; any network or HTTP error means "nothing scraped".
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException:
+        return None, url
+
+    # Extract the text nodes and keep the visible ones.
+    try:
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+        # Build the list here: a lazy filter() would only call is_tag_visible
+        # later, outside this handler, letting its errors escape.
+        visible_text = [t for t in soup.find_all(string=True) if is_tag_visible(t)]
+    except Exception:
+        # Best effort: a page that cannot be parsed is treated as unscrapable.
+        return None, url
+
+    # Concatenate the text, then collapse every run of whitespace to one space.
+    web_text = " ".join(t.strip() for t in visible_text).strip()
+    web_text = " ".join(web_text.split())
+    return web_text, url
+
+
+def search_bing(query: str, timeout: float = 3) -> List[str]:
+    """Searches the query using Bing.
+
+    Args:
+        query: Search query.
+        timeout: Timeout of the requests call.
+    Returns:
+        search_results: A list of the top URLs relevant to the query; empty when
+            the response carries no web results.
+    Raises:
+        requests.exceptions.RequestException: On network failure, timeout or a
+            non-2xx response (via ``raise_for_status``).
+    """
+    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY}
+    params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
+    response = requests.get(SEARCH_URL, headers=headers, params=params, timeout=timeout)
+    response.raise_for_status()
+
+    payload = response.json()
+    # "webPages" may be absent when Bing has no web results for the query;
+    # treat that as an empty result list instead of raising KeyError.
+    web_pages = payload.get("webPages", {}).get("value", [])
+    return [page["url"] for page in web_pages]
+
+
+def run_search(
+ query: str,
+ cached_search_results: List[str] = None,
+ max_search_results_per_query: int = 3,
+ max_sentences_per_passage: int = 5,
+ sliding_distance: int = 1,
+ max_passages_per_search_result_to_return: int = 1,
+ timeout: float = 3,
+ randomize_num_sentences: bool = False,
+ filter_sentence_len: int = 250,
+ max_passages_per_search_result_to_score: int = 30,
+) -> List[Dict[str, Any]]:
+ """Searches the query on a search engine and returns the most relevant information.
+
+ Args:
+ query: Search query.
+ cached_search_results: URLs to scrape instead of querying Bing. Bing is only
+ queried when this is None.
+ max_search_results_per_query: Maximum number of successfully scraped search
+ results to extract passages from.
+ max_sentences_per_passage: Maximum number of sentences for each passage.
+ timeout: Timeout passed to the Bing request and to every page scrape.
+ randomize_num_sentences: If True, the passage size for each search result is
+ drawn uniformly from 1 to max_sentences_per_passage.
+ filter_sentence_len: Maximum length of a sentence before being filtered.
+ sliding_distance: Sliding distance over the sentences of each search result.
+ Used to extract passages.
+ max_passages_per_search_result_to_score: Maximum number of passages to score for
+ each search result.
+ max_passages_per_search_result_to_return: Maximum number of passages to return
+ for each search result.
+ Returns:
+ retrieved_passages: Top retrieved passages for the search query, sorted by
+ cross-encoder score in descending order. Each entry is a dict with the keys
+ "text", "url", "query", "sents_per_passage", "retrieval_score" and "score"
+ (the softmax of retrieval_score over all returned passages). Empty when
+ nothing could be retrieved.
+ """
+ if cached_search_results is not None:
+ search_results = cached_search_results
+ else:
+ search_results = search_bing(query, timeout=timeout)
+
+ # Scrape search results in parallel
+ with concurrent.futures.ThreadPoolExecutor() as e:
+ scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
+ # Remove URLs if we weren't able to scrape anything or if they are a PDF.
+ # Note that the PDF check runs after scraping, so PDF URLs are still fetched.
+ scraped_results = [r for r in scraped_results if r[0] and ".pdf" not in r[1]]
+
+ # Iterate through the scraped results and extract out the most useful passages.
+ retrieved_passages = []
+ for webtext, url in scraped_results[:max_search_results_per_query]:
+ if randomize_num_sentences:
+ sents_per_passage = random.randint(1, max_sentences_per_passage)
+ else:
+ sents_per_passage = max_sentences_per_passage
+
+ # Chunk the extracted text into passages.
+ passages = chunk_text(
+ text=webtext,
+ sentences_per_passage=sents_per_passage,
+ filter_sentence_len=filter_sentence_len,
+ sliding_distance=sliding_distance,
+ )
+ # Only the leading passages of a page are scored, to bound cross-encoder work.
+ passages = passages[:max_passages_per_search_result_to_score]
+ if not passages:
+ continue
+
+ # Score the passages by relevance to the query using a cross-encoder.
+ scores = PASSAGE_RANKER.predict([(query, p) for p in passages]).tolist()
+ passage_scores = list(zip(passages, scores))
+
+ # Take the top passages_per_search passages for the current search result.
+ passage_scores.sort(key=lambda x: x[1], reverse=True)
+ for passage, score in passage_scores[:max_passages_per_search_result_to_return]:
+ retrieved_passages.append(
+ {
+ "text": passage,
+ "url": url,
+ "query": query,
+ "sents_per_passage": sents_per_passage,
+ "retrieval_score": score, # Cross-encoder score as retr score
+ }
+ )
+
+ if retrieved_passages:
+ # Sort all retrieved passages by the retrieval score.
+ retrieved_passages = sorted(
+ retrieved_passages, key=lambda d: d["retrieval_score"], reverse=True
+ )
+
+ # Normalize the retrieval scores into probabilities
+ scores = [r["retrieval_score"] for r in retrieved_passages]
+ probs = torch.nn.functional.softmax(torch.Tensor(scores), dim=-1).tolist()
+ for prob, passage in zip(probs, retrieved_passages):
+ passage["score"] = prob
+
+ return retrieved_passages
diff --git a/src/openfactcheck/solvers/solvers.py b/src/openfactcheck/solvers/solvers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/tutorial_solvers/__init__.py b/src/openfactcheck/solvers/tutorial_solvers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/tutorial_solvers/all_pass_abstain_detector.py b/src/openfactcheck/solvers/tutorial_solvers/all_pass_abstain_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc3fd7d818dfcb3c6b34354353d1bcbf03938755
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/all_pass_abstain_detector.py
@@ -0,0 +1,14 @@
+import logging
+
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+
+
+@register_solver("all_pass_abstain_detector", "response", "response")
+class AllPassAbstainDetector(StandardTaskSolver):
+    """Solver step that accepts every state without inspecting it."""
+
+    def __init__(self, args):
+        # No solver-specific options; construction is left to the base class.
+        super().__init__(args)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Return ``(True, state)``, leaving ``state`` untouched."""
+        outcome = (True, state)
+        return outcome
diff --git a/src/openfactcheck/solvers/tutorial_solvers/chatgpt_claim_examiner.py b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_claim_examiner.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3962e0bfc8d29b076ced92a302eb64775144874
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_claim_examiner.py
@@ -0,0 +1,54 @@
+# from claim_examiner import ClaimExaminer
+from utils.prompt_base import STANCE_DETECTION_PROMPT
+from utils.api import chatgpt
+import openai
+import time
+import json
+from core.task_solver import StandardTaskSolver
+from core.fact_check_state import FactCheckerState
+from core import register_solver
+
+
+@register_solver("chat_gpt_claim_examiner", "evidences", "claim_info")
+class ChatGPTClaimExaminer(StandardTaskSolver):
+    """Asks ChatGPT for the stance of every retrieved evidence towards its claim.
+
+    Options read from ``args``:
+        path_save_stance: JSON file the annotated claims are written to
+            (default "evidence_stance.json").
+        num_retries: Attempts per ChatGPT request (default 3).
+    """
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.path_save_stance = args.get("path_save_stance", "evidence_stance.json")
+        self.num_retries = args.get("num_retries", 3)
+
+    def _detect_stance(self, claim, evidence_text):
+        """Return ChatGPT's stance reply for one (claim, evidence) pair.
+
+        Raises:
+            RuntimeError: When every attempt failed with an OpenAI error. The
+                original code fell through here and used an unbound or stale
+                ``stance`` variable.
+        """
+        user_input = STANCE_DETECTION_PROMPT.format(claim, evidence_text)
+        last_error = None
+        for _ in range(self.num_retries):
+            try:
+                return chatgpt(user_input)
+            except openai.OpenAIError as exception:
+                last_error = exception
+                print(f"{exception}. Retrying...")
+                time.sleep(1)
+        raise RuntimeError(
+            f"Stance detection failed after {self.num_retries} attempts."
+        ) from last_error
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Annotate each claim's evidences with a stance and save the result.
+
+        Every evidence dict gains a "stance" entry and every claim a "stances"
+        list (empty when the claim has no evidence). The annotated mapping is
+        written to ``path_save_stance`` and stored under the solver's output name.
+        """
+        # Use the configured names, like the other solvers, instead of literals.
+        claim_info = state.get(self.input_name)
+
+        for pair in claim_info.values():
+            claim = pair['claim']
+            stances = []
+            for evid in pair['evidence_list']:
+                stance = self._detect_stance(claim, evid["web_page_snippet_manual"])
+                evid["stance"] = stance
+                stances.append(stance)
+            pair["stances"] = stances
+
+        # Persist the annotated claims as JSON.
+        with open(self.path_save_stance, "w") as outfile:
+            outfile.write(json.dumps(claim_info, indent=4))
+
+        state.set(self.output_name, claim_info)
+        return True, state
diff --git a/src/openfactcheck/solvers/tutorial_solvers/chatgpt_decontextulizer.py b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_decontextulizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..15aebb0f4b6872fccb425f7d40f6870c67f6d459
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_decontextulizer.py
@@ -0,0 +1,32 @@
+from .utils.prompt_base import DECONTEXTILISATION_PROMPT
+from .utils.api import chatgpt
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+
+
+@register_solver("chatgpt_decontextualizer", "sentences", "claims")
+class ChatGPTDecontextualizer(StandardTaskSolver):
+    """Rewrites sentences into stand-alone claims by prompting ChatGPT."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    @staticmethod
+    def _parse_claims(response):
+        """Split a ChatGPT reply into claims, one per non-empty line.
+
+        A leading "Output:" header (or the "Ouput:" spelling that appears in
+        the few-shot prompt) is removed wherever it occurs. The original code
+        dropped the first non-empty line unconditionally, which lost a real
+        claim whenever the model did not echo the header.
+        """
+        claims = []
+        for line in response.split("\n"):
+            line = line.strip()
+            for header in ("Output:", "Ouput:"):
+                if line.startswith(header):
+                    line = line[len(header):].strip()
+                    break
+            if line:
+                claims.append(line)
+        return claims
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Decontextualise every input sentence and store the pooled claims.
+
+        Raises:
+            ValueError: When the input sentences are missing from the state.
+        """
+        sentences = state.get(self.input_name)
+
+        if sentences is None:
+            raise ValueError(f"sentences is required for {self}")
+
+        results = []
+        for sentence in sentences:
+            user_input = DECONTEXTILISATION_PROMPT + sentence
+            response = chatgpt(user_input)
+            print(response)
+
+            decontextualised_claims = self._parse_claims(response)
+            print("{} decontextualised claims.".format(len(decontextualised_claims)))
+            results.extend(decontextualised_claims)
+
+        state.set(self.output_name, results)
+        return True, state
diff --git a/src/openfactcheck/solvers/tutorial_solvers/chatgpt_post_editor.py b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_post_editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f6dfeb93612fb6ca0551f288e299f6a5fbf2bf1
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_post_editor.py
@@ -0,0 +1,83 @@
+from .utils.prompt_base import EDITOR_PROMPT
+from .utils.api import chatgpt
+import openai
+import time
+import json
+from core.task_solver import StandardTaskSolver
+from core.fact_check_state import FactCheckerState
+from core import register_solver
+
+
+@register_solver("chatgpt_post_editor", "claim_info", "claim_info")
+class ChatGPTPostEditor(StandardTaskSolver):
+    """Keeps, deletes or rewrites each claim according to its evidence stances.
+
+    Options read from ``args``:
+        path_save_edited_claims: JSON file the result is written to
+            (default "evidence_stance_edit.json").
+        num_retries: Attempts per ChatGPT request (default 3).
+    """
+
+    # Stance labels recognised at the start of a stance explanation.
+    _STANCE_LABELS = ("partial support", "support", "refute", "other")
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.path_save_edited_claims = args.get("path_save_edited_claims", "evidence_stance_edit.json")
+        self.num_retries = args.get("num_retries", 3)
+
+    @classmethod
+    def _stance_label(cls, stance):
+        """Extract the lower-cased stance label from a stance explanation.
+
+        The original rule took the first word minus its last character, so
+        "Partial support, ..." became "partia" and never matched. Known labels
+        are now matched as prefixes; anything else falls back to the old rule.
+        """
+        text = stance.strip().lower()
+        for label in cls._STANCE_LABELS:
+            if text.startswith(label):
+                return label
+        words = text.split()
+        return words[0][:-1] if words else ""
+
+    def _request_edit(self, claim, evidence):
+        """Ask ChatGPT to revise ``claim`` given ``evidence``.
+
+        Returns the revised claim, or None when every attempt failed.
+        """
+        # NOTE(review): the whole evidence record is formatted into the prompt,
+        # as before; the claim examiner passes only "web_page_snippet_manual" -
+        # confirm which one EDITOR_PROMPT expects.
+        user_input = EDITOR_PROMPT.format(claim, evidence)
+        for _ in range(self.num_retries):
+            try:
+                return chatgpt(user_input)
+            except openai.OpenAIError as exception:
+                print(f"{exception}. Retrying...")
+                time.sleep(1)
+        return None
+
+    def _edit_claim(self, claim, evids, stance_label, stance_explanation):
+        """Revise a claim against each refuting or partially supporting evidence.
+
+        Returns ``(edited_claim, operation)``. When no edit could be made the
+        claim is returned unchanged, so "edited_claims" is always defined.
+        """
+        edited, operation = claim, "unresolved, no-edit"
+        for label, explanation, evid in zip(stance_label, stance_explanation, evids):
+            if label == "other":
+                continue
+            if label == "refute" and "not mention" in explanation:
+                continue
+            if label in ("refute", "partial support"):
+                edits = self._request_edit(claim, evid)
+                if edits is None:
+                    # All attempts failed; keep the claim as it currently stands.
+                    continue
+                # Later edits build on the already revised claim.
+                claim = edits
+                edited = edits
+                operation = "false (refute or partial support), edit"
+            else:
+                print(claim)
+                print(label, evid)
+        return edited, operation
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Set "edited_claims" and "operation" on every claim and save the result."""
+        claim_info = state.get(self.input_name)
+
+        for pair in claim_info.values():
+            claim = pair['claim'].strip()
+            evids = pair['evidence_list']
+            stance_explanation = pair['stances']
+
+            # For not checkworthy claims, there is not a list of evidence (stances).
+            # The records are plain dicts, so assign keys (dict has no .set()).
+            if len(evids) == 0:
+                pair["edited_claims"] = claim
+                pair["operation"] = "no-check, no-edit"
+                continue
+
+            stance_label = [self._stance_label(s) for s in stance_explanation]
+
+            if "support" in stance_label:
+                # One supporting evidence is enough to regard the claim as true.
+                edited, operation = claim, "true, no-edit"
+            elif all(
+                label == "other" or " not mention" in explanation
+                for label, explanation in zip(stance_label, stance_explanation)
+            ):
+                # Every stance is "other", or says the evidence does not mention
+                # the claim: nothing relevant was found, so the claim is deleted.
+                # (The original all([True for ... if ...]) was always True.)
+                edited, operation = '', "no-relevant-evidence, delete"
+            else:
+                edited, operation = self._edit_claim(
+                    claim, evids, stance_label, stance_explanation
+                )
+
+            pair["edited_claims"] = edited
+            pair["operation"] = operation
+
+        # Persist the edited claims as JSON.
+        with open(self.path_save_edited_claims, "w") as outfile:
+            outfile.write(json.dumps(claim_info, indent=4))
+
+        state.set(self.output_name, claim_info)
+        return True, state
diff --git a/src/openfactcheck/solvers/tutorial_solvers/chatgpt_worthiness_filter.py b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_worthiness_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..06b16077e40cb4bc8253eb85c5757bb2764f90af
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/chatgpt_worthiness_filter.py
@@ -0,0 +1,49 @@
+from .utils.prompt_base import CHECK_WORTHINESS_LABEL_ONLY_PROMPT
+from .utils.api import chatgpt
+from typing import List, Tuple
+from argparse import Namespace
+from core.task_solver import StandardTaskSolver
+from core.fact_check_state import FactCheckerState
+from core import register_solver
+
+
+@register_solver("chatgpt_worthiness_filter", "claims", "claims")
+class ChatGPTWorthinessFilter(StandardTaskSolver):
+    """Keeps only the claims that ChatGPT labels as factual and checkworthy."""
+
+    def __init__(self, args: Namespace):
+        super().__init__(args)
+
+    def convert_checkworthy_output_to_labels(self, label: str) -> bool:
+        """Interpret a "<opinion|factual>, <checkworthy|not checkworthy>" reply.
+
+        Args:
+            label: Raw reply text; case, surrounding whitespace and trailing
+                periods are ignored.
+        Returns:
+            True only when the part before the first comma contains "fact" and
+            the part after it does not contain "not". A reply without a comma
+            has an empty second part; an empty reply yields False.
+        """
+        cleaned = label.strip().lower().rstrip(".")
+        # partition() tolerates zero or several commas, where unpacking
+        # split(",") raised ValueError.
+        opinion_vs_factual, _, checkworthy = cleaned.partition(",")
+        is_factual = "fact" in opinion_vs_factual
+        return is_factual and "not" not in checkworthy
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Filter the input claims down to the checkworthy ones."""
+        claims = state.get(self.input_name)
+        valid_claims = []
+        for claim in claims:
+            response = chatgpt(CHECK_WORTHINESS_LABEL_ONLY_PROMPT + claim)
+            if self.convert_checkworthy_output_to_labels(response):
+                valid_claims.append(claim)
+
+        state.set(self.output_name, valid_claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/tutorial_solvers/concat_response_regenerator.py b/src/openfactcheck/solvers/tutorial_solvers/concat_response_regenerator.py
new file mode 100644
index 0000000000000000000000000000000000000000..57cbf5d04855928ce1e595ce751e3cb2176892ab
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/concat_response_regenerator.py
@@ -0,0 +1,18 @@
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+
+
+@register_solver("concat_response_generator", "claim_info", "output")
+class ConcatResponseRegenerator(StandardTaskSolver):
+    """Rebuilds the response by joining the edited claims with single spaces."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Join every claim's "edited_claims" text and store it as the output.
+
+        Claims whose edited text is empty (deleted claims) are skipped, so they
+        no longer leave double spaces in the regenerated document.
+        """
+        claim_info = state.get(self.input_name)
+
+        edited_claims = (v["edited_claims"].strip() for v in claim_info.values())
+        revised_document = " ".join(claim for claim in edited_claims if claim)
+        state.set(self.output_name, revised_document)
+        return True, state
diff --git a/src/openfactcheck/solvers/tutorial_solvers/search_engine_evidence_retriever.py b/src/openfactcheck/solvers/tutorial_solvers/search_engine_evidence_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..f740fe8f2dccc779792d76a7ba3c4703bbfdd0bf
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/search_engine_evidence_retriever.py
@@ -0,0 +1,191 @@
+from .utils.api import chatgpt, search_google, search_bing
+import openai
+import time
+from .utils.prompt_base import QGEN_PROMPT
+from typing import List, Dict, Any
+from .utils.web_util import scrape_url, select_doc_by_keyword_coverage, select_passages_by_semantic_similarity
+import json
+from core import register_solver
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+
+
+@register_solver("search_engine_evidence_retriever", "claims", "evidences")
+class SearchEngineEvidenceRetriever(StandardTaskSolver):
+    """Retrieves web evidence for each claim.
+
+    ChatGPT turns a claim into search queries, the configured search engine
+    returns URLs for them, the pages are scraped, and the most relevant
+    passages are kept as evidence.
+
+    Options read from ``args``:
+        search_engine: "google" (default) or "bing"; other values fall back to Google.
+        url_merge_method: "union" (default) or "intersection" of the per-query URLs.
+    """
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.search_engine = args.get("search_engine", "google")
+        # The fallback must be the function itself: the original fell back to
+        # the string "google", which failed as soon as it was called.
+        self.search_engine_func = {
+            "google": search_google,
+            "bing": search_bing
+        }.get(self.search_engine, search_google)
+
+        self.url_merge_method = args.get("url_merge_method", "union")
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Generate queries for the input claims and store the evidence found."""
+        claims = state.get(self.input_name)
+        queries = self.generate_questions_as_query(claims)
+        evidences = self.search_evidence(claims, queries)
+        state.set(self.output_name, evidences)
+        return True, state
+
+    # ----------------------------------------------------------
+    # Query Generation
+    # ----------------------------------------------------------
+    @staticmethod
+    def _ask_for_queries(claim, num_retries):
+        """Return ChatGPT's raw reply to the query-generation prompt for a claim.
+
+        Raises:
+            RuntimeError: When every attempt failed with an OpenAI error. The
+                original code then used an unbound or stale ``response``.
+        """
+        last_error = None
+        for _ in range(num_retries):
+            try:
+                return chatgpt(QGEN_PROMPT + claim)
+            except openai.OpenAIError as exception:
+                last_error = exception
+                print(f"{exception}. Retrying...")
+                time.sleep(1)
+        raise RuntimeError(
+            f"Query generation failed after {num_retries} attempts."
+        ) from last_error
+
+    @staticmethod
+    def _parse_queries(response):
+        """Split a raw reply into queries, one per non-empty line.
+
+        A bare "Output:" line is skipped and an "Output:" prefix is removed.
+        An empty reply (a claim that is not check-worthy) yields [].
+        """
+        queries = []
+        for q in response.split("\n"):
+            q = q.strip()
+            if q == "" or q == "Output:":
+                continue
+            # Strip the header only when it really is "Output:", so a query
+            # that merely starts with the word "Output" is left intact.
+            if q.startswith("Output:"):
+                q = q[len("Output:"):].strip()
+            queries.append(q)
+        return queries
+
+    def generate_questions_as_query(self, claims,
+                                    num_retries: int = 3) -> List[list]:
+        """Generate a list of search queries for each claim.
+
+        Args:
+            claims: The claims to generate queries for.
+            num_retries: the number of retries when error occurs during openai api calling
+        Returns:
+            One list of queries per claim, in the order of ``claims``.
+        """
+        return [
+            self._parse_queries(self._ask_for_queries(claim, num_retries))
+            for claim in claims
+        ]
+
+    # ----------------------------------------------------------
+    # Evidence Retrieval
+    # ----------------------------------------------------------
+    def collect_claim_url_list(self, queries: List[str]):
+        """Collect the URLs for a claim given its query list.
+
+        Args:
+            queries: a list of queries or questions for a claim
+        Returns:
+            A pair ``(urls, url_query_dict)``. ``urls`` is the union or the
+            intersection of the per-query results (per ``url_merge_method``), in
+            first-seen order; intersection urls tend to be less relevant.
+            ``url_query_dict`` maps every url to the queries that returned it.
+            Both are empty when ``queries`` is empty.
+        Raises:
+            ValueError: When ``url_merge_method`` is neither 'union' nor 'intersection'.
+        """
+        if len(queries) == 0:
+            print("Invalid queries: []")
+            # Always return a pair so callers can unpack the result.
+            return [], {}
+
+        if self.url_merge_method not in ("union", "intersection"):
+            raise ValueError("Invalid url operation, please choose from 'union' and 'intersection'.")
+
+        # One result list per query, in query order.
+        urls_list: List[list] = [self.search_engine_func(query) for query in queries]
+
+        # url as key, and list of queries corresponding to this url as value.
+        url_query_dict: Dict[str, list] = {}
+        for query, urls in zip(queries, urls_list):
+            for url in urls:
+                url_query_dict.setdefault(url, []).append(query)
+
+        if self.url_merge_method == "union":
+            return list(url_query_dict), url_query_dict
+
+        common = set(urls_list[0]).intersection(*urls_list[1:])
+        url_intersection = [url for url in url_query_dict if url in common]
+        return url_intersection, url_query_dict
+
+    def search_evidence(self,
+                        decontextualised_claims: List[str],
+                        automatic_query_list: List[list],
+                        path_save_evidence: str = "evidence.json",
+                        save_web_text: bool = False) -> Dict[str, Dict[str, Any]]:
+        """Retrieve evidence passages for every claim and save them as JSON.
+
+        Args:
+            decontextualised_claims: The claims to find evidence for.
+            automatic_query_list: The queries of each claim, aligned with the claims.
+            path_save_evidence: JSON file the result is written to.
+            save_web_text: Whether each evidence also stores the full page texts.
+        Returns:
+            A dict keyed by claim text; each value has the keys "claim",
+            "automatic_queries" and "evidence_list". The evidence list is empty
+            for claims without queries or without any accessible web page.
+        """
+        assert (len(decontextualised_claims) == len(automatic_query_list))
+
+        claim_info: Dict[str, Dict[str, Any]] = {}
+        for claim, queries in zip(decontextualised_claims, automatic_query_list):
+            if len(queries) == 0:
+                claim_info[claim] = {"claim": claim, "automatic_queries": queries, "evidence_list": []}
+                print("Claim: {} This is an opinion, not check-worthy.".format(claim))
+                continue
+
+            # for each checkworthy claim, first gather urls of related web pages
+            urls, url_query_dict = self.collect_claim_url_list(queries)
+
+            docs: List[dict] = []
+            for url in urls:
+                web_text, _ = scrape_url(url)
+                if web_text is not None:
+                    docs.append({"query": url_query_dict[url], "url": url, "web_text": web_text})
+            print("Claim: {}\nWe retrieved {} urls, {} web pages are accessible.".format(claim, len(urls), len(docs)))
+
+            if len(docs) == 0:
+                # no related web articles collected for this claim, continue to next claim
+                claim_info[claim] = {"claim": claim, "automatic_queries": queries, "evidence_list": []}
+                continue
+
+            # select the most relevant docs against the claim by keyword coverage;
+            # the returned values are indexes into docs
+            docs_text = [d['web_text'] for d in docs]
+            selected_docs_index = select_doc_by_keyword_coverage(claim, docs_text)
+            print(selected_docs_index)
+
+            selected_docs = [docs_text[doc_index] for doc_index in selected_docs_index]
+            # score the passages and select the top ones; returns the passage texts
+            # and, per passage, a list of doc ids that index into selected_docs
+            topk_passages, passage_doc_id = select_passages_by_semantic_similarity(claim, selected_docs)
+
+            # recover doc_id to original index in docs which records detailed information of a doc
+            passage_doc_index = [
+                [selected_docs_index[doc_id] for doc_id in ids]
+                for ids in passage_doc_id
+            ]
+
+            # evidence list
+            evidence_list: List[dict] = []
+            for pid, p in enumerate(topk_passages):
+                doc_ids = passage_doc_index[pid]
+                evidence_list.append({
+                    "evidence_id": pid,
+                    "web_page_snippet_manual": p,
+                    "query": [docs[doc_id]["query"] for doc_id in doc_ids],
+                    "url": [docs[doc_id]["url"] for doc_id in doc_ids],
+                    "web_text": [docs[doc_id]["web_text"] for doc_id in doc_ids] if save_web_text else [],
+                })
+            claim_info[claim] = {"claim": claim, "automatic_queries": queries, "evidence_list": evidence_list}
+
+        # Persist the collected evidence as JSON.
+        with open(path_save_evidence, "w") as outfile:
+            outfile.write(json.dumps(claim_info, indent=4))
+        return claim_info
diff --git a/src/openfactcheck/solvers/tutorial_solvers/spacy_response_decomposer.py b/src/openfactcheck/solvers/tutorial_solvers/spacy_response_decomposer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6559b695b7c8b66803d1d4074a548b14ce7a4e2
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/spacy_response_decomposer.py
@@ -0,0 +1,22 @@
+import logging
+
+import spacy
+from core.fact_check_state import FactCheckerState
+from core.task_solver import StandardTaskSolver
+from core import register_solver
+
+
+@register_solver("spacy_response_decomposer", 'response', 'sentences')
+class SpacyResponseDecomposer(StandardTaskSolver):
+    """Splits the input text into sentences with a spaCy pipeline.
+
+    The pipeline name is read from the "spacy_model" option and defaults to
+    "en_core_web_sm".
+    """
+
+    def __init__(self, args):
+        super().__init__(args)
+        model_name = args.get("spacy_model", "en_core_web_sm")
+        self.spacy_processor = spacy.load(model_name)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Store the whitespace-stripped sentences of the input under the output name."""
+        parsed = self.spacy_processor(state.get(self.input_name))
+        sentences = []
+        for sent in parsed.sents:
+            sentences.append(str(sent).strip())
+        logging.info(f"The document is split into {len(sentences)} sentences.")
+        state.set(self.output_name, sentences)
+        return True, state
diff --git a/src/openfactcheck/solvers/tutorial_solvers/utils/__init__.py b/src/openfactcheck/solvers/tutorial_solvers/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/tutorial_solvers/utils/api.py b/src/openfactcheck/solvers/tutorial_solvers/utils/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..51ff629a7680a9028cde7376db780f6c3247029b
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/utils/api.py
@@ -0,0 +1,132 @@
+import os
+import re
+import openai
+from openai import OpenAI
+import requests
+from typing import Any, Dict, List, Tuple
+
+# ----------------------------------------------------------
+# OpenAI ChatGPT and text-davinci
+# ----------------------------------------------------------
+# Shared OpenAI client; stays None until init_client() succeeds.
+client = None
+
+
+def init_client():
+    """Create the shared OpenAI client if it does not exist yet.
+
+    Nothing is created while no API key is available, i.e. while
+    ``openai.api_key`` is None and ``OPENAI_API_KEY`` is not in the
+    environment; a message is printed instead so the call can be repeated.
+    """
+    global client
+    if client is not None:
+        return
+
+    key_available = openai.api_key is not None or 'OPENAI_API_KEY' in os.environ
+    if not key_available:
+        print("openai_key not presented, delay to initialize.")
+        return
+
+    client = OpenAI()
+
+def chatgpt(user_input, model="gpt-3.5-turbo"):
+    """Send one user message to a chat model and return the reply text.
+
+    Args:
+        user_input: Content of the user message.
+        model: Chat model to query; defaults to the previously hard-coded one.
+    Returns:
+        The concatenated message content of all returned choices.
+    Raises:
+        RuntimeError: When the shared client cannot be initialised because no
+            API key is available.
+    """
+    # Initialise lazily: nothing in this module calls init_client(), and a
+    # None client previously failed with an opaque AttributeError.
+    if client is None:
+        init_client()
+    if client is None:
+        raise RuntimeError(
+            "OpenAI client is not initialised: set OPENAI_API_KEY or openai.api_key first."
+        )
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": "You are a NLP expert that is good at fact checking"},
+            {"role": "user", "content": user_input},
+        ]
+    )
+
+    # A choice's content can be None; treat it as empty text instead of
+    # failing on str + None.
+    return "".join(choice.message.content or "" for choice in response.choices)
+
+def davinci(prompt, model_engine="text-davinci-003"):
+    """Complete ``prompt`` with a text-completion model and return the text.
+
+    Args:
+        prompt: Prompt text to complete.
+        model_engine: Completion model to query; defaults to the previously
+            hard-coded one.
+    Returns:
+        The text of the first returned choice.
+    Raises:
+        RuntimeError: When the shared client cannot be initialised because no
+            API key is available.
+    """
+    # NOTE(review): confirm that the default model is still served by the API.
+    if client is None:
+        init_client()
+    if client is None:
+        raise RuntimeError(
+            "OpenAI client is not initialised: set OPENAI_API_KEY or openai.api_key first."
+        )
+
+    # The 1.x client takes the model name as `model`; the legacy `engine`
+    # keyword is not accepted by client.completions.create.
+    completion = client.completions.create(
+        model=model_engine,
+        prompt=prompt,
+        max_tokens=1024,
+        n=1,
+        stop=None,
+        temperature=0.5,
+    )
+
+    return completion.choices[0].text
+
+# ----------------------------------------------------------
+# Bing Search
+# ----------------------------------------------------------
+BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search/"
+# Bing API key, read from the AZURE_SEARCH_KEY environment variable (the same
+# variable rarr_utils/search.py uses) so the key need not be edited into the
+# source; it is an empty string when the variable is unset.
+SUBSCRIPTION_KEY = os.getenv("AZURE_SEARCH_KEY", "")
+
+def search_bing(query: str, timeout: float = 3) -> List[str]:
+    """Searches the query using Bing.
+
+    Args:
+        query: Search query.
+        timeout: Timeout of the requests call.
+    Returns:
+        search_results: A list of the top URLs relevant to the query; empty when
+            the response carries no web results.
+    Raises:
+        requests.exceptions.RequestException: On network failure, timeout or a
+            non-2xx response (via ``raise_for_status``).
+    """
+    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY}
+    params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
+    response = requests.get(BING_SEARCH_URL, headers=headers, params=params, timeout=timeout)
+    response.raise_for_status()
+
+    payload = response.json()
+    # "webPages" may be absent when Bing has no web results for the query;
+    # treat that as an empty result list instead of raising KeyError.
+    web_pages = payload.get("webPages", {}).get("value", [])
+    return [page["url"] for page in web_pages]
+
+# Test Bing search
+# search_results = search_bing("What are the different awards that Preslav Nakov has received")
+# print(search_results)
+
+
+# ----------------------------------------------------------
+# Google Search
+# ----------------------------------------------------------
+def search_google(query: str, num_web_pages: int = 10, save_url: str = '', timeout: float = 10) -> List[str]:
+    """Searches the query using Google and scrapes the URLs from the result pages.
+
+    Args:
+        query: Search query.
+        num_web_pages: the number of web pages to request (one result page is
+            fetched per 10).
+        save_url: path to save returned urls, such as 'urls.txt'; nothing is
+            saved when empty.
+        timeout: Timeout in seconds of each requests call.
+    Returns:
+        search_results: The distinct http(s) URLs found in the result pages, in
+            first-seen order. This includes every href on the page, not only
+            the result links.
+    """
+    # Google returns different web pages according to the agent device; use a
+    # desktop user-agent.
+    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
+    headers = {'User-Agent': USER_AGENT}
+
+    # hl sets the interface language and lr the language of the results. Use
+    # English, otherwise many translated pages are returned that cannot be
+    # opened correctly.
+    lang = "en"
+
+    urls = []
+    for page in range(0, num_web_pages, 10):
+        # `start` is the offset of the result page: the second page is start=10.
+        # Passing params lets requests URL-encode the query; only replacing
+        # spaces with "+" broke on characters such as "&" or "#".
+        params = {"q": query, "lr": "lang_{}".format(lang), "hl": lang, "start": page}
+        r = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=timeout)
+        # collect all urls by regular expression
+        urls += re.findall(r'href="(https?://.*?)"', r.text)
+
+    # Remove repeated urls while keeping the order in which they appeared.
+    urls = list(dict.fromkeys(urls))
+
+    # save all url into a txt file
+    if save_url:
+        with open(save_url, 'w') as file:
+            file.writelines(url + '\n' for url in urls)
+    return urls
+
+# Test google search
+# query = "Google Company Introduction"
+# urls = search_google(query)
+# print(len(urls))
diff --git a/src/openfactcheck/solvers/tutorial_solvers/utils/prompt_base.py b/src/openfactcheck/solvers/tutorial_solvers/utils/prompt_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c68644ceacce97af2a8e8b0267da58b65246f237
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/utils/prompt_base.py
@@ -0,0 +1,133 @@
+"""All prompts used for DFC prompting."""
+# Few-shot prompt for splitting a document into stand-alone sentences; it ends with an open "Input: " slot.
+DECONTEXTILISATION_PROMPT = """Decompose and decontextualise a document into independently meaningful sentences. This process will make each sentence stand alone that can be verified independently.
+
+Input: Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.
+Output:
+Mary is a five-year old girl.
+Mary likes playing piano.
+Mary doesn't like cookies.
+
+Input: Google began as an online search firm, but it now offers more than 50 Internet services and products, from e-mail and online document creation to software for mobile phones and tablet computers. In addition, its 2012 acquisition of Motorola Mobility put it in the position to sell hardware in the form of mobile phones.
+Output:
+Google began as an online search firm.
+Google now offers more than 50 Internet services and products.
+Google offers from e-mail and online document creation to software for mobile phones and tablet computers.
+Google 2012 acquisition of Motorola Mobility put it in the position to sell hardware in the form of mobile phones.
+
+Input: """
+
+# Few-shot prompt asking for two labels only: opinion/factual and checkworthy/not checkworthy.
+CHECK_WORTHINESS_LABEL_ONLY_PROMPT = """Identify whether this claim is an opinion or factual, and whether it is checkworthy or not in the context of fact-checking. Just return two labels without explanation.
+I think Apple is a good company.
+opinion, not checkworthy
+Preslav is a professor in MBZUAI.
+factual, checkworthy
+Friends is a great TV series.
+opinion, not checkworthy
+The Stanford Prison Experiment was conducted in the basement of Encina Hall.
+factual, checkworthy
+"""
+
+# Few-shot prompt for entity extraction; ends with an open "Input: " slot (presumably the caller appends the claim).
+ENTITY_EXTRACTION_PROMPT = """Extract all entities of a claim.
+Input: Google now offers more than 50 Internet services and products.
+Output: Google, Internet services, product
+Input: Donald John Trump is an American politician, media personality, and businessman.
+Output: Donald John Trump, American politician, media personality, businessman
+Input: """
+
+# Few-shot prompt for search-query generation, one query per output line; ends with an open "Input: " slot.
+QGEN_PROMPT = """Give a list of queries using for searching related information for a claim.
+Input: Google now offers more than 50 Internet services and products.
+Output: What does Google offers now?
+How many service and product does Google offer?
+Google, more than 50 Internet services, products
+Input: Donald John Trump is an American politician, media personality, and businessman.
+Output: Who is Donald John Trump?
+Give information of Donald John Trump.
+Donald John Trump, American politician
+Donald John Trump, media personality
+Donald John Trump, businessman
+Input: """
+
+# Few-shot stance prompt; the two positional "{}" slots at the end take the claim, then the evidence (presumably via str.format).
+STANCE_DETECTION_PROMPT = """Determine whether the evidence support the claim or not. Choose label from [support, partial support, refute, other] and explain why.
+Support means we can entail the claim by the evidence.
+Partial support means: part of the information presented in the claim appear in the evidence.
+Refute means that the evidence mention the same event as the claim, but a clear opposite fact. It should be highlighed that under refute, the evidence mentions the fact in the claim, they are closely relevant, but opposite meaning or stance.
+Other means the evidence does not mention anything about the fact described in the claim, such that it neither supports nor refutes the claim.
+
+Claim: Elon Musk is the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: support, statement 'he is also the founder, CEO and chief engineer of SpaceX' in evidence above supports the claim.
+
+Claim: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the founder, CEO and chief engineer of SpaceX.
+Stance: partial support.
+
+Claim: Steve Jobs is the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: refute.
+
+Claim: Elon Musk is a professor in The Stanford University.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: other, according to the evidence, I cannot judge whether the claim is true or not, not enough information, the evidence neither supports nor refutes.
+
+Claim: On January 6, 2021, a mob of supporters of former President Donald Trump stormed the U.S. Capitol in an attempt to overturn the 2020 presidential election.
+Evidence: On January 6, 2021, following the defeat of U.S. President Donald Trump in the 2020 presidential election, a mob of his supporters attacked the United States Capitol Building in Washington, D.C. The mob sought to keep Trump in power by preventing a joint session of Congress from counting the electoral college votes to formalize the victory of President-elect Joe Biden.
+Stance: support.
+
+Claim: The 2021 Capitol Hill riots resulted in the deaths of five people, including a Capitol police officer.
+Evidence: Five people died either shortly before, during, or following the riot: one was shot by Capitol Police, another died of a drug overdose, and three died of natural causes.
+Stance: partial support, the evidence supports that fact that five deaths, but not sure whether they include a Capitol police officer or not.
+
+Claim: More than 300 people have been charged with crimes related to the riots.
+Evidence: As of November 10, 2022, over 940 people had been charged in the Capitol breach.
+Stance: refute, evidence and claim are describing the same thing, the number of people who was charged is over 940, while more than 300 in the claim, so the evidence refutes the claim.
+
+Claim: More than 300 people have been charged with crimes related to the riots.
+Evidence: The laptop computer taken from Pelosi's office was taken by 22-year-old Capitol rioter Riley Williams. Williams was arrested and indicted on eight counts, including theft of government property, obstructing an official proceeding, and assaulting or resisting police.
+Stance: other, the evidence demonstrates something relevent to the fact in the claim, but it does not support or refute any information of it.
+
+Claim: {}
+Evidence: {}
+Stance: """
+
+# Few-shot prompt for revising a claim against evidence; named {claim}/{evidence} slots, and .strip() makes it end with "This suggests".
+EDITOR_PROMPT = """Fix the claim according to the evidence.
+
+Claim: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+Evidence: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+This suggests 45 minutes switch time in your statement is wrong.
+Fix: Your nose switches back and forth between nostrils. When you sleep, you switch about every 2 hours. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+
+Claim: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+Evidence: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+This suggests General Thomas Hall in your statement is wrong.
+Fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+Claim: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+Evidence: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+This suggests Encina Hall in your statement is wrong.
+Fix: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+Claim: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+Evidence: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+Fix: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. It is named after Vaclav Havel and Samih Hakimi.
+
+Claim: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Phil Ramone.
+Evidence: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+This suggests "Time of My Life" producer name in your statement is wrong.
+Fix: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+
+Claim: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+Evidence: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+This suggests the 1.4 million square feet of built-up space in your statement is wrong.
+Fix: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+Claim: {claim}
+Evidence: {evidence}
+This suggests
+""".strip()
diff --git a/src/openfactcheck/solvers/tutorial_solvers/utils/web_util.py b/src/openfactcheck/solvers/tutorial_solvers/utils/web_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dea522d0d4074ae5f44e3c89654d4162cb7f14d
--- /dev/null
+++ b/src/openfactcheck/solvers/tutorial_solvers/utils/web_util.py
@@ -0,0 +1,179 @@
+import bs4
+import spacy
+import requests
+from collections import Counter
+from string import punctuation
+from typing import List, Dict, Tuple, Any
+
+
+def is_tag_visible(element: bs4.element) -> bool:
+    """Tell whether a text node belongs to the rendered part of a page.
+
+    Args:
+        element: Node taken from a BeautifulSoup tree.
+    Returns:
+        False when the node's parent tag is one of the non-rendered
+        containers listed below, or when the node is an HTML comment;
+        True otherwise.
+    """
+    hidden_parents = ("style", "script", "head", "title", "meta", "[document]")
+    # Text directly inside these tags is treated as invisible.
+    # The parent tag is inspected first, before the comment test.
+    if element.parent.name in hidden_parents:
+        return False
+    if isinstance(element, bs4.element.Comment):
+        return False
+    return True
+
+
+def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+    """Download a page and return its visible text with whitespace collapsed.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Timeout handed to ``requests.get``.
+
+    Returns:
+        A ``(web_text, url)`` pair; ``web_text`` is ``None`` when the request
+        fails (including HTTP error statuses) or the HTML cannot be parsed.
+    """
+    failure = (None, url)
+
+    # Step 1: fetch the page; any requests-level problem aborts the scrape.
+    try:
+        page = requests.get(url, timeout=timeout)
+        page.raise_for_status()
+    except requests.exceptions.RequestException:
+        print("URL Require Error.")
+        return failure
+
+    # Step 2: collect all text nodes; the visibility filter stays lazy.
+    try:
+        nodes = bs4.BeautifulSoup(page.text, "html.parser").findAll(text=True)
+        rendered = filter(is_tag_visible, nodes)
+    except Exception:
+        print("BS4 Error.")
+        return failure
+
+    # Step 3: join the stripped fragments, then squeeze runs of whitespace.
+    joined = " ".join(fragment.strip() for fragment in rendered).strip()
+    web_text = " ".join(joined.split())
+    return web_text, url
+
+
+def get_hotwords(text: str, top_k: int = 10) -> List[str]:
+    """Return the ``top_k`` most frequent content words of the lower-cased text.
+
+    A token is counted only if it is not a spaCy stop word, is not a substring
+    of ``string.punctuation``, and is tagged PROPN, ADJ or NOUN.
+    """
+    nlp = spacy.load("en_core_web_sm")
+    stop_words = nlp.Defaults.stop_words
+    wanted_pos = ('PROPN', 'ADJ', 'NOUN')
+
+    # Count the surviving tokens in a single pass over the parsed text.
+    frequencies = Counter(
+        tok.text
+        for tok in nlp(text.lower())
+        if tok.text not in stop_words and tok.text not in punctuation and tok.pos_ in wanted_pos
+    )
+    return [word for word, _ in frequencies.most_common(top_k)]
+
+
+def select_doc_by_keyword_coverage(claim: str, docs: List[str],
+                                   top_k_keywords: int = 10, top_k_docs: int = 5) -> List[int]:
+    """Select the documents that mention the most keywords of the claim.
+
+    Every document sharing the highest keyword count is kept; when fewer than
+    ``top_k_docs`` documents share it, the ``top_k_docs`` best-covering documents
+    are returned instead.
+
+    Returns:
+        Indices into ``docs`` of the selected documents; ``[]`` for empty ``docs``.
+    """
+    if not docs:
+        # Nothing to rank: max() below would raise ValueError on an empty list.
+        print("There are {} web pages selected.".format(0))
+        return []
+
+    keywords = get_hotwords(claim, top_k_keywords)
+    # Keywords come back lower-cased, so each doc is lower-cased once before matching.
+    counts = [sum(word in text for word in keywords) for text in map(str.lower, docs)]
+    max_count = max(counts)
+    selected_docs_index = [i for i, count in enumerate(counts) if count == max_count]
+    if len(selected_docs_index) < top_k_docs:
+        selected_docs_index = sorted(range(len(counts)), key=lambda k: counts[k], reverse=True)[:top_k_docs]
+    print("There are {} web pages selected.".format(len(selected_docs_index)))
+    return selected_docs_index
+
+
+def chunk_text(text: str, sentences_per_passage: int,
+               filter_sentence_len: int, sliding_distance: int = None) -> List[str]:
+    """Split text into passages of consecutive sentences with a sliding window.
+
+    Args:
+        text: Raw text; only its first 500000 characters are tokenized.
+        sentences_per_passage: Window size, in sentences.
+        filter_sentence_len: Sentences longer than this many characters are dropped.
+        sliding_distance: Step between window starts; the window size is used
+            instead when it is falsy or larger than the window.
+    Returns:
+        The passages, each a space-joined run of sentences. Empty if a
+        ``UnicodeEncodeError`` is raised while tokenizing.
+    """
+    tokenizer = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer"])
+    step = sliding_distance
+    if not step or step > sentences_per_passage:
+        step = sentences_per_passage
+    assert sentences_per_passage > 0 and step > 0
+
+    passages = []
+    try:
+        # Overlong "sentences" are usually page metadata, so they are discarded.
+        kept = [
+            sent.text
+            for sent in tokenizer(text[:500000]).sents
+            if len(sent.text) <= filter_sentence_len
+        ]
+        passages.extend(" ".join(kept[i:i + sentences_per_passage]) for i in range(0, len(kept), step))
+    except UnicodeEncodeError:
+        print("Unicode error when using Spacy. Skipping text.")
+    return passages
+
+
+def select_passages_by_semantic_similarity(claim: str, selected_docs: List[str],
+                                           max_sentences_per_passage: int = 3, filter_sentence_len: int = 250,
+                                           sliding_distance: int = 3, top_k_passage: int = 5) -> Tuple[list, list]:
+    """Pick the passages most similar to the claim and map them back to their docs.
+
+    The three chunking arguments are forwarded to ``chunk_text``.
+
+    Returns:
+        ``(topk_passages, passage_doc_id)``: the ``top_k_passage`` best passages, and for each
+        one the indices of the docs containing it verbatim (possibly an empty list).
+    """
+    passages: List[str] = []
+    for doc in selected_docs:
+        passages.extend(chunk_text(doc, max_sentences_per_passage, filter_sentence_len, sliding_distance))
+    # Drop repeats while keeping first-seen order; list(set(...)) made score ties
+    # resolve differently from run to run because str hashing is randomized.
+    passages = list(dict.fromkeys(passages))
+    print("{} snippets of text are splitted.".format(len(passages)))
+
+    # score each snippet of text against claim
+    nlp = spacy.load("en_core_web_sm")
+    claim_doc = nlp(claim)
+    sim = [claim_doc.similarity(nlp(p)) for p in passages]
+
+    # sort by similarity score and keep topk
+    index_sorted_sim = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)
+    topk_passages = [passages[i] for i in index_sorted_sim[:top_k_passage]]
+
+    # One passage may occur in several docs; an unmatched passage keeps an empty list.
+    passage_doc_id: List[list] = []
+    for p in topk_passages:
+        matches = [doc_idx for doc_idx, doc in enumerate(selected_docs) if p in doc]
+        if not matches:
+            print("Error in matching selected passage to its docs!")
+        passage_doc_id.append(matches)
+    return topk_passages, passage_doc_id
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_cp.py b/src/openfactcheck/solvers/webservice/factcheckgpt_cp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a43b07a55d6755898de53e1fc22464af4be3a78
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_cp.py
@@ -0,0 +1,70 @@
+import nltk
+import spacy
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .factcheckgpt_utils.openai_api import gpt
+from .factcheckgpt_utils.data_util import save_to_file
+from .factcheckgpt_utils.prompt import DOC_TO_INDEPEDENT_SENTENCES_PROMPT, SENTENCES_TO_CLAIMS_PROMPT, \
+ DOC_TO_SENTENCES_PROMPT, CHECKWORTHY_PROMPT_BOOL, SPECIFY_CHECKWORTHY_CATEGORY_PROMPT
+
+@Solver.register("factcheckgpt_claimprocessor", "response", "claims")
+class FactCheckGPTClaimProcessor(StandardTaskSolver):
+    """Decompose a response into claims with an LLM and keep the check-worthy ones."""
+    def __init__(self, args):
+        super().__init__(args)
+        self.model = self.global_config.get("factcheckgpt_model", "gpt-3.5-turbo")
+        self.num_retries = self.global_config.get("num_retries", 3)
+        self.mode = args.get("mode", "independent_sentences")
+        self.decompose_system_role = "You are good at decomposing and decontextualizing text."
+        self.worthines_filter_system_role = "You are a helpful factchecker assistant."
+        # The solver YAML spells this key "rule_based_method"; the old "rule_based_tool" key is still read.
+        self.rule_based_method = args.get("rule_based_method", args.get("rule_based_tool", "spacy"))
+        self.spacy_model = args.get("spacy_model", "en_core_web_sm")
+        self.prompt = {
+            "sentences": DOC_TO_SENTENCES_PROMPT,
+            "independent_sentences": DOC_TO_INDEPEDENT_SENTENCES_PROMPT,
+            "claims": SENTENCES_TO_CLAIMS_PROMPT
+        }.get(self.mode, DOC_TO_INDEPEDENT_SENTENCES_PROMPT)
+        nlp = spacy.load(self.spacy_model)
+        splitters = {
+            "nltk": lambda x: [x.strip() for x in nltk.sent_tokenize(x) if len(x.strip()) >= 3],
+            "spacy": lambda x: [x.text.strip() for x in nlp(x).sents if len(x.text.strip()) >= 3]
+        }
+        # Unknown method names fall back to the NLTK splitter (the old fallback was the non-callable string "nltk").
+        self.rule_based_tool = splitters.get(self.rule_based_method, splitters["nltk"])
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Decompose the response and filter by check-worthiness in one step; returns ``(ok, state)``."""
+        response = state.get(self.input_name)
+        claims = [response]  # kept as the single claim when the model output cannot be evaluated
+
+        user_input = self.prompt.format(doc=response).strip()
+        r = gpt(user_input, model=self.model, system_role=self.decompose_system_role, num_retries=self.num_retries)
+        try:
+            claims = eval(r)  # NOTE(review): eval() runs raw model output; ast.literal_eval would be safer
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}.")
+            save_to_file(r)
+        if not isinstance(claims, list):
+            print(f"{self.model} output {r}. It does not output a list of sentences correctly, return rule-based split results.")
+            claims = self.rule_based_tool(response)
+
+        worthiness = [True] * len(claims)
+        user_input = CHECKWORTHY_PROMPT_BOOL.format(claims=claims)
+        response = gpt(user_input, model=self.model, system_role=self.worthines_filter_system_role, num_retries=self.num_retries)
+        # TODO refine check worthiness prompt, value returned not reasonable.
+        try:
+            worthiness = eval(response)  # NOTE(review): same eval() concern as for the claims
+            assert len(worthiness) == len(claims)
+        except AssertionError as e:
+            print(f"An unexpected error occurred: {e}")
+            print(f"There are {len(claims)} texts, while {len(worthiness)} checkworthy predictions.")
+            return False, state
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}")
+            return False, state
+        valid_claims = list(map(lambda x: x[1], filter(lambda x: x[0], zip(worthiness, claims))))
+        state.set(self.output_name, valid_claims)
+        return True, state
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_rtv.py b/src/openfactcheck/solvers/webservice/factcheckgpt_rtv.py
new file mode 100644
index 0000000000000000000000000000000000000000..99e7a70e6ef318329c78c4ad1009b19c1cfade79
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_rtv.py
@@ -0,0 +1,322 @@
+import re
+import bs4
+import torch
+import spacy
+import backoff
+import requests
+import itertools
+import numpy as np
+import concurrent.futures
+from copy import deepcopy
+from openai import RateLimitError
+from sentence_transformers import CrossEncoder
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .factcheckgpt_utils.openai_api import gpt
+from .factcheckgpt_utils.prompt import QGEN_PROMPT, QGEN_PROMPT_FMT
+from .factcheckgpt_utils.data_util import save_txt, save_json
+
+@Solver.register("factcheckgpt_retriever", "claims", "claims_with_evidences")
+class FactCheckGPTRetriever(StandardTaskSolver):
+    """Generate search questions for each claim and retrieve ranked web passages as evidence."""
+    def __init__(self, args):
+        """Load the spaCy pipeline and both cross-encoders, then read the retrieval settings from ``args``."""
+        super().__init__(args)
+        self.model = self.global_config.get("factcheckgpt_model", "gpt-3.5-turbo")
+        self.num_retries = self.global_config.get("num_retries", 3)
+        self.tokenizer = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer"])
+
+        # Both cross-encoders run on the first CUDA device when one is available, else on the CPU.
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        self.question_duplicate_model = CrossEncoder('navteca/quora-roberta-base', device=device)
+        self.passage_ranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512, device=device)
+
+        # Question generation.
+        self.qgen_system_role = "You are a student full of curiosity"
+        self.n_questions = args.get("n_questions", 5)
+        self.question_gen_round = args.get("question_gen_round", 1)
+        self.qgen_temp = args.get("qgen_temp", 0.7)
+        # Web search and passage selection.
+        self.search_timeout = args.get("search_timeout", 10)
+        self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
+        self.max_passages_per_search_result_to_return = args.get("max_passages_per_search_result_to_return", 3)
+        self.sentences_per_passage = args.get("sentences_per_passage", 5)
+        self.max_passages_per_question = args.get("max_passages_per_question", 5)
+        self.max_aggregated_evidences = args.get("max_aggregated_evidences", 5)
+        # Files the intermediate questions and passages are written to.
+        self.question_persist_path = args.get("question_persist_path", 'questions.txt')
+        self.snippets_persist_path = args.get("snippets_persist_path", "passage.json")
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Store ``{claim: [(question, evidence text), ...]}`` in ``state``; returns ``(True, state)``."""
+        claims_with_evidences = {}
+        for claim in state.get(self.input_name):
+            aggregated = self.get_web_evidences_for_claim(claim)['aggregated']
+            claims_with_evidences[claim] = [(question, item['text']) for question, item in aggregated]
+        state.set(self.output_name, claims_with_evidences)
+        return True, state
+
+    def generate_questions(self, claim, max_loop=5):
+        """Generate de-duplicated search questions for ``claim``; empty list if ``max_loop`` attempts all fail."""
+        attempts = (self.run_question_generation(claim) for _ in range(max_loop))  # lazy: at most max_loop model calls
+        questions = next(filter(None, attempts), [])  # first non-empty result (the old loop could spin forever)
+        if questions:  # remove_duplicate_questions seeds itself from element 0, so skip it when empty
+            questions = self.remove_duplicate_questions(questions)
+        save_txt(questions, self.question_persist_path)
+        return questions
+
+    def retrieve_documents(self, questions):
+        """Retrieve passages for every question, keeping the best-scoring ones.
+        Returns a dict mapping each question to at most ``max_passages_per_question`` passages, highest
+        ``retrieval_score`` first; the dict is also written to ``snippets_persist_path``.
+        """
+        snippets = {}
+        for question in questions:
+            ranked = sorted(self.get_relevant_snippets(question), key=lambda p: p['retrieval_score'], reverse=True)
+            snippets[question] = ranked[:self.max_passages_per_question]
+        save_json(snippets, self.snippets_persist_path)
+        return snippets
+
+    def get_web_evidences_for_claim(self, claim):
+        """Collect web evidence for one claim.
+
+        Returns a dict with "question_wise" (question -> ranked passages) and "aggregated", a list of
+        (question, passage) pairs capped at ``max_aggregated_evidences``. Raises RuntimeError if nothing was retrieved.
+        """
+        questions = self.generate_questions(claim)
+        snippets = self.retrieve_documents(questions)
+        evidences = {"aggregated": [], "question_wise": snippets}
+        total_snippets = sum(len(passages) for passages in snippets.values())
+        if total_snippets == 0:
+            raise RuntimeError("No passages are retrieved, check your network...")
+        if total_snippets > self.max_aggregated_evidences:
+            # Round-robin by rank: every question's best passage, then every second-best, and so on.
+            # Questions that ran out of passages are skipped (the old index arithmetic raised IndexError or repeated items).
+            deepest = max(len(passages) for passages in snippets.values())
+            ranked = [(q, ps[rank]) for rank in range(deepest) for q, ps in snippets.items() if rank < len(ps)]
+            evidences["aggregated"] = ranked[:self.max_aggregated_evidences]
+        else:
+            # A real list (not a one-shot itertools.chain) so both branches return the same re-iterable type.
+            evidences["aggregated"] = [(q, e) for q, es in snippets.items() for e in es]
+        return evidences
+
+    @backoff.on_exception(backoff.expo, RateLimitError)
+    def run_question_generation(self, claim):
+        """Ask the model for questions about ``claim`` and return them sorted and de-duplicated.
+
+        The prompt is sent ``question_gen_round`` times; a response that cannot be evaluated is reported and skipped.
+        """
+        collected = set()
+        for _ in range(self.question_gen_round):
+            prompt = QGEN_PROMPT_FMT.format(claim=claim, n=self.n_questions)
+            response = gpt(
+                prompt, model=self.model, system_role=self.qgen_system_role,
+                num_retries=self.num_retries, temperature=self.qgen_temp,
+            )
+            try:
+                # NOTE(review): eval() executes raw model output; ast.literal_eval would be a safer parser.
+                collected.update(set(eval(response)))
+            except Exception as e:
+                print(f"An unexpected error occurred: {e}.")
+        return sorted(collected)
+
+    def remove_duplicate_questions(self, all_questions):
+        """Keep a question only if the duplicate model scores it below 0.60 against every question kept so far."""
+        qset = list(all_questions[:1])  # slicing keeps an empty input empty instead of raising IndexError
+        for question in all_questions[1:]:
+            scores = self.question_duplicate_model.predict([(q, question) for q in qset])
+            if np.max(scores) < 0.60:
+                qset.append(question)
+        return qset
+
+    def scrape_url(self, url: str, timeout: float = 3) -> tuple[str, str]:
+        """Download a page and return its visible text with whitespace collapsed.
+
+        Args:
+            url: Address of the page to download.
+            timeout: Timeout handed to ``requests.get``.
+
+        Returns:
+            A ``(web_text, url)`` pair; ``web_text`` is ``None`` when the request
+            fails (including HTTP error statuses) or the HTML cannot be parsed.
+        """
+        failure = (None, url)
+
+        # Step 1: fetch the page; any requests-level problem aborts the scrape.
+        try:
+            page = requests.get(url, timeout=timeout)
+            page.raise_for_status()
+        except requests.exceptions.RequestException:
+            print("URL Error", url)
+            return failure
+
+        # Step 2: collect all text nodes; the visibility filter stays lazy.
+        try:
+            nodes = bs4.BeautifulSoup(page.text, "html.parser").findAll(text=True)
+            rendered = filter(self.is_tag_visible, nodes)
+        except Exception:
+            print("Parsing Error", page.text)
+            return failure
+
+        # Step 3: join the stripped fragments, then squeeze runs of whitespace.
+        joined = " ".join(fragment.strip() for fragment in rendered).strip()
+        web_text = " ".join(joined.split())
+        return web_text, url
+
+    def is_tag_visible(self, element: bs4.element) -> bool:
+        """Tell whether a text node belongs to the rendered part of a page.
+
+        Args:
+            element: Node taken from a BeautifulSoup tree.
+        Returns:
+            False when the node's parent tag is one of the non-rendered
+            containers listed below, or when the node is an HTML comment;
+            True otherwise.
+        """
+        hidden_parents = ("style", "script", "head", "title", "meta", "[document]")
+        # Text directly inside these tags is treated as invisible.
+        # The parent tag is inspected first, before the comment test.
+        if element.parent.name in hidden_parents:
+            return False
+        if isinstance(element, bs4.element.Comment):
+            return False
+        return True
+
+    def search_google(self, query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = '') -> list[str]:
+        """Searches the query using Google.
+        Args:
+            query: Search query.
+            num_web_pages: the number of web pages to request.
+            timeout: Timeout in seconds of each request to Google.
+            save_url: path to save returned urls, such as 'urls.txt'
+        Returns:
+            search_results: Unique URLs found in the result pages, in order of first appearance.
+        """
+        from urllib.parse import quote_plus
+
+        # Percent-encode the query (spaces still become "+") so characters such as
+        # "&", "#" or "?" inside a question cannot alter the request parameters.
+        query = quote_plus(query)
+
+        # set headers: Google returns different web-pages according to agent device
+        # desktop user-agent
+        USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
+        headers = {'User-Agent': USER_AGENT}
+
+        # hl sets the Google interface language, lr=lang_XX the preferred language of the results.
+        # English is forced; per the original note, other settings return many translated pages that cannot be opened.
+        lang = "en"
+
+        # scrape google results
+        urls = []
+        for page in range(0, num_web_pages, 10):
+            # "page" is the result offset: the second result page starts at start=10
+            url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
+            r = requests.get(url, headers=headers, timeout=timeout)
+            # collect every absolute link of the returned HTML
+            urls += re.findall('href="(https?://.*?)"', r.text)
+
+        # Remove repeated urls but keep their order: get_relevant_snippets slices this
+        # list, and list(set(...)) handed it the links in arbitrary order.
+        urls = list(dict.fromkeys(urls))
+
+        # save all url into a txt file
+        if not save_url == "":
+            with open(save_url, 'w', encoding='utf-8') as file:
+                for url in urls:
+                    file.write(url + '\n')
+        return urls
+
+    def chunk_text(
+        self,
+        text: str,
+        tokenizer,
+        sentences_per_passage: int = 5,
+        filter_sentence_len: int = 250,
+        sliding_distance: int = 2,
+    ) -> list[str]:
+        """Cut text into sentence windows that may overlap.
+
+        Args:
+            text: Text to split; only its first 500000 characters are tokenized.
+            tokenizer: Callable whose result exposes ``sents``, each item having ``text``.
+            sentences_per_passage: Window size, in sentences.
+            filter_sentence_len: Sentences with more characters than this are dropped.
+            sliding_distance: Step between window starts; the window size is used when it is falsy or larger.
+        Returns:
+            ``(passage, first, last)`` tuples: the space-joined sentences, the index of the first kept
+            sentence in the window, and ``first + sentences_per_passage - 1``.
+        """
+        step = sliding_distance
+        if not step or step > sentences_per_passage:
+            step = sentences_per_passage
+        assert sentences_per_passage > 0 and step > 0
+
+        passages = []
+        try:
+            # Newlines inside a sentence become spaces; overlong sentences are usually metadata.
+            kept = [
+                sent.text.replace("\n", " ")
+                for sent in tokenizer(text[:500000]).sents
+                if len(sent.text) <= filter_sentence_len
+            ]
+            for first in range(0, len(kept), step):
+                passages.append((" ".join(kept[first:first + sentences_per_passage]), first, first + sentences_per_passage - 1))
+        except UnicodeEncodeError:
+            print("Unicode error when using Spacy. Skipping text.")
+        return passages
+
+    def get_relevant_snippets(self, query):
+        """Search the web for ``query`` and return its best passages.
+
+        Returns a list of dicts with the keys "text", "url", "sents_per_passage" and
+        "retrieval_score" (cross-encoder relevance of the passage to the query).
+        """
+        search_results = self.search_google(query, timeout=self.search_timeout)
+        # Pages are downloaded concurrently; map() yields the results in input order.
+        with concurrent.futures.ThreadPoolExecutor() as e:
+            scraped_results = e.map(self.scrape_url, search_results, itertools.repeat(self.search_timeout))
+        # Remove URLs if we weren't able to scrape anything or if they are a PDF.
+        scraped_results = [r for r in scraped_results if r[0] and ".pdf" not in r[1]]
+        retrieved_passages = list()
+        for webtext, url in scraped_results[:self.max_search_results_per_query]:
+            passages = self.chunk_text(
+                text=webtext,
+                tokenizer=self.tokenizer,
+                sentences_per_passage=self.sentences_per_passage
+            )
+            if not passages:
+                continue
+            # Score the passages by relevance to the query using a cross-encoder.
+            scores = self.passage_ranker.predict([(query, p[0]) for p in passages]).tolist()
+            # Best-scoring passages first.
+            passage_scores = sorted(zip(passages, scores), reverse=True, key=lambda x: x[1])
+
+            relevant_items = list()
+            for passage_item, score in passage_scores:
+                overlap = False
+                if len(relevant_items) > 0:
+                    for item in relevant_items:
+                        if passage_item[1] >= item[1] and passage_item[1] <= item[2]:
+                            overlap = True
+                            break
+                        if passage_item[2] >= item[1] and passage_item[2] <= item[2]:
+                            overlap = True
+                            break
+
+                # Only consider top non-overlapping relevant passages to maximise for information
+                if not overlap:
+                    relevant_items.append(deepcopy(passage_item))
+                    retrieved_passages.append(
+                        {
+                            "text": passage_item[0],
+                            "url": url,
+                            "sents_per_passage": self.sentences_per_passage,
+                            "retrieval_score": score,  # Cross-encoder score as retr score
+                        }
+                    )
+                    if len(relevant_items) >= self.max_passages_per_search_result_to_return:
+                        break
+        return retrieved_passages
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_utils/__init__.py b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_utils/data_util.py b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/data_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1099285c6e2d493da717517d3f8cf5ed20fc1acb
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/data_util.py
@@ -0,0 +1,136 @@
+import csv
+import json
+import numpy as np
+from collections import Counter
+from typing import Dict, List, Any
+
+
+def save_to_file(text, filename='error_output.txt'):
+    """Append `text` followed by a newline to `filename` (UTF-8).
+
+    The file is opened in append mode, so repeated calls accumulate one
+    entry per call.
+    """
+    with open(filename, mode='a', encoding='utf-8') as handle:
+        handle.write(text + '\n')
+
+
+def majority_vote(input_list):
+    """Return the most frequent element of `input_list`.
+
+    Ties go to the element that was seen first, because `Counter` keeps
+    insertion order and `max` returns the first maximal key.
+    """
+    occurrences = Counter(input_list)
+    return max(occurrences, key=occurrences.get)
+
+
+def is_float(string):
+    """Return True if `string` is an unsigned decimal number such as "12" or "3.14".
+
+    At most one decimal point is tolerated. Signs, exponents and surrounding
+    whitespace are not recognised, so "-1.5" and "1e3" yield False.
+    """
+    # Drop at most ONE "." before the numeric check; removing every dot made
+    # malformed inputs such as "1.2.3" collapse to "123" and pass.
+    return string.replace(".", "", 1).isnumeric()
+
+
+def save_json(dictionary: Dict[str, Any], save_dir: str) -> None:
+    """Write `dictionary` to the file at `save_dir` as 4-space-indented JSON.
+
+    Despite its name, `save_dir` is the path of the output file. Non-ASCII
+    characters are written as-is (UTF-8) rather than escaped.
+    """
+    # Serialise before opening, so the target is only touched once encoding succeeded.
+    serialised = json.dumps(dictionary, ensure_ascii=False, indent=4)
+    with open(save_dir, mode="w", encoding='utf-8') as sink:
+        sink.write(serialised)
+
+
+def read_json(filepath: str) -> Dict[str, Any]:
+    """Parse the UTF-8 JSON file at `filepath` and return the decoded object."""
+    with open(filepath, mode='r', encoding='utf-8') as source:
+        return json.load(source)
+
+
+def list_to_dict(data: List[Dict[str, Any]]) -> Dict[int, Any]:
+    """Key every item of `data` by its 0-based position in the list."""
+    return dict(enumerate(data))
+
+
+def load_jsonl(path):
+    """Read a JSON Lines file and return its records as a list.
+
+    Blank or whitespace-only lines (for example a trailing empty line) are
+    skipped instead of raising `json.JSONDecodeError`.
+    """
+    with open(path, 'r', encoding='utf-8') as reader:
+        return [json.loads(line) for line in reader if line.strip()]
+
+
+# def load_jsonl(input_path) -> list:
+# """
+# Read list of objects from a JSON lines file.
+# """
+# data = []
+# with open(input_path, 'r', encoding='utf-8') as f:
+# for line in f:
+# data.append(json.loads(line.rstrip('\n|\r')))
+# print('Loaded {} records from {}'.format(len(data), input_path))
+# return data
+
+def dump_jsonl(data, output_path, append=False):
+    """
+    Serialise every item of `data` as one JSON document per line.
+
+    The file is extended when `append` is true and overwritten otherwise;
+    a one-line summary is printed once all records are written.
+    """
+    open_mode = 'w' if not append else 'a+'
+    with open(output_path, open_mode, encoding='utf-8') as sink:
+        for record in data:
+            sink.write(json.dumps(record, ensure_ascii=False) + '\n')
+    print('Wrote {} records to {}'.format(len(data), output_path))
+
+
+def cosine(u, v):
+    """Return the cosine similarity of the two embedding vectors `u` and `v`."""
+    dot_product = np.dot(u, v)
+    magnitude = np.linalg.norm(u) * np.linalg.norm(v)
+    return dot_product / magnitude
+
+
+def read_csv(input_file, quotechar=None):
+    """Read a tab-separated UTF-8 file and return its rows as a list of lists."""
+    with open(input_file, "r", encoding="utf-8") as handle:
+        tsv_reader = csv.reader(handle, delimiter="\t", quotechar=quotechar)
+        return list(tsv_reader)
+
+
+def save_csv(header, data, output_file):
+    """Write `header` followed by every row of `data` to a tab-separated file."""
+    with open(output_file, 'w', encoding='UTF8', newline='') as sink:
+        tsv_writer = csv.writer(sink, delimiter='\t')
+        # Header row first, then all data rows in one call.
+        tsv_writer.writerow(header)
+        tsv_writer.writerows(data)
+
+
+def save_array(filename, embeddings):
+    """Persist `embeddings` to `filename` using `np.save`."""
+    with open(filename, mode='wb') as sink:
+        np.save(sink, embeddings)
+
+
+def load_array(filename):
+    """Load and return the array stored at `filename` using `np.load`."""
+    with open(filename, mode='rb') as source:
+        return np.load(source)
+
+
+def read_txt(input_file):
+    """Return all lines of the UTF-8 text file, line endings included."""
+    with open(input_file, mode="r", encoding="utf-8") as source:
+        return list(source)
+
+
+def save_txt(data, output_file):
+    """Write the strings in `data` to `output_file`, one per line.
+
+    Items are joined with a newline; nothing is appended after the last one.
+    """
+    with open(output_file, mode="w", encoding="utf-8") as sink:
+        content = "\n".join(data)
+        sink.write(content)
+
+
+def clean_text(text):
+    """Remove runs of repeated marker characters from `text`.
+
+    For each marker in turn, runs of 5, 4, 3 and then 2 consecutive copies
+    are deleted, longest first. Single occurrences are left in place.
+    """
+    for marker in ('"', '-', '\t', ' '):
+        for run_length in range(5, 1, -1):
+            text = text.replace(marker * run_length, '')
+    return text
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_utils/eval_utils.py b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/eval_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4efea3d83fe19e9de654a5dfdc71338c742d4b4
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/eval_utils.py
@@ -0,0 +1,41 @@
+# code for general evaluation
+
+import numpy as np
+import evaluate
+from sklearn.metrics import precision_recall_fscore_support, accuracy_score
+
+def evaluate_classification(preds, gold):
+    """Compute the "bstrai/classification_report" metric for `preds` against `gold`."""
+    report_metric = evaluate.load("bstrai/classification_report")
+    return report_metric.compute(references=gold, predictions=preds)
+
+def eval_classification(y_true, y_pred, average="macro"):
+    """Return accuracy, precision, recall and F1, each rounded to 3 decimals.
+
+    `average` is forwarded to `precision_recall_fscore_support`.
+    """
+    prf_scores = precision_recall_fscore_support(y_true, y_pred, average=average)
+    # The fourth element (support) is not reported.
+    precision, recall, F1 = prf_scores[0], prf_scores[1], prf_scores[2]
+    accuracy = accuracy_score(y_true, y_pred)
+
+    return {
+        "accuracy": round(accuracy, 3),
+        "precision": round(precision, 3),
+        "recall": round(recall, 3),
+        "F1": round(F1, 3),
+    }
+
+
+def eval_binary(y_true, y_pred, pos_label=1, average="binary"):
+    """Return accuracy, precision, recall and F1, each rounded to 3 decimals.
+
+    pos_label: the positive label; here machine text is labelled 1 and
+        human text 0.
+    average: forwarded to `precision_recall_fscore_support`.
+    """
+    prf_scores = precision_recall_fscore_support(
+        y_true, y_pred, pos_label=pos_label, average=average)
+    # The fourth element (support) is not reported.
+    precision, recall, F1 = prf_scores[0], prf_scores[1], prf_scores[2]
+    accuracy = accuracy_score(y_true, y_pred)
+    return {
+        "accuracy": round(accuracy, 3),
+        "precision": round(precision, 3),
+        "recall": round(recall, 3),
+        "F1": round(F1, 3),
+    }
+
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_utils/nli.py b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/nli.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7d6161ea7568f8688f2a44ea9b7a4a11e95f41c
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/nli.py
@@ -0,0 +1,44 @@
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
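+# Load the roberta-large-mnli model and tokenizer directly and wrap them in a pipeline.
+# "sentiment-analysis" is the pipeline alias for text classification; the model itself predicts NLI labels.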
+# classifier = pipeline("sentiment-analysis", model="roberta-large-mnli")
+
+# NOTE: the tokenizer, model and pipeline are built when this module is imported.
+tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
+classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+
+# Integer id for each NLI label string.
+nli_labelmap = {
+    "NEUTRAL": 3,
+    "CONTRADICTION":2,
+    "ENTAILMENT": 1
+}
+
+# Signed stance for each NLI label string; this is what nli_infer returns.
+nli2stance = {
+    "NEUTRAL": 0,
+    "CONTRADICTION": -1,
+    "ENTAILMENT": 1
+}
+
+# Integer id for each stance name; partial and complete support share id 1.
+stance_map = {
+    'irrelevant': 3,
+    'refute': 2,
+    'partially-support': 1,
+    'completely-support': 1
+}
+
+def nli_infer(premise, hypothesis):
+    """Classify one premise/hypothesis example with the NLI pipeline.
+
+    Returns:
+        The `nli2stance` value of the top predicted label:
+        1 for ENTAILMENT, 0 for NEUTRAL, -1 for CONTRADICTION.
+    """
+    # NOTE(review): premise and hypothesis are concatenated with no separator,
+    # so the model receives a single sequence rather than a sentence pair —
+    # confirm this is intended.
+    try:
+        pred = classifier("{}{}".format(premise, hypothesis))
+    except Exception:
+        # Presumably the input exceeded the model's maximum length (token
+        # length > 514): retry once with the first half of the premise.
+        # `Exception` instead of a bare `except` lets KeyboardInterrupt and
+        # SystemExit propagate.
+        shortened_premise = premise[:len(premise) // 2]
+        pred = classifier("{}{}".format(shortened_premise, hypothesis))
+    # pred looks like [{'label': 'CONTRADICTION', 'score': 0.9992701411247253}]
+    return nli2stance[pred[0]['label']]
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_utils/openai_api.py b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/openai_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6bea110a0af6bccd250ac6c8be7bcacfa48afbf
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/openai_api.py
@@ -0,0 +1,75 @@
+import os
+import time
+from openai import OpenAI
+import openai
+
+client = None
+
+
+def init_client():
+    """Create the module-level OpenAI client on first use.
+
+    Does nothing when a client already exists. When neither `openai.api_key`
+    nor the `OPENAI_API_KEY` environment variable is set, a notice is printed
+    and `client` stays None so initialisation can be retried later.
+    """
+    global client
+    if client is not None:
+        return
+    if openai.api_key is None and 'OPENAI_API_KEY' not in os.environ:
+        print("openai_key not presented, delay to initialize.")
+        return
+    # Hand the module-level key over explicitly: `OpenAI()` on its own only
+    # reads the environment variable, so a key configured through
+    # `openai.api_key` would pass the guard above and then be ignored.
+    # With `api_key=None` the client still falls back to OPENAI_API_KEY.
+    client = OpenAI(api_key=openai.api_key)
+
+
+def request(
+    user_inputs,
+    model,
+    system_role,
+    temperature=1.0,
+    return_all=False,
+):
+    """Send one chat-completion request and return the reply.
+
+    Args:
+        user_inputs: a single user message (str), a list of strings that are
+            assigned alternating "user" / "assistant" roles starting with
+            "user", or a list of ready-made message dicts.
+        model: model name passed to the chat-completions endpoint.
+        system_role: content of the system message placed before the history.
+        temperature: sampling temperature.
+        return_all: when true, return the raw response object instead of text.
+
+    Returns:
+        The raw response if `return_all` is true, otherwise the concatenated
+        message content of all returned choices.
+
+    Raises:
+        ValueError: if `user_inputs` has an unsupported type or mixes types.
+        RuntimeError: if the client could not be initialised (no API key).
+    """
+    init_client()
+
+    if isinstance(user_inputs, str):
+        chat_histories = [{"role": "user", "content": user_inputs}]
+    elif isinstance(user_inputs, list):
+        if all(isinstance(x, str) for x in user_inputs):
+            chat_histories = [
+                {
+                    "role": "user" if i % 2 == 0 else "assistant", "content": x
+                } for i, x in enumerate(user_inputs)
+            ]
+        elif all(isinstance(x, dict) for x in user_inputs):
+            chat_histories = user_inputs
+        else:
+            raise ValueError("Invalid input for OpenAI API calling")
+    else:
+        raise ValueError("Invalid input for OpenAI API calling")
+
+    # init_client() leaves `client` as None when no API key is available;
+    # fail with an explicit message instead of an AttributeError on None.
+    if client is None:
+        raise RuntimeError(
+            "OpenAI client is not initialized: set OPENAI_API_KEY or openai.api_key."
+        )
+
+    messages = [{"role": "system", "content": system_role}] + chat_histories
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        temperature=temperature
+    )
+    if return_all:
+        return response
+    # A choice's content can be None; treat it as an empty string so the
+    # concatenation does not fail.
+    return ''.join(choice.message.content or '' for choice in response.choices)
+
+
+def gpt(
+    user_inputs,
+    model,
+    system_role,
+    temperature=1.0,
+    num_retries=3,
+    waiting=1
+):
+    """Call `request` with retries on OpenAI errors.
+
+    Args:
+        user_inputs, model, system_role, temperature: forwarded to `request`.
+        num_retries: maximum number of attempts.
+        waiting: seconds to sleep between two attempts.
+
+    Returns:
+        The text returned by `request`, or None if every attempt raised
+        `openai.OpenAIError`.
+    """
+    response = None
+    for attempt in range(num_retries):
+        try:
+            response = request(user_inputs, model, system_role, temperature=temperature)
+            break
+        except openai.OpenAIError as exception:
+            print(f"{exception}. Retrying...")
+            # Only wait when another attempt follows; sleeping after the
+            # final failure just delays returning None.
+            if attempt < num_retries - 1:
+                time.sleep(waiting)
+    return response
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_utils/prompt.py b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c796792186111d827e6a9e9a8d2336af35eaebf
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_utils/prompt.py
@@ -0,0 +1,566 @@
+"""All prompts used for fact-checking subtasks prompting."""
+
+# Updates in Dec are dominated by function-based or code-based prompts,
+# to avoid having to parse free-form LLM results
+# ------------------------------------------------------------------------
+# Dec 2023: decompose and decontextualise, return a list
+# ------------------------------------------------------------------------
+DOC_TO_INDEPEDENT_SENTENCES_PROMPT = """
+Your task is to perform sentence segmentation and de-contextualization.
+Let's define a function named process(input:str).
+The return value should be a list of strings, where each string should be a decontextualized sentence.
+For example, if a user call process("Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.").
+You should return a python list without any other words,
+["Mary is a five-year old girl.", "Mary likes playing piano.", "Mary doesn't like cookies."]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+process("{doc}")
+"""
+
+SENTENCES_TO_CLAIMS_PROMPT = """
+Your task is to decompose the text into atomic claims.
+Let's define a function named decompose(input:str).
+The returned value should be a list of strings, where each string should be a context-independent claim, representing one fact.
+For example, if a user call decompose("Mary is a five-year old girl, she likes playing piano and she doesn't like cookies.").
+You should return a python list without any other words:
+["Mary is a five-year old girl.", "Mary likes playing piano.", "Mary doesn't like cookies."]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+decompose("{doc}")
+"""
+
+# Sentence splitting only, without decontextualization: the example output keeps
+# the original pronouns. Takes one named field, {doc}.
+DOC_TO_SENTENCES_PROMPT = """
+Your task is to perform sentence segmentation.
+Let's define a function named split(input:str).
+The return value should be a list of strings, where each string should be a sentence.
+For example, if a user call split("Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.").
+You should return a python list without any other words,
+["Mary is a five-year old girl.", "She likes playing piano.", "She doesn't like cookies."]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+split("{doc}")
+"""
+
+# ------------------------------------------------------------------------
+# Dec 2023: identify checkworthy
+# ------------------------------------------------------------------------
+CHECKWORTHY_PROMPT = """
+Your task is to identify whether texts are checkworthy in the context of fact-checking.
+Let's define a function named checkworthy(input: List[str]).
+The return value should be a list of strings, where each string selects from ["Yes", "No"].
+"Yes" means the text is a factual checkworthy statement.
+"No" means that the text is not checkworthy, it might be an opinion, a question, or others.
+For example, if a user call checkworthy(["I think Apple is a good company.", "Friends is a great TV series.", "Are you sure Preslav is a professor in MBZUAI?", "The Stanford Prison Experiment was conducted in the basement of Encina Hall.", "As a language model, I can't provide these info."])
+You should return a python list without any other words,
+["No", "Yes", "No", "Yes", "No"]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+
+checkworthy({texts})
+"""
+
+CHECKWORTHY_PROMPT_BOOL = """
+Your task is to identify whether texts are checkworthy in the context of fact-checking.
+Let's define a function named checkworthy(input: List[str]).
+The return value should be a list of bool values: [True, False].
+True means the text is a factual checkworthy statement.
+False means that the text is not checkworthy, it might be an opinion, a question, or others.
+For example, if a user call checkworthy(["I think Apple is a good company.", "Friends is a great TV series.", "Are you sure Preslav is a professor in MBZUAI?", "The Stanford Prison Experiment was conducted in the basement of Encina Hall.", "As a language model, I can't provide these info."])
+You should return a python list without any other words,
+[False, True, False, True, False]
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+
+checkworthy({claims})
+"""
+
+SPECIFY_CHECKWORTHY_CATEGORY_PROMPT = """
+You are a factchecker assistant with task to identify a sentence, whether it is 1. a factual claim; 2. an opinion; 3. not a claim (like a question or a imperative sentence); 4. other categories.
+Let's define a function named checkworthy(input: str).
+The return value should be a python int without any other words, representing index label, where index selects from [1, 2, 3, 4].
+
+For example, if a user call checkworthy("I think Apple is a good company.")
+You should return 2
+If a user call checkworthy("Friends is a great TV series.")
+You should return 1
+If a user call checkworthy("Are you sure Preslav is a professor in MBZUAI?")
+You should return 3
+If a user call checkworthy("As a language model, I can't provide these info.")
+You should return 4
+Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!
+
+checkworthy("{sentence}")
+"""
+
+# ------------------------------------------------------------------------
+# Dec 2023: Verify
+# ------------------------------------------------------------------------
+IDENTIFY_STANCE_PROMPT = """You are given a claim and an evidence text, and you need to decide whether the evidence supports, refutes, or is irrelevant to the claim. Choose from the following three options.
+A. The evidence supports the claim.
+B. The evidence refutes the claim.
+C. The evidence is irrelevant to the claim.
+
+For example, you are give Claim: "Preslav is a professor.", Evidence: "Preslav Nakov is a Professor in MBZUAI NLP group, and also the department chair." You should return A
+Pick the correct option A, B, C without other words.
+
+Claim: {claim}
+Evidence: {evidence}"""
+
+# Function-style stance prompt with named fields {claim} and {evidence}.
+# The literal return-type braces are doubled so that str.format() leaves them
+# in place instead of treating "-1,0,1" as a replacement field.
+IDENTIFY_STANCE_PROMPT_FUNC = """
+Let's define a function named verify(claim:str, evidence:str) -> {{-1,0,1}}
+You are given a claim and an evidence text as input, and you need to decide whether the evidence supports, refutes, or is irrelevant to the claim. Choose from the following three options as the return value.
+1: The evidence supports the claim.
+-1: The evidence refutes the claim.
+0: The evidence is irrelevant to the claim.
+
+For example, when the user call verify(claim="Preslav is a professor.", evidence="Preslav Nakov is a Professor in MBZUAI NLP group, and also the department chair.")
+You should return 1
+Pick the correct option -1, 0, 1 without other words.
+
+verify(claim="{claim}",evidence="{evidence}")"""
+
+
+# , which correspond to the reasoning, whether the given text is factual or not (Boolean - True or False), the factual error present in the text, and the corrected text.
+
+VERIFY_PROMPT = """
+You are given a piece of text. Your task is to identify whether there are any factual errors within the text.
+When you are judging the factuality of the given text, you could reference the provided evidences if needed. The provided evidences may be helpful. Some evidences may contradict to each other. You must be careful when using the evidences to judge the factuality of the given text.
+The response should be a Python dict with four keys - "reasoning", "error", "correction" and "factuality".
+The following is the given text:
+[text]: {claim}
+The following is the provided evidences:
+[evidences]: {evidence}
+You should only respond in format as described below. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '{{'.
+[response format]:
+{{
+ "reasoning": "Why is the given text factual or non-factual? Be careful when you said something is non-factual. When you said something is non-factual, you must provide multiple evidences to support your decision.",
+ "error": "None if the text is factual; otherwise, describe the error in string.",
+ "correction": "A string, the corrected text if there is an error.",
+ "factuality": "An int value, 1 stands for the given text is factual, -1 is for non-factual, and 0 for irrelevant."
+}}
+"""
+# ------------------------------------------
+# Oct 2023
+# ------------------------------------------
+zero_shot_sentence_checkworthiness = """You are a factchecker assistant with task to identify sentences that are checkworthy. Sentence is checkworthy only if it contains factual claims.
+Classify the check-worthiness of these sentences, and output the label yes or no:
+{sentence}
+output:
+"""
+
+zero_shot_claim_checkworthiness = """You are a factchecker assistant with task to identify a sentence, whether it is 1. a factual claim; 2. an opinion; 3. not a claim (like a question or a imperative sentence); 4. other categories. \n
+Output the label index only: \n
+{claim} \n
+output:
+"""
+
+# We find that it is hard for model to distinguish complete support and partial support, merge as the one: support
+zero_shot_claim_evidence_three_stance_prompt = "### Instruction: You are given a claim and an evidence text, and you need to decide whether the evidence supports, refutes, or is irrelevant to the claim.\n\n### Input:\n\nClaim: {claim}\n\nEvidence: {evidence}\n\nOptions are as follows:\n A) The evidence supports the claim.\n\n B) The evidence refutes the claim.\n C) The evidence is irrelevant to the claim.\n\n Pick the correct option. \n\n### Final Answer: "
+
+zero_shot_claim_evidence_stance = """Given the evidence \n {evidence}, determine if the following statement is completely supported, partially supported, refuted or is irrelevant: {claim}, choose from four labels: 1. completely support, 2. partially support, 3. refute and 4. irrelevant.
+Return the label index only.
+Label index:
+"""
+
+zero_shot_nli = """Given the premise sentence {}, determine if the following statement is entailed or contradicted or neutral: {}, by three labels: entailment, contradiction, neutral.
+Label:
+"""
+
+zero_shot_edit_response = """Given a document containing factual errors, please correct the errors in the document depending on a corresponding list of factually true claims. Note that preserve the linguistic features and style of the original document, just correct factual errors.
+
+document: {response}
+
+true claims: {claims}
+
+revised document: """
+
+zero_shot_edit_response_given_question = """Given a question, and an answer containing factual errors, please correct the errors in the document depending on a corresponding list of factually true claims. Note that preserve the linguistic features and style of the original document, just correct factual errors.
+
+question: {prompt}
+
+document: {response}
+
+true claims: {claims}
+
+revised document: """
+
+# -------------------------------------------------------------------
+# July 2023: decompose and decontextualise into atomic claims
+# -------------------------------------------------------------------
+# ZERO_SHOT_SENTENCE_TO_ATOMIC_CLAIMS = """Depending the context: {}, please breakdown the following sentence into independent facts and replace pronouns such as it, they, those, these, this, that, with specific entities or events.
+# The sentence is: {}
+# Atomic facts for this sentence are: """
+
+ZERO_SHOT_SENTENCE_TO_ATOMIC_CLAIMS = """Depending the context: {}, please breakdown the following sentence into independent facts.
+The sentence is: {}
+Atomic facts for this sentence are: """
+
+# Few-shot prompt that breaks one sentence into atomic facts. Takes two
+# positional {} fields: the context and the sentence. Every example list is
+# comma-separated so the demonstrated output format is a well-formed list.
+FEW_SHOT_SENTENCE_TO_ATOMIC_CLAIMS = """Depending the context, please breakdown the following sentence into independent facts.
+
+Context: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021. Obama was the first black president in the history of the United States. He was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya. Trump was the second black president. He was born in New York City and previously served as a businessman and reality television personality.
+
+The sentence is: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021.
+Atomic facts for this sentence are:
+[
+ "The United States has had two black presidents: Barack Obama and Donald Trump.",
+ "Black president Barack Obama served two terms from 2009 to 2017.",
+ "Black president Donald Trump served one term from 2017 to 2021."
+]
+
+The sentence is: Obama was the first black president in the history of the United States.
+Atomic facts for this sentence are:
+[
+ "Obama was the first black president in the history of the United States."
+]
+
+The sentence is: He was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya.
+Atomic facts for this sentence are:
+[
+ "Barack Obama was born in Honolulu, Hawaii.",
+ "Barack Obama mother was from Kansas.",
+ "Barack Obama father was from Kenya."
+]
+
+The sentence is: Trump was the second black president.
+Atomic facts for this sentence are:
+[
+ "Trump was the second black president."
+]
+
+The sentence is: He was born in New York City and previously served as a businessman and reality television personality.
+Atomic facts for this sentence are:
+[
+ "Donald Trump was born in New York City.",
+ "Donald Trump previously served as a businessman",
+ "Donald Trump previously served as a reality television personality."
+]
+
+
+Context: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas. He was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975. Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+The sentence is: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas.
+Atomic facts for this sentence are:
+[
+ "In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas."
+]
+
+The sentence is: He was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975.
+Atomic facts for this sentence are:
+[
+ "Justice William O. Douglas was born on October 16, 1898.",
+ "Justice William O. Douglas served on the Supreme Court from 1939 until his retirement in 1975."
+]
+
+The sentence is: Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+Atomic facts for this sentence are:
+[
+ "Therefore, in 1980, Justice Douglas was still alive.",
+ "Justice William O. Douglas would have been the oldest serving justice on the Court in 1980."
+]
+
+
+Context: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group. The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+The sentence is: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group.
+Atomic facts for this sentence are:
+[
+ "There have been only four female presidents of the United States in the country's history.",
+ "It is difficult to determine an average height for four female presidents of the United States."
+]
+
+The sentence is: The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+Atomic facts for this sentence are:
+[
+ "Abigail Adams (1797-1801) is a female president of the United States.",
+ "Marilyn Carlson Nelson (2009-2013) is a female president of the United States.",
+ "Luci Baines Johnson (1973-1977) is a female president of the United States.",
+ "Hillary Clinton (2017-2021) is a female president of the United States."
+]
+
+
+Context: {}
+The sentence is: {}
+Atomic facts for this sentence are:
+"""
+
+# This prompt aims to break the document into decontextualised sentences, and then atomic claims
+# Though it can not decontexlualize sentences, it can better break all sentences than the prompt above
+# combined with using system_role = "You are good at document decomposition and decontextualization."
+# date: 22/10/2023
+# Few-shot prompt that splits a context into sentences and each sentence into
+# atomic facts. Takes two positional {} fields: the context and the sentence.
+# Every example list is comma-separated so the demonstrated output format is a
+# well-formed list.
+FEW_SHOT_DECONTEXTUALIZE_SENTENCE_ATOMIC_CLAIMS = """Depending the context, please break it down into independent sentences, and breakdown the sentence into independent facts.
+Context: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021. Obama was the first black president in the history of the United States. He was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya. Trump was the second black president. He was born in New York City and previously served as a businessman and reality television personality.
+
+The sentence is: The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021.
+Atomic facts for this sentence are:
+[
+ "The United States has had two black presidents: Barack Obama and Donald Trump.",
+ "Black president Barack Obama served two terms from 2009 to 2017.",
+ "Black president Donald Trump served one term from 2017 to 2021."
+]
+
+The sentence is: Obama was the first black president in the history of the United States.
+Atomic facts for this sentence are:
+[
+ "Obama was the first black president in the history of the United States."
+]
+
+The sentence is: Barack Obama was born in Honolulu, Hawaii, to a mother from Kansas and a father from Kenya.
+Atomic facts for this sentence are:
+[
+ "Barack Obama was born in Honolulu, Hawaii.",
+ "Barack Obama mother was from Kansas.",
+ "Barack Obama father was from Kenya."
+]
+
+The sentence is: Trump was the second black president.
+Atomic facts for this sentence are:
+[
+ "Trump was the second black president."
+]
+
+The sentence is: Donald Trump was born in New York City and previously served as a businessman and reality television personality.
+Atomic facts for this sentence are:
+[
+ "Donald Trump was born in New York City.",
+ "Donald Trump previously served as a businessman",
+ "Donald Trump previously served as a reality television personality."
+]
+
+
+Context: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas. He was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975. Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+The sentence is: In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas.
+Atomic facts for this sentence are:
+[
+ "In 1980, the oldest justice on the United States Supreme Court was Justice William O. Douglas."
+]
+
+The sentence is: Justice William O. Douglas was born on October 16, 1898, and served on the Supreme Court from 1939 until his retirement in 1975.
+Atomic facts for this sentence are:
+[
+ "Justice William O. Douglas was born on October 16, 1898.",
+ "Justice William O. Douglas served on the Supreme Court from 1939 until his retirement in 1975."
+]
+
+The sentence is: Therefore, in 1980, Justice Douglas was still alive and would have been the oldest serving justice on the Court at that time.
+Atomic facts for this sentence are:
+[
+ "Therefore, in 1980, Justice Douglas was still alive.",
+ "Justice William O. Douglas would have been the oldest serving justice on the Court in 1980."
+]
+
+
+Context: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group. The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+The sentence is: There have been only four female presidents of the United States in the country's history, so it is difficult to determine an average height for this group.
+Atomic facts for this sentence are:
+[
+ "There have been only four female presidents of the United States in the country's history.",
+ "It is difficult to determine an average height for four female presidents of the United States."
+]
+
+The sentence is: The four female presidents were: \r\n1.Abigail Adams (1797-1801) \r\n2.Marilyn Carlson Nelson (2009-2013) \r\n3.Luci Baines Johnson (1973-1977) \r\n4.Hillary Clinton (2017-2021)
+Atomic facts for this sentence are:
+[
+ "Abigail Adams (1797-1801) is a female president of the United States.",
+ "Marilyn Carlson Nelson (2009-2013) is a female president of the United States.",
+ "Luci Baines Johnson (1973-1977) is a female president of the United States.",
+ "Hillary Clinton (2017-2021) is a female president of the United States."
+]
+
+
+Context: {}
+The sentence is: {}
+Atomic facts for this sentence are:
+"""
+
+# -------------------------------------------------------------------
+# April 2023: overall simple pipeline prompts
+# -------------------------------------------------------------------
+# Few-shot decontextualisation prompt; it ends with "Input: " so the caller
+# appends the document. Both examples use the same "Output:" label.
+DECONTEXTILISATION_PROMPT = """Decompose and decontextualise a document into independently meaningful sentences. This process will make each sentence stand alone that can be verified independently.
+
+Input: Mary is a five-year old girl. She likes playing piano. She doesn't like cookies.
+Output:
+Mary is a five-year old girl.
+Mary likes playing piano.
+Mary doesn't like cookies.
+
+Input: Google began as an online search firm, but it now offers more than 50 Internet services and products, from e-mail and online document creation to software for mobile phones and tablet computers. In addition, its 2012 acquisition of Motorola Mobility put it in the position to sell hardware in the form of mobile phones.
+Output:
+Google began as an online search firm.
+Google now offers more than 50 Internet services and products.
+Google offers from e-mail and online document creation to software for mobile phones and tablet computers.
+Google 2012 acquisition of Motorola Mobility put it in the position to sell hardware in the form of mobile phones.
+
+Input: """
+
+# Few-shot prompt that labels a claim as opinion/factual and as checkworthy or
+# not; the claim to classify is appended after the examples.
+CHECK_WORTHINESS_LABEL_ONLY_PROMPT = """Identify whether this claim is an opinion or factual, and whether it is checkworthy or not in the context of fact-checking. Just return two labels without explanation.
+I think Apple is a good company.
+opinion, not checkworthy
+Preslav is a professor in MBZUAI.
+factual, checkworthy
+Friends is a great TV series.
+opinion, not checkworthy
+The Stanford Prison Experiment was conducted in the basement of Encina Hall.
+factual, checkworthy
+"""
+
+# Few-shot prompt for listing the entities mentioned in a claim as a
+# comma-separated line. It ends with an open "Input: " slot; presumably the
+# caller appends the claim text there (confirm against the call site).
+ENTITY_EXTRACTION_PROMPT = """Extract all entities of a claim.
+Input: Google now offers more than 50 Internet services and products.
+Output: Google, Internet services, product
+Input: Donald John Trump is an American politician, media personality, and businessman.
+Output: Donald John Trump, American politician, media personality, businessman
+Input: """
+
+# Few-shot prompt that produces search queries for a claim, one per line,
+# mixing natural-language questions with keyword-style queries.
+# NOTE(review): the "_DEP" suffix suggests this is a deprecated variant of
+# QGEN_PROMPT -- confirm before relying on it.
+# It ends with an open "Input: " slot; presumably the caller appends the claim.
+QGEN_PROMPT_DEP = """Give a list of queries using for searching related information for a claim.
+Input: Google now offers more than 50 Internet services and products.
+Output: What does Google offers now?
+How many service and product does Google offer?
+Google, more than 50 Internet services, products
+Input: Donald John Trump is an American politician, media personality, and businessman.
+Output: Who is Donald John Trump?
+Give information of Donald John Trump.
+Donald John Trump, American politician
+Donald John Trump, media personality
+Donald John Trump, businessman
+Input: """
+
+# Few-shot question-generation prompt written as a "You said / I googled"
+# dialogue. The named placeholder {claim} is the only format field; the
+# surrounding whitespace is removed by .strip(), so the rendered prompt ends
+# right after "To verify it," and the model continues with numbered
+# "I googled:" lines.
+QGEN_PROMPT = """I will check things you said and ask questions.
+
+You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+To verify it,
+1. I googled: Does your nose switch between nostrils?
+2. I googled: How often does your nostrils switch?
+3. I googled: Why does your nostril switch?
+4. I googled: What is nasal cycle?
+
+You said: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+To verify it,
+1. I googled: Where was Stanford Prison Experiment was conducted?
+
+You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+To verify it,
+1. I googled: What does Havel-Hakimi algorithm do?
+2. I googled: Who are Havel-Hakimi algorithm named after?
+
+You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+To verify it,
+1. I googled: Who sings the song "Time of My Life"?
+2. I googled: Which film is the song "Time of My Life" from?
+3. I googled: Who produced the song "Time of My Life"?
+
+You said: Kelvin Hopins was suspended from the Labor Party due to his membership in the Conservative Party.
+To verify it,
+1. I googled: Why was Kelvin Hopins suspended from Labor Party?
+
+You said: Social work is a profession that is based in the philosophical tradition of humanism. It is an intellectual discipline that has its roots in the 1800s.
+To verify it,
+1. I googled: What philosophical tradition is social work based on?
+2. I googled: What year does social work have its root in?
+
+You said: {claim}
+To verify it,
+""".strip()
+
+# Question-generation prompt that asks for exactly N questions returned as a
+# Python list literal. Format fields: {claim} (the claim text) and {n} (the
+# number of questions). Each example pairs a claim with questions about that
+# same claim, so the first example uses the nasal-cycle claim that its four
+# questions actually refer to.
+QGEN_PROMPT_FMT = '''
+You need to ask N questions based on the provided claim.
+Here are some examples:
+- Claim:
+Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+- N=4
+- Questions you may response:
+["Does your nose switch between nostrils?", "How often does your nostrils switch?", "Why does your nostril switch?", "What is nasal cycle?"]
+
+- Claim:
+The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+- N=1
+- Questions you may response:
+["Where was Stanford Prison Experiment was conducted?"]
+
+- Claim:
+The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+- N=2
+- Questions you may response:
+["What does Havel-Hakimi algorithm do?", "Who are Havel-Hakimi algorithm named after?"]
+
+Remember, you need to put your questions into a python list so that I will search them with the search engine API, so DON'T RETURN ANY OTHER IRRELEVANT WORDS!
+- Claim:
+{claim}
+- N={n}
+'''.strip()
+
+# Few-shot stance-detection prompt with four labels: support, partial support,
+# refute, other. It has two positional "{}" format fields, in this order:
+# the claim, then the evidence. The prompt ends with "Stance: " so the model
+# answers with a label, optionally followed by an explanation.
+STANCE_DETECTION_PROMPT = """Determine whether the evidence support the claim or not. Choose label from [support, partial support, refute, other] and explain why.
+Support means we can entail the claim by the evidence.
+Partial support means: part of the information presented in the claim appear in the evidence.
+Refute means that the evidence mention the same event as the claim, but a clear opposite fact. It should be highlighed that under refute, the evidence mentions the fact in the claim, they are closely relevant, but opposite meaning or stance.
+Other means the evidence does not mention anything about the fact described in the claim, such that it neither supports nor refutes the claim.
+
+Claim: Elon Musk is the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: support, statement 'he is also the founder, CEO and chief engineer of SpaceX' in evidence above supports the claim.
+
+Claim: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the founder, CEO and chief engineer of SpaceX.
+Stance: partial support.
+
+Claim: Steve Jobs is the founder, CEO and chief engineer of SpaceX.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: refute.
+
+Claim: Elon Musk is a professor in The Stanford University.
+Evidence: Elon Musk is the owner and CEO of Twitter, and he is also the founder, CEO and chief engineer of SpaceX.
+Stance: other, according to the evidence, I cannot judge whether the claim is true or not, not enough information, the evidence neither supports nor refutes.
+
+Claim: On January 6, 2021, a mob of supporters of former President Donald Trump stormed the U.S. Capitol in an attempt to overturn the 2020 presidential election.
+Evidence: On January 6, 2021, following the defeat of U.S. President Donald Trump in the 2020 presidential election, a mob of his supporters attacked the United States Capitol Building in Washington, D.C. The mob sought to keep Trump in power by preventing a joint session of Congress from counting the electoral college votes to formalize the victory of President-elect Joe Biden.
+Stance: support.
+
+Claim: The 2021 Capitol Hill riots resulted in the deaths of five people, including a Capitol police officer.
+Evidence: Five people died either shortly before, during, or following the riot: one was shot by Capitol Police, another died of a drug overdose, and three died of natural causes.
+Stance: partial support, the evidence supports that fact that five deaths, but not sure whether they include a Capitol police officer or not.
+
+Claim: More than 300 people have been charged with crimes related to the riots.
+Evidence: As of November 10, 2022, over 940 people had been charged in the Capitol breach.
+Stance: refute, evidence and claim are describing the same thing, the number of people who was charged is over 940, while more than 300 in the claim, so the evidence refutes the claim.
+
+Claim: More than 300 people have been charged with crimes related to the riots.
+Evidence: The laptop computer taken from Pelosi's office was taken by 22-year-old Capitol rioter Riley Williams. Williams was arrested and indicted on eight counts, including theft of government property, obstructing an official proceeding, and assaulting or resisting police.
+Stance: other, the evidence demonstrates something relevent to the fact in the claim, but it does not support or refute any information of it.
+
+Claim: {}
+Evidence: {}
+Stance: """
+
+# Few-shot claim-editing prompt: given a claim and one piece of evidence, the
+# model first states what the evidence shows to be wrong ("This suggests ...")
+# and then emits the corrected claim after "Fix:". Format fields are the named
+# placeholders {claim} and {evidence}. Because of .strip(), the rendered
+# prompt ends with "This suggests" and no trailing whitespace.
+EDITOR_PROMPT = """Fix the claim according to the evidence.
+
+Claim: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+Evidence: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+This suggests 45 minutes switch time in your statement is wrong.
+Fix: Your nose switches back and forth between nostrils. When you sleep, you switch about every 2 hours. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+
+Claim: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+Evidence: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+This suggests General Thomas Hall in your statement is wrong.
+Fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+Claim: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+Evidence: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+This suggests Encina Hall in your statement is wrong.
+Fix: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+Claim: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+Evidence: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+Fix: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. It is named after Vaclav Havel and Samih Hakimi.
+
+Claim: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Phil Ramone.
+Evidence: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+This suggests "Time of My Life" producer name in your statement is wrong.
+Fix: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+
+Claim: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+Evidence: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+This suggests the 1.4 million square feet of built-up space in your statment is wrong.
+Fix: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+Claim: {claim}
+Evidence: {evidence}
+This suggests
+""".strip()
diff --git a/src/openfactcheck/solvers/webservice/factcheckgpt_vfr.py b/src/openfactcheck/solvers/webservice/factcheckgpt_vfr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f0255db785f700c8454fe5995506a91874f219
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/factcheckgpt_vfr.py
@@ -0,0 +1,119 @@
+import json
+from typing import Any
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .factcheckgpt_utils.prompt import VERIFY_PROMPT
+from .factcheckgpt_utils.openai_api import gpt
+from .factcheckgpt_utils.data_util import save_to_file
+from .factcheckgpt_utils.prompt import IDENTIFY_STANCE_PROMPT, IDENTIFY_STANCE_PROMPT_FUNC
+from .factcheckgpt_utils.nli import nli_infer
+
+@Solver.register("factcheckgpt_verifier", "claims_with_evidences", "label")
+class FactCheckGPTVerifier(StandardTaskSolver):
+    """Judge the factuality of claims against their retrieved evidences.
+
+    Each claim is first sent to the chat model with ``VERIFY_PROMPT``. When the
+    reply cannot be parsed into a dict that carries a ``"factuality"`` entry,
+    the verdict falls back to aggregating per-evidence stance labels
+    (1 = support, -1 = refute, 0 = irrelevant).
+    """
+
+    def __init__(self, args):
+        """Read solver options from ``args`` and the global configuration.
+
+        Args:
+            args: Mapping of solver options. ``stance_model`` (default
+                ``"gpt-3.5-turbo"``) and ``verify_retries`` (default 3) are
+                read here; ``num_retries`` (default 3) comes from
+                ``self.global_config``.
+        """
+        super().__init__(args)
+        self.stance_model = args.get("stance_model", "gpt-3.5-turbo")
+        self.num_retries = self.global_config.get("num_retries", 3)
+        # The system role is fixed and not configurable through ``args``.
+        self.system_role = "You are a helpful factchecker assistant."
+        self.verify_retries = args.get("verify_retries", 3)
+        self.stance_map = {
+            1: "support",
+            -1: "refute",
+            0: "irrelevant",
+        }
+
+    @staticmethod
+    def _parse_model_output(text):
+        """Parse a model reply as a Python literal, falling back to JSON.
+
+        ``ast.literal_eval`` is used instead of ``eval`` because the text comes
+        from a language model and must never be executed as code.
+
+        Raises:
+            ValueError: If ``text`` is neither a Python literal nor valid JSON.
+        """
+        import ast
+
+        try:
+            return ast.literal_eval(text)
+        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as literal_error:
+            try:
+                return json.loads(text)
+            except (TypeError, ValueError) as json_error:
+                raise ValueError(
+                    f"not a Python literal ({literal_error}) and not JSON ({json_error})"
+                ) from json_error
+
+    def verify_by_stance(
+        self, claim: str,
+        evidences: list[str],
+    ) -> Any:
+        """Aggregate per-evidence stance labels into one label for ``claim``.
+
+        Returns:
+            1 if at least one evidence supports the claim; otherwise -1 if at
+            least one evidence refutes it; otherwise 0 (all irrelevant, or no
+            evidence at all).
+        """
+        labels = [self.stance(evidence, claim) for evidence in evidences]
+
+        # A single supporting evidence is enough to accept the claim.
+        if 1 in labels:
+            return 1
+        # No support, but at least one refutation: regard the claim as false.
+        if -1 in labels:
+            return -1
+        # Everything was irrelevant.
+        return 0
+
+    def identify_stance_gpt(self, evidence, claim):
+        """Ask the chat model for the stance of ``evidence`` towards ``claim``.
+
+        Returns:
+            1, -1 or 0. Replies that cannot be parsed, or that parse to
+            anything other than a key of ``self.stance_map``, yield 0.
+        """
+        user_input = IDENTIFY_STANCE_PROMPT_FUNC.format(claim=claim, evidence=evidence)
+        r = gpt(
+            user_input,
+            model=self.stance_model,
+            system_role=self.system_role,
+            num_retries=self.num_retries
+        )
+        try:
+            label = self._parse_model_output(r)
+        except ValueError as e:
+            print(f"An unexpected error occurred: {e}.")
+            return 0
+        # Only accept the labels the aggregation rules know about.
+        if isinstance(label, (int, float)) and label in self.stance_map:
+            return int(label)
+        print(f"An unexpected error occurred: unknown stance label {label!r}.")
+        return 0
+
+    def stance(self, evidence, claim, model="gpt-3.5-turbo"):
+        """Return the stance label (1, -1 or 0) of ``evidence`` towards ``claim``.
+
+        The backend is chosen by ``self.stance_model``: ``"nli"`` uses
+        ``nli_infer``; any name containing ``"gpt"`` uses the chat model.
+        The ``model`` parameter is not used by this method; it is kept so
+        existing callers keep working.
+        """
+        label = 0
+        if self.stance_model == "nli":
+            label = nli_infer(premise=evidence, hypothesis=claim)
+        elif "gpt" in self.stance_model:
+            label = self.identify_stance_gpt(evidence, claim)
+        else:
+            print("Check the model argument, choose either gpt or nli model")
+        return label
+
+    def verify_claim(self, claim: str, evidences: list[str]) -> dict[str, Any]:
+        """Verify one claim and return a result dict.
+
+        The chat model is queried up to ``self.verify_retries`` times until its
+        reply can be parsed. Unparsable replies are saved to
+        ``verification_error.txt``.
+
+        Returns:
+            The parsed reply when it is a dict containing ``"factuality"``.
+            Otherwise a dict with empty ``reasoning``/``error``/``correction``
+            and ``factuality`` set to the stance-aggregation label.
+        """
+        results = None
+        user_input = VERIFY_PROMPT.format(claim=claim, evidence=evidences)
+        r = ''
+        for _ in range(self.verify_retries):
+            r = gpt(
+                user_input,
+                model=self.stance_model,
+                system_role=self.system_role,
+                num_retries=self.num_retries,
+            )
+            try:
+                results = self._parse_model_output(r)
+            except ValueError as e:
+                print(f"An unexpected error occurred to parse {r}: {e}.")
+                save_to_file(r, "verification_error.txt")
+                continue
+            # Stop as soon as a reply parses, whichever parser succeeded, so a
+            # usable answer never triggers further model calls.
+            break
+
+        if isinstance(results, dict) and "factuality" in results:
+            return results
+
+        print(f"Error output {r}. It does not output a dict, return factual label by stance aggregation.")
+        factual_label = self.verify_by_stance(claim, evidences)
+        return {
+            "reasoning": "",
+            "error": "",
+            "correction": "",
+            "factuality": factual_label
+        }
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Verify every claim in the state and store the overall verdict.
+
+        Reads a mapping of claim -> evidences from ``self.input_name``; the
+        second element of each evidence entry is passed to the verifier.
+        Writes ``True`` to ``self.output_name`` only when every claim has a
+        ``factuality`` greater than 0 (vacuously ``True`` when there are no
+        claims) and stores the per-claim results under ``"detail"``.
+        """
+        claims_with_evidences = state.get(self.input_name)
+        results = []
+        for claim, evidences in claims_with_evidences.items():
+            result = self.verify_claim(claim, [x[1] for x in evidences])
+            result["claim"] = claim
+            result["evidences"] = evidences
+            results.append(result)
+        state.set(self.output_name, all(x['factuality'] > 0 for x in results))
+        state.set("detail", results)
+        return True, state
diff --git a/src/openfactcheck/solvers/webservice/facttool_utils/__init__.py b/src/openfactcheck/solvers/webservice/facttool_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/webservice/facttool_utils/chat_api.py b/src/openfactcheck/solvers/webservice/facttool_utils/chat_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4632ff472817d1ba1ef94c4129c4cc1e9b6a9fd5
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/facttool_utils/chat_api.py
@@ -0,0 +1,225 @@
+# the async version is adapted from https://gist.github.com/neubig/80de662fb3e225c18172ec218be4917a
+
+from __future__ import annotations
+
+import os
+import yaml
+import openai
+import ast
+import pdb
+import asyncio
+from typing import Any, List
+import os
+import pathlib
+import openai
+from openai import OpenAI, AsyncOpenAI
+import re
+
+
+# from factool.env_config import factool_env_config
+
+# env
+# openai.api_key = factool_env_config.openai_api_key
+
+class OpenAIChat():
+    """Batch chat-completion client that returns parsed Python values.
+
+    Requests are dispatched concurrently through an ``AsyncOpenAI`` client and
+    each reply is parsed with ``ast.literal_eval`` and checked against an
+    expected type.
+    """
+
+    def __init__(
+            self,
+            model_name='gpt-3.5-turbo',
+            max_tokens=2500,
+            temperature=0,
+            top_p=1,
+            request_timeout=120,
+    ):
+        """Create the client and remember the sampling configuration.
+
+        Model names without ``"gpt"`` are served from
+        ``http://localhost:8000/v1``; all others require ``OPENAI_API_KEY``.
+        """
+        if 'gpt' not in model_name:
+            # Kept for backward compatibility, but openai>=1.0 clients ignore
+            # this legacy module attribute, so the endpoint is also passed to
+            # the client explicitly below.
+            openai.api_base = "http://localhost:8000/v1"
+            self.client = AsyncOpenAI(
+                base_url="http://localhost:8000/v1",
+                # NOTE(review): assumes the local server does not validate the
+                # key, so a placeholder is sent when none is set -- confirm.
+                api_key=os.environ.get("OPENAI_API_KEY") or "EMPTY",
+            )
+        else:
+            openai.api_key = os.environ.get("OPENAI_API_KEY", None)
+            assert openai.api_key is not None, "Please set the OPENAI_API_KEY environment variable."
+            assert openai.api_key != '', "Please set the OPENAI_API_KEY environment variable."
+            self.client = AsyncOpenAI()
+
+        self.config = {
+            'model_name': model_name,
+            'max_tokens': max_tokens,
+            'temperature': temperature,
+            'top_p': top_p,
+            'request_timeout': request_timeout,
+        }
+
+    def extract_list_from_string(self, input_string):
+        """Return the substring from the first '[' to the last ']', or None."""
+        start_index = input_string.find('[')
+        end_index = input_string.rfind(']')
+
+        if start_index != -1 and end_index != -1 and start_index < end_index:
+            return input_string[start_index:end_index + 1]
+        return None
+
+    def extract_dict_from_string(self, input_string):
+        """Return the substring from the first '{' to the last '}', or None."""
+        start_index = input_string.find('{')
+        end_index = input_string.rfind('}')
+
+        if start_index != -1 and end_index != -1 and start_index < end_index:
+            return input_string[start_index:end_index + 1]
+        return None
+
+    def _boolean_fix(self, output):
+        """Rewrite JSON-style ``true``/``false`` as Python ``True``/``False``.
+
+        This is a plain substring replacement, so occurrences inside quoted
+        text are rewritten as well.
+        """
+        return output.replace("true", "True").replace("false", "False")
+
+    def _type_check(self, output, expected_type):
+        """Parse ``output`` as a Python literal and check its type.
+
+        Returns:
+            The parsed value when it is an instance of ``expected_type``;
+            ``None`` when parsing fails or the type does not match.
+        """
+        try:
+            output_eval = ast.literal_eval(output)
+        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
+            return None
+        if not isinstance(output_eval, expected_type):
+            return None
+        return output_eval
+
+    async def dispatch_openai_requests(
+            self,
+            messages_list,
+    ) -> list[str]:
+        """Dispatches requests to OpenAI API asynchronously.
+
+        Args:
+            messages_list: List of messages to be sent to OpenAI ChatCompletion API.
+        Returns:
+            List of responses from OpenAI API, in the same order as
+            ``messages_list``. An entry is ``None`` when every attempt for that
+            request failed.
+        """
+
+        async def _request_with_retry(messages, retry=3):
+            for _ in range(retry):
+                try:
+                    return await self.client.chat.completions.create(
+                        model=self.config['model_name'],
+                        messages=messages,
+                        max_tokens=self.config['max_tokens'],
+                        temperature=self.config['temperature'],
+                        top_p=self.config['top_p'],
+                        # Apply the configured per-request timeout.
+                        timeout=self.config['request_timeout'],
+                    )
+                except openai.APIError:
+                    # In openai>=1.0, ``openai.Timeout`` is the httpx timeout
+                    # configuration class, not an exception, so it cannot be
+                    # used in an ``except`` clause. Rate-limit and timeout
+                    # errors are both subclasses of ``APIError``, so this one
+                    # handler covers every retryable case.
+                    await asyncio.sleep(1)
+
+            return None
+
+        async_responses = [
+            _request_with_retry(messages)
+            for messages in messages_list
+        ]
+
+        return await asyncio.gather(*async_responses)
+
+    def run(self, messages_list, expected_type):
+        """Send every conversation and return the parsed replies.
+
+        Args:
+            messages_list: One list of chat messages per request.
+            expected_type: Type each parsed reply must be an instance of.
+        Returns:
+            A list aligned with ``messages_list``; an entry is ``None`` when
+            the request failed, the reply was empty, or the reply did not
+            parse to ``expected_type``.
+        """
+        retry = 1
+        responses = [None] * len(messages_list)
+        pending = list(range(len(messages_list)))
+
+        while retry > 0 and pending:
+            print(f'{retry} retry left...')
+            batch = [messages_list[i] for i in pending]
+
+            predictions = asyncio.run(self.dispatch_openai_requests(
+                messages_list=batch,
+            ))
+
+            still_pending = []
+            for index, prediction in zip(pending, predictions):
+                parsed = None
+                if prediction is not None:
+                    content = prediction.choices[0].message.content
+                    # The content can be absent; treat that as a failed reply.
+                    if content is not None:
+                        parsed = self._type_check(self._boolean_fix(content), expected_type)
+                if parsed is None:
+                    still_pending.append(index)
+                else:
+                    responses[index] = parsed
+
+            pending = still_pending
+            retry -= 1
+
+        return responses
+
+# class OpenAIEmbed():
+# def __init__():
+# openai.api_key = os.environ.get("OPENAI_API_KEY", None)
+# assert openai.api_key is not None, "Please set the OPENAI_API_KEY environment variable."
+# assert openai.api_key != '', "Please set the OPENAI_API_KEY environment variable."
+
+# async def create_embedding(self, text, retry=3):
+# for _ in range(retry):
+# try:
+# response = await openai.Embedding.acreate(input=text, model="text-embedding-ada-002")
+# return response
+# except openai.error.RateLimitError:
+# print('Rate limit error, waiting for 1 second...')
+# await asyncio.sleep(1)
+# except openai.error.APIError:
+# print('API error, waiting for 1 second...')
+# await asyncio.sleep(1)
+# except openai.error.Timeout:
+# print('Timeout error, waiting for 1 second...')
+# await asyncio.sleep(1)
+# return None
+
+# async def process_batch(self, batch, retry=3):
+# tasks = [self.create_embedding(text, retry=retry) for text in batch]
+# return await asyncio.gather(*tasks)
+
+# if __name__ == "__main__":
+# chat = OpenAIChat(model_name='llama-2-7b-chat-hf')
+
+# predictions = asyncio.run(chat.async_run(
+# messages_list=[
+# [{"role": "user", "content": "show either 'ab' or '['a']'. Do not do anything else."}],
+# ] * 20,
+# expected_type=List,
+# ))
+
+# print(predictions)
+# Usage
+# embed = OpenAIEmbed()
+# batch = ["string1", "string2", "string3", "string4", "string5", "string6", "string7", "string8", "string9", "string10"] # Your batch of strings
+# embeddings = asyncio.run(embed.process_batch(batch, retry=3))
+# for embedding in embeddings:
+# print(embedding["data"][0]["embedding"])
diff --git a/src/openfactcheck/solvers/webservice/facttool_utils/prompts.yaml b/src/openfactcheck/solvers/webservice/facttool_utils/prompts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8e4ed08d6d2aefcf7f1d473c392a502db8d64a67
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/facttool_utils/prompts.yaml
@@ -0,0 +1,67 @@
+claim_extraction:
+ system: |-
+ You are a brilliant assistant.
+ user: |-
+ You are given a piece of text that includes knowledge claims. A claim is a statement that asserts something as true or false, which can be verified by humans. Your task is to accurately identify and extract every claim stated in the provided text. Then, resolve any coreference (pronouns or other referring expressions) in the claim for clarity. Each claim should be concise (less than 15 words) and self-contained.
+ Your response MUST be a list of dictionaries. Each dictionary should contains the key "claim", which correspond to the extracted claim (with all coreferences resolved).
+ You MUST only respond in the format as described below. DO NOT RESPOND WITH ANYTHING ELSE. ADDING ANY OTHER EXTRA NOTES THAT VIOLATE THE RESPONSE FORMAT IS BANNED. START YOUR RESPONSE WITH '['.
+ [response format]:
+ [
+ {{
+ "claim": "Ensure that the claim is fewer than 15 words and conveys a complete idea. Resolve any coreference (pronouns or other referring expressions) in the claim for clarity",
+ }},
+ ...
+ ]
+
+ Here are two examples:
+ [text]: Tomas Berdych defeated Gael Monfis 6-1, 6-4 on Saturday. The sixth-seed reaches Monte Carlo Masters final for the first time . Berdych will face either Rafael Nadal or Novak Djokovic in the final.
+ [response]: [{{"claim": "Tomas Berdych defeated Gael Monfis 6-1, 6-4"}}, {{"claim": "Tomas Berdych defeated Gael Monfis 6-1, 6-4 on Saturday"}}, {{"claim": "Tomas Berdych reaches Monte Carlo Masters final"}}, {{"claim": "Tomas Berdych is the sixth-seed"}}, {{"claim": "Tomas Berdych reaches Monte Carlo Masters final for the first time"}}, {{"claim": "Berdych will face either Rafael Nadal or Novak Djokovic"}}, {{"claim": "Berdych will face either Rafael Nadal or Novak Djokovic in the final"}}]
+
+ [text]: Tinder only displays the last 34 photos - but users can easily see more. Firm also said it had improved its mutual friends feature.
+ [response]: [{{"claim": "Tinder only displays the last photos"}}, {{"claim": "Tinder only displays the last 34 photos"}}, {{"claim": "Tinder users can easily see more photos"}}, {{"claim": "Tinder said it had improved its feature"}}, {{"claim": "Tinder said it had improved its mutual friends feature"}}]
+
+ Now complete the following,ONLY RESPONSE IN A LIST FORMAT, NO OTHER WORDS!!!:
+ [text]: {input}
+ [response]:
+
+query_generation:
+ system: |-
+ You are a query generator that generates effective and concise search engine queries to verify a given claim. You only response in a python list format(NO OTHER WORDS!)
+ user: |-
+ You are a query generator designed to help users verify a given claim using search engines. Your primary task is to generate a Python list of two effective and skeptical search engine queries. These queries should assist users in critically evaluating the factuality of a provided claim using search engines.
+ You should only respond in format as described below (a Python list of queries). PLEASE STRICTLY FOLLOW THE FORMAT. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '['.
+ [response format]: ['query1', 'query2']
+
+ Here are three examples:
+ claim: The CEO of twitter is Bill Gates.
+ response: ["Who is the CEO of twitter?", "CEO Twitter"]
+
+ claim: Michael Phelps is the most decorated Olympian of all time.
+ response: ["Who is the most decorated Olympian of all time?", "Michael Phelps"]
+
+ claim: ChatGPT is created by Google.
+ response: ["Who created ChatGPT?", "ChatGPT"]
+
+ Now complete the following(ONLY RESPONSE IN A LIST FORMAT, DO NOT RETURN OTHER WORDS!!! START YOUR RESPONSE WITH '[' AND END WITH ']'):
+ claim: {input}
+ response:
+
+verification:
+ system: |-
+ You are a brilliant assistant.
+ user: |-
+ You are given a piece of text. Your task is to identify whether there are any factual errors within the text.
+ When you are judging the factuality of the given text, you could reference the provided evidences if needed. The provided evidences may be helpful. Some evidences may contradict to each other. You must be careful when using the evidences to judge the factuality of the given text.
+ The response should be a dictionary with four keys - "reasoning", "factuality", "error", and "correction", which correspond to the reasoning, whether the given text is factual or not (Boolean - True or False), the factual error present in the text, and the corrected text.
+ The following is the given text
+ [text]: {claim}
+ The following is the provided evidences
+ [evidences]: {evidence}
+ You should only respond in format as described below. DO NOT RETURN ANYTHING ELSE. START YOUR RESPONSE WITH '{{'.
+ [response format]:
+ {{
+ "reasoning": "Why is the given text factual or non-factual? Be careful when you said something is non-factual. When you said something is non-factual, you must provide multiple evidences to support your decision.",
+ "error": "None if the text is factual; otherwise, describe the error.",
+ "correction": "The corrected text if there is an error.",
+ "factuality": True if the given text is factual, False otherwise.
+ }}
\ No newline at end of file
diff --git a/src/openfactcheck/solvers/webservice/facttool_utils/search_api.py b/src/openfactcheck/solvers/webservice/facttool_utils/search_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..18e1ea5484e8680cb099adaaf895e2e9b534c47f
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/facttool_utils/search_api.py
@@ -0,0 +1,117 @@
+import asyncio
+from factool.knowledge_qa.google_serper import GoogleSerperAPIWrapper
+from factool.utils.openai_wrapper import OpenAIEmbed
+import json
+import os
+import numpy as np
+import jsonlines
+import pdb
+import aiohttp
+
+
+class GoogleSerperAPIWrapper():
+    """Wrapper around the Serper.dev Google Search API.
+
+    You can create a free API key at https://serper.dev.
+    The key is read from the environment variable ``SERPER_API_KEY``, which
+    must be set before an instance is created.
+
+    Example:
+        .. code-block:: python
+
+            google_serper = GoogleSerperAPIWrapper()
+            snippets = google_serper.run([["query one", "query two"]])
+    """
+
+    def __init__(self, snippet_cnt=10) -> None:
+        """Store search settings and read the API key from the environment.
+
+        Args:
+            snippet_cnt: Number of organic results inspected per query; half
+                of this number of snippets is kept per query.
+        """
+        self.k = snippet_cnt
+        self.gl = "us"
+        self.hl = "en"
+        self.serper_api_key = os.environ.get("SERPER_API_KEY", None)
+        assert self.serper_api_key is not None, "Please set the SERPER_API_KEY environment variable."
+        assert self.serper_api_key != '', "Please set the SERPER_API_KEY environment variable."
+
+    async def _google_serper_search_results(self, session, search_term: str, gl: str, hl: str) -> dict:
+        """POST one query to the Serper search endpoint and return its JSON body."""
+        headers = {
+            "X-API-KEY": self.serper_api_key or "",
+            "Content-Type": "application/json",
+        }
+        params = {"q": search_term, "gl": gl, "hl": hl}
+        async with session.post(
+                "https://google.serper.dev/search", headers=headers, params=params, raise_for_status=True
+        ) as response:
+            return await response.json()
+
+    def _parse_results(self, results):
+        """Turn one search response into a list of ``{"content", "source"}`` snippets.
+
+        An answer box short-circuits everything else and yields a single
+        snippet. Otherwise knowledge-graph facts and organic results are
+        collected and truncated to ``int(self.k / 2)`` entries. A placeholder
+        snippet is returned when nothing usable was found, or when ``results``
+        is not a dict (for example an exception object from a failed request).
+        """
+        no_result = {"content": "No good Google Search Result was found", "source": "None"}
+
+        # ``parallel_searches`` gathers with return_exceptions=True, so a
+        # failed request arrives here as an exception instead of a dict.
+        if not isinstance(results, dict):
+            return [no_result]
+
+        snippets = []
+
+        answer_box = results.get("answerBox")
+        if answer_box:
+            if answer_box.get("answer"):
+                return [{"content": answer_box.get("answer"), "source": "None"}]
+            if answer_box.get("snippet"):
+                return [{"content": answer_box.get("snippet").replace("\n", " "), "source": "None"}]
+            if answer_box.get("snippetHighlighted"):
+                return [{"content": answer_box.get("snippetHighlighted"), "source": "None"}]
+
+        kg = results.get("knowledgeGraph")
+        if kg:
+            title = kg.get("title")
+            entity_type = kg.get("type")
+            if entity_type:
+                snippets.append({"content": f"{title}: {entity_type}", "source": "None"})
+            description = kg.get("description")
+            if description:
+                snippets.append({"content": description, "source": "None"})
+            for attribute, value in kg.get("attributes", {}).items():
+                snippets.append({"content": f"{attribute}: {value}", "source": "None"})
+
+        # Tolerate a response without organic results rather than failing.
+        for result in results.get("organic", [])[: self.k]:
+            link = result.get("link", "None")
+            if "snippet" in result:
+                snippets.append({"content": result["snippet"], "source": link})
+            for attribute, value in result.get("attributes", {}).items():
+                snippets.append({"content": f"{attribute}: {value}", "source": link})
+
+        if not snippets:
+            return [no_result]
+
+        # Keep only the first k/2 snippets.
+        return snippets[:int(self.k / 2)]
+
+    async def parallel_searches(self, search_queries, gl, hl):
+        """Run all queries concurrently over one HTTP session.
+
+        Returns:
+            One entry per query, in order. Because the requests are gathered
+            with ``return_exceptions=True``, a failed request is returned as
+            the exception object instead of being raised.
+        """
+        async with aiohttp.ClientSession() as session:
+            tasks = [self._google_serper_search_results(session, query, gl, hl) for query in search_queries]
+            search_results = await asyncio.gather(*tasks, return_exceptions=True)
+            return search_results
+
+    def run(self, queries):
+        """Run query through GoogleSearch and parse result.
+
+        Args:
+            queries: One list of query strings per claim. ``None`` entries are
+                replaced by the two placeholder queries ``'None', 'None'``.
+        Returns:
+            One list of snippets per entry of ``queries``, built by
+            concatenating the snippets of all of that entry's queries.
+        """
+        flattened_queries = []
+        group_sizes = []
+
+        for sublist in queries:
+            if sublist is None:
+                sublist = ['None', 'None']
+            group_sizes.append(len(sublist))
+            flattened_queries.extend(sublist)
+
+        results = asyncio.run(self.parallel_searches(flattened_queries, gl=self.gl, hl=self.hl))
+        snippets_list = [self._parse_results(result) for result in results]
+
+        # Regroup by the real number of queries per claim instead of assuming
+        # exactly two, so claims with other query counts stay aligned.
+        snippets_split = []
+        offset = 0
+        for size in group_sizes:
+            merged = []
+            for snippets in snippets_list[offset:offset + size]:
+                merged.extend(snippets)
+            snippets_split.append(merged)
+            offset += size
+        return snippets_split
+
+# class google_search():
+# def __init__(self, snippet_cnt):
+# self.serper = GoogleSerperAPIWrapper(snippet_cnt=snippet_cnt)
+
+# def run(self, queries):
+# return asyncio.run(self.serper.run(queries))
diff --git a/src/openfactcheck/solvers/webservice/ftool_cp.py b/src/openfactcheck/solvers/webservice/ftool_cp.py
new file mode 100644
index 0000000000000000000000000000000000000000..d337cfe8f8b69acad92289a786473c7115f57d89
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/ftool_cp.py
@@ -0,0 +1,58 @@
+import os
+import yaml
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .facttool_utils.chat_api import OpenAIChat
+
+@Solver.register("factool_claimprocessor", "response", "claims")
+class FactoolClaimProcessor(StandardTaskSolver):
+    """Solver that asks a chat model to extract the claims contained in a response.
+
+    Registered as "factool_claimprocessor" with the names "response" and "claims".
+    """
+
+    def __init__(self, args):
+        """Create the chat client and load the "claim_extraction" prompt from prompts.yaml."""
+        super().__init__(args)
+        self.gpt_model = self.global_config.get("factool_gpt_model", "gpt-3.5-turbo")
+        self.gpt = OpenAIChat(self.gpt_model)
+        prompt_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "facttool_utils/prompts.yaml",
+        )
+        # Open through a context manager so the file handle is closed right after parsing.
+        with open(prompt_path, "r") as prompt_file:
+            self.claim_prompt = yaml.load(prompt_file, yaml.FullLoader)["claim_extraction"]
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Extract claims from the response held in `state` and store their texts.
+
+        Returns:
+            A `(True, state)` tuple; the claim strings are stored under `self.output_name`.
+        """
+        response = state.get(self.input_name)
+
+        claims = self._claim_extraction(responses=[response])[0]
+
+        # NOTE(review): OpenAIChat.run presumably returns None for a reply it could not parse
+        # as a list - confirm. Such a reply is treated as "no claims" rather than a TypeError.
+        extracted_claims = [claim["claim"] for claim in claims or []]
+
+        state.set(self.output_name, extracted_claims)
+        return True, state
+
+    def _claim_extraction(self, responses):
+        """Send one system + user message pair per response to the chat model."""
+        messages_list = [
+            [
+                {"role": "system", "content": self.claim_prompt["system"]},
+                {
+                    "role": "user",
+                    "content": self.claim_prompt["user"].format(input=response),
+                },
+            ]
+            for response in responses
+        ]
+        return self.gpt.run(messages_list, list)
diff --git a/src/openfactcheck/solvers/webservice/ftool_rtv.py b/src/openfactcheck/solvers/webservice/ftool_rtv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5c71f4650f7f7f5fe25a9afed6bae53087b79e4
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/ftool_rtv.py
@@ -0,0 +1,61 @@
+import os
+import yaml
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+from .facttool_utils.chat_api import OpenAIChat
+from .facttool_utils.search_api import GoogleSerperAPIWrapper
+
+
+@Solver.register("factool_retriever", "claims", "claims_with_evidences")
+class FactoolRetriever(StandardTaskSolver):
+    """Solver that turns each claim into search queries and gathers search snippets as evidence."""
+
+    def __init__(self, args):
+        """Create the chat client, load the "query_generation" prompt and build the search wrapper."""
+        super().__init__(args)
+        self.gpt_model = self.global_config.get("factool_gpt_model", "gpt-3.5-turbo")
+        self.snippet_cnt = args.get("snippet_cnt", 10)
+        self.gpt = OpenAIChat(self.gpt_model)
+        prompt_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "facttool_utils/prompts.yaml",
+        )
+        # Open through a context manager so the file handle is closed right after parsing.
+        with open(prompt_path, "r") as prompt_file:
+            self.query_prompt = yaml.load(prompt_file, yaml.FullLoader)["query_generation"]
+        self.search_engine = GoogleSerperAPIWrapper(snippet_cnt=self.snippet_cnt)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Store, for every claim, a list of `(merged query, snippet content)` tuples.
+
+        Returns:
+            A `(True, state)` tuple; the claim -> evidence mapping is stored under `self.output_name`.
+        """
+        claims = state.get(self.input_name)
+
+        queries = self._query_generation(claims=claims)
+        evidences = self.search_engine.run(queries)
+        results = {}
+        for query, claim, evidence in zip(queries, claims, evidences):
+            # join() also covers a single query, where str() would store the list repr
+            # "['q']". A None query list (accepted by the search wrapper) or an empty one
+            # has nothing to join and keeps its str() form.
+            merged_query = ' '.join(query) if query else str(query)
+            results[claim] = [(merged_query, x['content']) for x in evidence]
+        state.set(self.output_name, results)
+        return True, state
+
+    def _query_generation(self, claims):
+        """Send one system + user message pair per claim to the chat model."""
+        messages_list = [
+            [
+                {"role": "system", "content": self.query_prompt["system"]},
+                {
+                    "role": "user",
+                    "content": self.query_prompt["user"].format(input=claim),
+                },
+            ]
+            for claim in claims
+        ]
+        return self.gpt.run(messages_list, list)
diff --git a/src/openfactcheck/solvers/webservice/ftool_vfr.py b/src/openfactcheck/solvers/webservice/ftool_vfr.py
new file mode 100644
index 0000000000000000000000000000000000000000..e56474ade4eba4dc4e902a996d8b8f1c0083ba46
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/ftool_vfr.py
@@ -0,0 +1,59 @@
+import os
+import yaml
+
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+from .facttool_utils.chat_api import OpenAIChat
+
+@Solver.register("factool_verifier", "claims_with_evidences", "label")
+class FactoolVerifier(StandardTaskSolver):
+    """Solver that asks a chat model to judge each claim against its collected evidence."""
+
+    def __init__(self, args):
+        """Create the chat client and load the "verification" prompt from prompts.yaml."""
+        super().__init__(args)
+        self.gpt_model = self.global_config.get("factool_gpt_model", "gpt-3.5-turbo")
+        self.gpt = OpenAIChat(self.gpt_model)
+        prompt_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "facttool_utils/prompts.yaml",
+        )
+        # Open through a context manager so the file handle is closed right after parsing.
+        with open(prompt_path, "r") as prompt_file:
+            self.verification_prompt = yaml.load(prompt_file, yaml.FullLoader)["verification"]
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Verify every claim against its evidence and store the overall label.
+
+        The per-claim results are stored under "detail"; the label stored under
+        `self.output_name` is True only if every result has a truthy 'factuality'.
+
+        Returns:
+            A `(True, state)` tuple.
+        """
+        claims_with_evidences = state.get(self.input_name)
+        results = list(self._verification(claims_with_evidences))
+        for i, (claim, evidences) in enumerate(claims_with_evidences.items()):
+            if not isinstance(results[i], dict):
+                # NOTE(review): OpenAIChat.run presumably returns None for a reply it could
+                # not parse as a dict - confirm. Count such a claim as not verified instead
+                # of failing on the item assignment below.
+                results[i] = {'factuality': False}
+            results[i]['claim'] = claim
+            results[i]['evidences'] = evidences
+        state.set("detail", results)
+        label = all(v['factuality'] for v in results)
+        state.set(self.output_name, label)
+        return True, state
+
+    def _verification(self, claims_with_evidences):
+        """Build one verification request per claim, passing only the evidence texts (`e[1]`)."""
+        messages_list = [
+            [
+                {"role": "system", "content": self.verification_prompt['system']},
+                {"role": "user", "content": self.verification_prompt['user'].format(claim=claim, evidence=str(
+                    [e[1] for e in evidence]))},
+            ]
+            for claim, evidence in claims_with_evidences.items()
+        ]
+        return self.gpt.run(messages_list, dict)
diff --git a/src/openfactcheck/solvers/webservice/rarr_rtv.py b/src/openfactcheck/solvers/webservice/rarr_rtv.py
new file mode 100644
index 0000000000000000000000000000000000000000..83be49b12bb086805ea49ac17965e51368c57241
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_rtv.py
@@ -0,0 +1,58 @@
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .rarr_utils.question_generation import run_rarr_question_generation
+from .rarr_utils.functional_prompt import QGEN_PROMPT
+from .rarr_utils import search
+
+
+@Solver.register("rarr_retriever", "claims", "claims_with_evidences")
+class RARRRetriever(StandardTaskSolver):
+    """Solver that generates questions for each claim and searches evidence passages for them."""
+
+    def __init__(self, args):
+        """Read the question-generation and search settings, falling back to defaults."""
+        super().__init__(args)
+        self.model = self.global_config.get("rarr_model", "text-davinci-003")
+        # Question-generation settings.
+        self.temperature_qgen = args.get("temperature_qgen", 0.7)
+        self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
+        # Search settings, forwarded to search.run_search.
+        self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
+        self.max_sentences_per_passage = args.get("max_sentences_per_passage", 4)
+        self.sliding_distance = args.get("sliding_distance", 1)
+        self.max_passages_per_search_result = args.get("max_passages_per_search_result", 1)
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        """Store a mapping from each claim to its `(question, passage text)` evidence tuples.
+
+        Returns:
+            A `(True, state)` tuple.
+        """
+        claims_with_evidences = {
+            claim: self._collect_claim_evidences(claim) for claim in state.get(self.input_name)
+        }
+        state.set(self.output_name, claims_with_evidences)
+        return True, state
+
+    def _collect_claim_evidences(self, claim):
+        """Generate questions for one claim and return `(question, text)` for every passage found."""
+        questions = run_rarr_question_generation(
+            claim=claim,
+            context=None,
+            model=self.model,
+            prompt=QGEN_PROMPT,
+            temperature=self.temperature_qgen,
+            num_rounds=self.num_rounds_qgen,
+        )
+        search_settings = dict(
+            max_search_results_per_query=self.max_search_results_per_query,
+            max_sentences_per_passage=self.max_sentences_per_passage,
+            sliding_distance=self.sliding_distance,
+            max_passages_per_search_result_to_return=self.max_passages_per_search_result,
+        )
+        return [
+            (question, passage['text'])
+            for question in questions
+            for passage in search.run_search(query=question, **search_settings)
+        ]
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/__init__.py b/src/openfactcheck/solvers/webservice/rarr_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/agreement_gate.py b/src/openfactcheck/solvers/webservice/rarr_utils/agreement_gate.py
new file mode 100644
index 0000000000000000000000000000000000000000..a217b7cc3e19e80d44f9aad7a857e4cd07e5b4d7
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/agreement_gate.py
@@ -0,0 +1,98 @@
+"""Utils for running the agreement gate."""
+import os
+import time
+from typing import Any, Dict, Tuple
+
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def parse_api_response(api_response: str) -> Tuple[bool, str, str]:
+    """Extract the agreement gate state, the reasoning and the decision from the GPT-3 API response.
+
+    The response is expected to hold the reasoning on its first line and the
+    "Therefore: ..." verdict on its second line.
+
+    Args:
+        api_response: Agreement gate response from GPT-3.
+    Returns:
+        is_open: Whether the agreement gate is open. False when the verdict line contains
+            "disagrees" or when the response has fewer than two lines.
+        reason: The reasoning line, or "Failed to parse." when the response is too short.
+        decision: The verdict text after "Therefore:", or None when the response is too short.
+    """
+    lines = api_response.strip().split("\n")
+    if len(lines) < 2:
+        return False, "Failed to parse.", None
+
+    reason, verdict = lines[0], lines[1]
+    decision = verdict.split("Therefore:")[-1].strip()
+    # NOTE(review): run_agreement_gate documents the gate as opening on a contradiction,
+    # while this reports it open when "disagrees" is absent - confirm the intended polarity.
+    is_open = "disagrees" not in verdict
+    return is_open, reason, decision
+
+
+def run_agreement_gate(
+    claim: str,
+    query: str,
+    evidence: str,
+    model: str,
+    prompt: str,
+    context: str = None,
+    num_retries: int = 5,
+) -> Dict[str, Any]:
+    """Checks if a provided evidence contradicts the claim given a query.
+
+    Checks if the answer to a query using the claim contradicts the answer using the
+    evidence. If so, we open the agreement gate, which means that we allow the editor
+    to edit the claim. Otherwise the agreement gate is closed.
+
+    Args:
+        claim: Text to check the validity of.
+        query: Query to guide the validity check.
+        evidence: Evidence to judge the validity of the claim against.
+        model: Name of the OpenAI GPT-3 model to use.
+        prompt: The prompt template to query GPT-3 with.
+        context: Optional context; when given, the prompt is also formatted with `context`.
+        num_retries: Number of times to retry OpenAI call in the event of an API failure.
+    Returns:
+        gate: A dictionary with the status of the gate and reasoning for decision.
+    Raises:
+        RuntimeError: If no OpenAI call succeeded within `num_retries` attempts.
+    """
+    if context:
+        gpt3_input = prompt.format(
+            context=context, claim=claim, query=query, evidence=evidence
+        ).strip()
+    else:
+        gpt3_input = prompt.format(claim=claim, query=query, evidence=evidence).strip()
+
+    response = None
+    last_error = None
+    for _ in range(num_retries):
+        try:
+            response = openai.completions.create(
+                model=model,
+                prompt=gpt3_input,
+                temperature=0.0,
+                max_tokens=256,
+                stop=["\n\n"],
+                logit_bias={"50256": -100},  # Don't allow <|endoftext|> to be generated
+            )
+            break
+        except openai.OpenAIError as exception:
+            last_error = exception
+            print(f"{exception}. Retrying...")
+            time.sleep(2)
+
+    # Every attempt failed (or none was made): report that instead of reading an unset response.
+    if response is None:
+        raise RuntimeError(
+            f"OpenAI completion failed after {num_retries} attempts."
+        ) from last_error
+
+    is_open, reason, decision = parse_api_response(response.choices[0].text)
+    gate = {"is_open": is_open, "reason": reason, "decision": decision}
+    return gate
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/editor.py b/src/openfactcheck/solvers/webservice/rarr_utils/editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf8f972e2392b35ad11fe537a8a59230876e85bd
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/editor.py
@@ -0,0 +1,91 @@
+"""Utils for running the editor."""
+import os
+import time
+from typing import Dict, Union
+
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def parse_api_response(api_response: str) -> Union[str, None]:
+    """Extract the edited claim from the GPT-3 API response.
+
+    Our prompt returns a reason for the edit and the edit in two consecutive lines.
+    Only extract out the edit from the second line.
+
+    Args:
+        api_response: Editor response from GPT-3.
+    Returns:
+        edited_claim: The edited claim, or None (after printing "Editor error.") when the
+            response has fewer than two lines.
+    """
+    lines = api_response.strip().split("\n")
+    if len(lines) < 2:
+        print("Editor error.")
+        return None
+    edited_claim = lines[1].split("My fix:")[-1].strip()
+    return edited_claim
+
+
+def run_rarr_editor(
+    claim: str,
+    query: str,
+    evidence: str,
+    model: str,
+    prompt: str,
+    context: str = None,
+    num_retries: int = 5,
+) -> Dict[str, str]:
+    """Runs a GPT-3 editor on the claim given a query and evidence to support the edit.
+
+    Args:
+        claim: Text to edit.
+        query: Query to guide the editing.
+        evidence: Evidence to base the edit on.
+        model: Name of the OpenAI GPT-3 model to use.
+        prompt: The prompt template to query GPT-3 with.
+        context: Optional context; when given, the prompt is also formatted with `context`.
+        num_retries: Number of times to retry OpenAI call in the event of an API failure.
+    Returns:
+        output: A dictionary whose "text" entry is the edited claim, or the unchanged claim
+            when no edit could be parsed from the response.
+    Raises:
+        RuntimeError: If no OpenAI call succeeded within `num_retries` attempts.
+    """
+    if context:
+        gpt3_input = prompt.format(
+            context=context, claim=claim, query=query, evidence=evidence
+        ).strip()
+    else:
+        gpt3_input = prompt.format(claim=claim, query=query, evidence=evidence).strip()
+
+    response = None
+    last_error = None
+    for _ in range(num_retries):
+        try:
+            response = openai.completions.create(
+                model=model,
+                prompt=gpt3_input,
+                temperature=0.0,
+                max_tokens=512,
+                stop=["\n\n"],
+            )
+            break
+        except openai.OpenAIError as exception:
+            last_error = exception
+            print(f"{exception}. Retrying...")
+            time.sleep(2)
+
+    # Every attempt failed (or none was made): report that instead of reading an unset response.
+    if response is None:
+        raise RuntimeError(
+            f"OpenAI completion failed after {num_retries} attempts."
+        ) from last_error
+
+    edited_claim = parse_api_response(response.choices[0].text)
+    # If there was an error in GPT-3 generation, return the claim.
+    if not edited_claim:
+        edited_claim = claim
+    output = {"text": edited_claim}
+    return output
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/evidence_selection.py b/src/openfactcheck/solvers/webservice/rarr_utils/evidence_selection.py
new file mode 100644
index 0000000000000000000000000000000000000000..92138b6a88cb49b3214e5e0b9ff9dee57004c40a
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/evidence_selection.py
@@ -0,0 +1,94 @@
+import itertools
+from typing import Any, Dict, List
+
+import torch
+from sentence_transformers import CrossEncoder
+
+# Cross-encoder that scores (question, evidence) pairs. It is built once at import time
+# and placed on CUDA when available, otherwise on the CPU.
+PASSAGE_RANKER = CrossEncoder(
+    "cross-encoder/ms-marco-MiniLM-L-6-v2",
+    max_length=512,
+    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+)
+
+
+def compute_score_matrix(
+    questions: List[str], evidences: List[str]
+) -> List[List[float]]:
+    """Scores the relevance of all evidence against all questions using a CrossEncoder.
+
+    Args:
+        questions: A list of unique questions.
+        evidences: A list of unique evidences.
+    Returns:
+        score_matrix: A 2D list of question X evidence relevance scores (one row per question).
+    """
+    score_matrix = []
+    for q in questions:
+        evidence_scores = PASSAGE_RANKER.predict([(q, e) for e in evidences]).tolist()
+        score_matrix.append(evidence_scores)
+    return score_matrix
+
+
+def question_coverage_objective_fn(
+    score_matrix: List[List[float]], evidence_indices: List[int]
+) -> float:
+    """Given (query, evidence) scores and a subset of evidence, return the coverage.
+
+    Given all pairwise query and evidence scores, and a subset of the evidence
+    specified by indices, return a value indicating how well this subset of evidence
+    covers (i.e., helps answer) all questions.
+
+    Args:
+        score_matrix: A 2D list of question X evidence relevance scores.
+        evidence_indices: A subset of the evidence to get the coverage score of. Must not
+            be empty when `score_matrix` has rows, because max() of nothing raises ValueError.
+    Returns:
+        total: The coverage we would get by using the subset of evidence in
+            `evidence_indices` over all questions.
+    """
+    # Compute sum_{question q} max_{selected evidence e} score(q, e).
+    # This encourages all questions to be explained by at least one evidence.
+    total = 0.0
+    for scores_for_question in score_matrix:
+        total += max(scores_for_question[j] for j in evidence_indices)
+    return total
+
+
+def select_evidences(
+    example: Dict[str, Any], max_selected: int = 5, prefer_fewer: bool = False
+) -> List[Dict[str, Any]]:
+    """Selects the set of evidence that maximizes information coverage over the claim.
+
+    Args:
+        example: The result of running the editing pipeline on one claim.
+        max_selected: Maximum number of evidences to select. Zero or less selects nothing.
+        prefer_fewer: If True and the maximum objective value can be achieved by
+            fewer evidences than `max_selected`, prefer selecting fewer evidences.
+    Returns:
+        selected_evidences: Selected evidences that serve as the attribution report.
+    """
+    questions = sorted(set(example["questions"]))
+    evidences = sorted(set(e["text"] for e in example["revisions"][0]["evidences"]))
+    num_evidences = len(evidences)
+    # Nothing to choose from, or nothing may be chosen. A non-positive `max_selected` would
+    # otherwise reach max() over an empty selection or a negative combination size.
+    if not num_evidences or max_selected <= 0:
+        return []
+
+    score_matrix = compute_score_matrix(questions, evidences)
+
+    best_combo = tuple()
+    best_objective_value = float("-inf")
+    max_selected = min(max_selected, num_evidences)
+    min_selected = 1 if prefer_fewer else max_selected
+    for num_selected in range(min_selected, max_selected + 1):
+        for combo in itertools.combinations(range(num_evidences), num_selected):
+            objective_value = question_coverage_objective_fn(score_matrix, combo)
+            if objective_value > best_objective_value:
+                best_combo = combo
+                best_objective_value = objective_value
+
+    selected_evidences = [{"text": evidences[idx]} for idx in best_combo]
+    return selected_evidences
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/functional_prompt.py b/src/openfactcheck/solvers/webservice/rarr_utils/functional_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a9b05ef90d032e8c8d9275256d92e03d48eaa6a
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/functional_prompt.py
@@ -0,0 +1,323 @@
+"""All prompts used for RARR prompting."""
+
+QGEN_PROMPT = """I will check things you said and ask questions.
+
+You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+To verify it,
+1. I googled: Does your nose switch between nostrils?
+2. I googled: How often does your nostrils switch?
+3. I googled: Why does your nostril switch?
+4. I googled: What is nasal cycle?
+
+You said: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+To verify it,
+1. I googled: Where was Stanford Prison Experiment was conducted?
+
+You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+To verify it,
+1. I googled: What does Havel-Hakimi algorithm do?
+2. I googled: Who are Havel-Hakimi algorithm named after?
+
+You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+To verify it,
+1. I googled: Who sings the song "Time of My Life"?
+2. I googled: Which film is the song "Time of My Life" from?
+3. I googled: Who produced the song "Time of My Life"?
+
+You said: Kelvin Hopins was suspended from the Labor Party due to his membership in the Conservative Party.
+To verify it,
+1. I googled: Why was Kelvin Hopins suspended from Labor Party?
+
+You said: Social work is a profession that is based in the philosophical tradition of humanism. It is an intellectual discipline that has its roots in the 1800s.
+To verify it,
+1. I googled: What philosophical tradition is social work based on?
+2. I googled: What year does social work have its root in?
+
+You said: {claim}
+To verify it,
+""".strip()  # few-shot question-generation prompt; its only format field is {claim}
+
+CONTEXTUAL_QGEN_PROMPT = """I will check things you said and ask questions.
+
+Context: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes.
+You said: This is to prevent a buildup of mucus. It's called the nasal cycle.
+To verify what you just said,
+1. I googled: Why does your nostril switch during sleep?
+2. I googled: What is nasal cycle?
+3. I googled: What is the nostril switching during sleep called?
+
+Context: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford's psychology building.
+You said: It is a psychological study to observe the behaviors of conflict and violence that happen between inmates and prisoners in real prisons.
+To verify what you just said,
+1. I googled: What type of experiment was the Stanford Prison Experiment?
+2. I googled: What was the objective of the Stanford Prison Experiment?
+
+Context: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list.
+You said: It is named after Václav Havel and Samih Hakimi.
+To verify what you just said,
+1. I googled: Who are Havel-Hakimi algorithm named after?
+
+Context: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing.
+You said: The song was produced by Michael Lloyd in the same year.
+To verify what you just said,
+1. I googled: Who produced the song "Time of My Life"?
+2. I googled: When was the song "Time of My Life" by Bill Medley produced?
+
+Context: The Late Show with Stephen Colbert is an American late-night talk show hosted by Stephen Colbert, which premiered on September 8, 2015.
+You said: Produced by Spartina Productions and CBS Television Studios, it is the second iteration of CBS' Late Show franchise.
+To verify what you just said,
+1. I googled: Who produces "The Late Show with Stephen Colbert"?
+2. I googled: What are the iterations of CBS' Late Show franchise?
+
+Context: Super Mario Sunshine was released on GameCube in 2002. In the game, Mario uses a tool strapped to his back called FLUDD, which stands for The Flash Liquidizer Ultra Dousing Device.
+You said: It can be used to spray water at objects or enemies. This allows Mario to change his movements, kill enemies, or clean up hazards on the floor.
+To verify what you just said,
+1. I googled: What is the main function of FLUDD in Super Mario Sunshine?
+2. I googled: What can FLUDD in Super Mario Sunshine be used on?
+
+Context: {context}
+You said: {claim}
+To verify what you just said,
+""".strip()  # few-shot question-generation prompt with format fields {context} and {claim}
+
+AGREEMENT_GATE_PROMPT = """I will check some things you said.
+
+1. You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+2. I checked: How often do your nostrils switch?
+3. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+4. Reasoning: The article said the nose’s switching time is about every 2 hours, and you said the nose's switching time is about every 45 minutes.
+5. Therefore: This disagrees with what you said.
+
+1. You said: The Little House books were written by Laura Ingalls Wilder. The books were published by HarperCollins.
+2. I checked: Who published the Little House books?
+3. I found this article: These are the books that started it all -- the stories that captured the hearts and imaginations of children and young adults worldwide. Written by Laura Ingalls Wilder and published by HarperCollins, these beloved books remain a favorite to this day.
+4. Reasoning: The article said the Little House books were published by HarperCollins and you said the books were published by HarperCollins.
+5. Therefore: This agrees with what you said.
+
+1. You said: Real Chance of Love was an American reality TV show. Season 2 of the show was won by Cali, who chose to be with Chance.
+2. I checked: Who won season 2 of Real Chance of Love?
+3. I found this article: Real Chance of Love 2: Back in the Saddle is the second season of the VH1 reality television dating series Real Chance of Love. Ahmad Givens (Real) and Kamal Givens (Chance), former contestants on I Love New York are the central figures.
+4. Reasoning: The article doesn't answer the question and you said that Cali won season 2 of Real Chance of Love.
+5. Therefore: This is irrelevant to what you said.
+
+1. You said: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+2. I checked: Where was Stanford Prison Experiment conducted?
+3. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+4. Reasoning: The article said the Stanford Prison Experiment was conducted in Jordan Hall and you said the Stanford Prison Experiment was conducted in Jordan Hall.
+5. Therefore: This agrees with what you said.
+
+1. You said: Social work is a profession that is based in the philosophical tradition of humanism. It is an intellectual discipline that has its roots in the 1800s.
+2. I checked: When did social work have its roots?
+3. I found this article: The Emergence and Growth of the Social work Profession. Social work’s roots were planted in the 1880s, when charity organization societies (COS) were created to organize municipal voluntary relief associations and settlement houses were established.
+4. Reasoning: The article said social work has its roots planted in the 1880s and you said social work has its root in the 1800s.
+5. Therefore: This disagrees with what you said.
+
+1. You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+2. I checked: What is the Havel-Hakimi algorithm?
+3. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+4. Reasoning: The article said the Havel-Hakimi algorithm is for constructing a special solution if a simple graph for the given degree sequence exists and you said the Havel-Hakimi algorithm is for converting the adjacency matrix of a graph.
+5. Therefore: This disagrees with what you said.
+
+1. You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+2. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+3. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+4. Reasoning: The article said that a demo was produced by Michael Lloyd and you said "Time of My Life" was produced by Michael Lloyd.
+5. Therefore: This agrees with what you said.
+
+1. You said: Tiger Woods is the only player who has won the most green jackets. He has won four times. The Green Jacket is one of the most coveted prizes in all of golf.
+2. I checked: What is the Green Jacket in golf?
+3. I found this article: The green jacket is a classic, three-button, single-breasted and single-vent, featuring the Augusta National Golf Club logo on the left chest pocket. The logo also appears on the brass buttons.
+4. Reasoning: The article said the Green Jacket is a classic three-button single-breasted and single-vent and you said the Green Jacket is one of the most coveted prizes in all of golf.
+5. Therefore: This is irrelevant to what you said.
+
+1. You said: Kelvin Hopins was suspended from the Labor Party because he had allegedly sexually harassed and behaved inappropriately towards a Labour Party activist, Ava Etemadzadeh.
+2. I checked: Why was Kelvin Hopins suspeneded from the Labor Party?
+3. I found this article: A former Labour MP has left the party before an inquiry into sexual harassment allegations against him was able to be concluded, the party has confirmed. Kelvin Hopkins was accused in 2017 of inappropriate physical contact and was suspended by the Labour party pending an investigation.
+4. Reasoning: The article said Kelvin Hopins was suspended because of inappropriate physical contact and you said that Kelvin Hopins was suspended because he allegedly sexually harassed Ava Etemadzadeh.
+5. Therefore: This agrees with what you said.
+
+1. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Smith.
+2. I checked: Who led the British side in the battle of Lexington and Concord?
+3. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+4. Reasoning: The article said the British side was led by Lieutenant Colonel Francis Smith and you said the British side was led by General Thomas Smith.
+5. Therefore: This disagrees with what you said.
+
+1. You said: {claim}
+2. I checked: {query}
+3. I found this article: {evidence}
+4. Reasoning:
+""".strip()  # few-shot agreement-gate prompt with format fields {claim}, {query} and {evidence}
+
+CONTEXTUAL_AGREEMENT_GATE_PROMPT = """I will check some things you said.
+
+1. Context: Your nose switches back and forth between nostrils. It's called the nasal cycle. This is to prevent a buildup of mucus.
+2. You said: When you sleep, you switch about every 45 minutes.
+3. I checked: How often do your nostrils switch?
+4. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+5. Reasoning: The article said the nose’s switching time is about every 2 hours, and you said the nose's switching time is about every 45 minutes.
+6. Therefore: This disagrees with what you said.
+
+1. Context: The Little House books is a series of American children's novels.
+2. You said: The books were published by HarperCollins.
+3. I checked: Who published the Little House books?
+4. I found this article: These are the books that started it all -- the stories that captured the hearts and imaginations of children and young adults orldwide. Written by Laura Ingalls Wilder and published by HarperCollins, these beloved books remain a favorite to this day.
+5. Reasoning: The article said the Little House books were published by HarperCollins and you said the books were published by HarperCollins.
+6. Therefore: This agrees with what you said.
+
+1. Context: Real Chance of Love was an American reality TV show.
+2. You said: Season 2 of the show was won by Cali, who chose to be with Chance.
+3. I checked: Who won season 2 of Real Chance of Love?
+4. I found this article: Real Chance of Love 2: Back in the Saddle is the second season of the VH1 reality television dating series Real Chance of Love. Ahmad Givens (Real) and Kamal Givens (Chance), former contestants on I Love New York are the central figures.
+5. Reasoning: The article doesn't answer the question and you said that Cali won season 2 of Real Chance of Love.
+6. Therefore: This is irrelevant to what you said.
+
+1. Context: The Stanford Prison Experiment is a psychological study to observe the behaviors of conflict and violence that happen between inmates and prisoners in real prisons.
+2. You said: It was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+3. I checked: Where was Stanford Prison Experiment conducted?
+4. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+5. Reasoning: The article said the Stanford Prison Experiment was conducted in Jordan Hall and you said the Stanford Prison Experiment was conducted in Jordan Hall.
+6. Therefore: This agrees with what you said.
+
+1. Context: Social work is a profession that is based in the philosophical tradition of humanism.
+2. You said: It is an intellectual discipline that has its roots in the 1800s.
+3. I checked: When did social work have its roots?
+4. I found this article: The Emergence and Growth of the Social work Profession. Social work’s roots were planted in the 1880s, when charity organization societies (COS) were created to organize municipal voluntary relief associations and settlement houses were established.
+5. Reasoning: The article said social work has its roots planted in the 1880s and you said social work has its root in the 1800s.
+6. Therefore: This disagrees with what you said.
+
+1. Context: The Havel-Hakimi algorithm is named after Václav Havel and Samih Hakimi.
+2. You said: It is an algorithm for converting the adjacency matrix of a graph into its adjacency list.
+3. I checked: What is the Havel-Hakimi algorithm?
+4. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+5. Reasoning: The article said the Havel-Hakimi algorithm is for constructing a special solution if a simple graph for the given degree sequence exists and you said the Havel-Hakimi algorithm is for converting the adjacency matrix of a graph.
+6. Therefore: This disagrees with what you said.
+
+1. Context: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing.
+2. You said: The song was produced by Michael Lloyd.
+3. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+4. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+5. Reasoning: The article said that a demo was produced by Michael Lloyd and you said "Time of My Life" was produced by Michael Lloyd.
+6. Therefore: This agrees with what you said.
+
+1. Context: Tiger Woods is the only player who has won the most green jackets. He has won four times.
+2. You said: The Green Jacket is one of the most coveted prizes in all of golf.
+3. I checked: What is the Green Jacket in golf?
+4. I found this article: The green jacket is a classic, three-button, single-breasted and single-vent, featuring the Augusta National Golf Club logo on the left chest pocket. The logo also appears on the brass buttons.
+5. Reasoning: The article said the Green Jacket is a classic three-button single-breasted and single-vent and you said the Green Jacket is one of the most coveted prizes in all of golf.
+6. Therefore: This is irrelevant to what you said.
+
+1. Context: Kelvin Hopins was suspended from the Labor Party.
+2. You said: This was because he had allegedly sexually harassed and behaved inappropriately towards a Labour Party activist, Ava Etemadzadeh.
+3. I checked: Why was Kelvin Hopins suspended from the Labor Party?
+4. I found this article: A former Labour MP has left the party before an inquiry into sexual harassment allegations against him was able to be concluded, the party has confirmed. Kelvin Hopkins was accused in 2017 of inappropriate physical contact and was suspended by the Labour party pending an investigation.
+5. Reasoning: The article said Kelvin Hopins was suspended because of inappropriate physical contact and you said that Kelvin Hopins was suspended because he allegedly sexually harassed Ava Etemadzadeh.
+6. Therefore: This agrees with what you said.
+
+1. Context: The Battles of Lexington and Concord, fought on April 19, 1775, kicked off the American Revolutionary War (1775-83).
+2. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Smith.
+3. I checked: Who led the British side in the battle of Lexington and Concord?
+4. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+5. Reasoning: The article said the British side was led by Lieutenant Colonel Francis Smith and you said the British side was led by General Thomas Smith.
+6. Therefore: This disagrees with what you said.
+
+1. Context: {context}
+2. You said: {claim}
+3. I checked: {query}
+4. I found this article: {evidence}
+5. Reasoning:
+""".strip()
+
+EDITOR_PROMPT = """I will fix some things you said.
+
+1. You said: Your nose switches back and forth between nostrils. When you sleep, you switch about every 45 minutes. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+2. I checked: How often do your nostrils switch?
+3. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+4. This suggests 45 minutes switch time in your statement is wrong.
+5. My fix: Your nose switches back and forth between nostrils. When you sleep, you switch about every 2 hours. This is to prevent a buildup of mucus. It’s called the nasal cycle.
+
+1. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+2. I checked: Who led the British side in the battle of Lexington and Concord?
+3. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+4. This suggests General Thomas Hall in your statement is wrong.
+5. My fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+1. You said: The Stanford Prison Experiment was conducted in the basement of Encina Hall, Stanford’s psychology building.
+2. I checked: Where was Stanford Prison Experiment conducted?
+3. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+4. This suggests Encina Hall in your statement is wrong.
+5. My fix: The Stanford Prison Experiment was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+1. You said: The Havel-Hakimi algorithm is an algorithm for converting the adjacency matrix of a graph into its adjacency list. It is named after Vaclav Havel and Samih Hakimi.
+2. I checked: What is the Havel-Hakimi algorithm?
+3. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+4. This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+5. My fix: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. It is named after Vaclav Havel and Samih Hakimi.
+
+1. You said: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Phil Ramone.
+2. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+3. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+4. This suggests "Time of My Life" producer name in your statement is wrong.
+5. My fix: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing. The song was produced by Michael Lloyd.
+
+1. You said: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+2. I checked: What is the area of Phoenix Market City in Pune?
+3. I found this article: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+4. This suggests the 1.4 million square feet of built-up space in your statement is wrong.
+5. My fix: Phoenix Market City Pune is located on 21 acres of prime property in Pune. It is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+1. You said: {claim}
+2. I checked: {query}
+3. I found this article: {evidence}
+4. This suggests
+""".strip()
+
+CONTEXTUAL_EDITOR_PROMPT = """I will fix some things you said.
+
+1. Context: Your nose switches back and forth between nostrils. It's called the nasal cycle. This is to prevent a buildup of mucus.
+2. You said: When you sleep, you switch about every 45 minutes.
+3. I checked: How often do your nostrils switch?
+4. I found this article: Although we don’t usually notice it, during the nasal cycle one nostril becomes congested and thus contributes less to airflow, while the other becomes decongested. On average, the congestion pattern switches about every 2 hours, according to a small 2016 study published in the journal PLOS One.
+5. This suggests 45 minutes switch time in your statement is wrong.
+6. My fix: When you sleep, you switch about every 2 hours.
+
+1. Context: The Battles of Lexington and Concord, fought on April 19, 1775, kicked off the American Revolutionary War (1775-83).
+2. You said: In the battles of Lexington and Concord, the British side was led by General Thomas Hall.
+3. I checked: Who led the British side in the battle of Lexington and Concord?
+4. I found this article: Interesting Facts about the Battles of Lexington and Concord. The British were led by Lieutenant Colonel Francis Smith. There were 700 British regulars.
+5. This suggests General Thomas Hall in your statement is wrong.
+6. My fix: In the battles of Lexington and Concord, the British side was led by Lieutenant Colonel Francis Smith.
+
+1. Context: The Stanford Prison Experiment is a psychological study to observe the behaviors of conflict and violence that happen between inmates and prisoners in real prisons.
+2. You said: It was conducted in the basement of Encina Hall, Stanford’s psychology building.
+3. I checked: Where was Stanford Prison Experiment conducted?
+4. I found this article: Carried out August 15-21, 1971 in the basement of Jordan Hall, the Stanford Prison Experiment set out to examine the psychological effects of authority and powerlessness in a prison environment.
+5. This suggests Encina Hall in your statement is wrong.
+6. My fix: It was conducted in the basement of Jordan Hall, Stanford’s psychology building.
+
+1. Context: The Havel-Hakimi algorithm is named after Václav Havel and Samih Hakimi.
+2. You said: It is an algorithm for converting the adjacency matrix of a graph into its adjacency list.
+3. I checked: What is the Havel-Hakimi algorithm?
+4. I found this article: The Havel-Hakimi algorithm constructs a special solution if a simple graph for the given degree sequence exists, or proves that one cannot find a positive answer. This construction is based on a recursive algorithm. The algorithm was published by Havel (1955), and later by Hakimi (1962).
+5. This suggests the Havel-Hakimi algorithm’s functionality in your statement is wrong.
+6. My fix: It is an algorithm for constructing a special solution if a simple graph for the given degree sequence exists, or proving that one cannot find a positive answer.
+
+1. Context: "Time of My Life" is a song by American singer-songwriter Bill Medley from the soundtrack of the 1987 film Dirty Dancing.
+2. You said: The song was produced by Phil Ramone.
+3. I checked: Who was the producer of "(I’ve Had) The Time of My Life"?
+4. I found this article: On September 8, 2010, the original demo of this song, along with a remix by producer Michael Lloyd , was released as digital files in an effort to raise money for the Patrick Swayze Pancreas Cancer Resarch Foundation at Stanford University.
+5. This suggests "Time of My Life" producer name in your statement is wrong.
+6. My fix: The song was produced by Michael Lloyd.
+
+1. Context: Phoenix Market City Pune is located on 21 acres of prime property in Pune.
+2. You said: Phoenix Market City is spread across four levels with approximately 1.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+3. I checked: What is the area of Phoenix Market City in Pune?
+4. I found this article: Phoenix Market City was opened in January 2013 and has the distinction of being the largest mall in the city of Pune, with the area of 3.4 million square feet. It is located in the Viman Nagar area of Pune.
+5. This suggests the 1.4 million square feet of built-up space in your statement is wrong.
+6. My fix: Phoenix Market City is spread across four levels with approximately 3.4 million square feet of built-up space. The mall is owned and operated by Phoenix Mills Limited.
+
+1. Context: {context}
+2. You said: {claim}
+3. I checked: {query}
+4. I found this article: {evidence}
+5. This suggests
+""".strip()
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/hallucination.py b/src/openfactcheck/solvers/webservice/rarr_utils/hallucination.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc02d2d658eab9d11a92584d1c86b1318f6ded3
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/hallucination.py
@@ -0,0 +1,44 @@
+"""Utils for generating fake evidence given a query."""
+import os
+import time
+from typing import Dict
+
+import openai
+
+# Configure the OpenAI client once, at import time, from the OPENAI_API_KEY
+# environment variable. `os.getenv` yields None when the variable is unset.
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def run_evidence_hallucination(
+    query: str,
+    model: str,
+    prompt: str,
+    num_retries: int = 5,
+) -> Dict[str, str]:
+    """Generates a fake piece of evidence via LLM given the question.
+
+    Args:
+        query: Question that the generated evidence should answer.
+        model: Name of the OpenAI completion model to use.
+        prompt: The prompt template to query the model with. It is formatted with
+            a single `query` field.
+        num_retries: Number of times to attempt the OpenAI call before giving up.
+    Returns:
+        output: Dict with the generated (potentially inaccurate) evidence under
+            "text" and the original question under "query".
+    Raises:
+        RuntimeError: If no attempt produced a response (every call failed, or
+            `num_retries` is not positive). The last API error, if any, is chained
+            as the cause.
+    """
+    gpt3_input = prompt.format(query=query).strip()
+    response = None
+    last_error = None
+    for _ in range(num_retries):
+        try:
+            response = openai.completions.create(
+                model=model,
+                prompt=gpt3_input,
+                temperature=0.0,
+                max_tokens=256,
+                stop=["\n", "\n\n"],
+            )
+            break
+        except openai.OpenAIError as exception:
+            last_error = exception
+            print(f"{exception}. Retrying...")
+            time.sleep(2)
+
+    if response is None:
+        # Without this guard the code below would fail with an opaque
+        # UnboundLocalError once all retries are exhausted.
+        raise RuntimeError(
+            f"Evidence hallucination failed after {num_retries} attempt(s)."
+        ) from last_error
+
+    hallucinated_evidence = response.choices[0].text.strip()
+    output = {"text": hallucinated_evidence, "query": query}
+    return output
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/hallucination_prompts.py b/src/openfactcheck/solvers/webservice/rarr_utils/hallucination_prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..49acfdc91ebcb3aa268cbcf1fddf0b44113c630d
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/hallucination_prompts.py
@@ -0,0 +1,13 @@
+"""Prompts for generating hallucinations."""
+
+# Few-shot completion prompt: two worked Question/Text pairs followed by an open
+# slot. `{query}` is the only placeholder and is meant to be filled via
+# `str.format`. The trailing `.strip()` removes the final newline so the prompt
+# ends right after "Text:".
+# NOTE(review): presumably passed as the `prompt` argument of
+# `run_evidence_hallucination` -- confirm against the caller.
+EVIDENCE_HALLUCINATION = """Generate a paragraph that answers the question.
+
+Question: What is New York-Style pizza?
+Text: New York-style pizza has slices that are large and wide with a thin crust that is foldable yet crispy. It is traditionally topped with tomato sauce and mozzarella cheese, with any extra toppings placed on top of the cheese.
+
+Question: When did the first McDonald's open?
+Text: The McDonald's brothers opened their first McDonald's restaurant in 1940 in San Bernardino, California. Originally, a carhop drive-in system was used to serve customers. The initial menu items were centered around barbecue and the first name the brothers called their business was "McDonald's Famous Barbecue."
+
+Question: {query}
+Text:
+""".strip()
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/question_generation.py b/src/openfactcheck/solvers/webservice/rarr_utils/question_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..3810f97f0e04062e51552e3324f41582ea3ebe1a
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/question_generation.py
@@ -0,0 +1,82 @@
+"""Utils for running question generation."""
+import os
+import time
+from typing import List
+
+import openai
+
+# Configure the OpenAI client once, at import time, from the OPENAI_API_KEY
+# environment variable. `os.getenv` yields None when the variable is unset.
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+def parse_api_response(api_response: str) -> List[str]:
+    """Extract questions from the GPT-3 API response.
+
+    Our prompt returns questions as a string with the format of an ordered list.
+    This function parses this response into a list of questions.
+
+    Args:
+        api_response: Question generation response from GPT-3.
+    Returns:
+        questions: A list of non-empty questions, in the order they appear.
+    """
+    search_string = "I googled:"
+    questions = []
+    for line in api_response.split("\n"):
+        # Only lines carrying the marker hold a question.
+        if search_string not in line:
+            continue
+        # Keep the text that follows the (first) marker on the line.
+        question = line.split(search_string)[1].strip()
+        # A marker with nothing after it (e.g. a truncated completion) would
+        # otherwise yield an empty search query downstream.
+        if question:
+            questions.append(question)
+
+    return questions
+
+
+def run_rarr_question_generation(
+    claim: str,
+    model: str,
+    prompt: str,
+    temperature: float,
+    num_rounds: int,
+    context: str = None,
+    num_retries: int = 5,
+) -> List[str]:
+    """Generates questions that interrogate the information in a claim.
+
+    The prompt is filled in once and then sampled `num_rounds` times; the
+    questions parsed from every successful round are pooled and de-duplicated.
+
+    Args:
+        claim: Text to generate questions off of.
+        model: Name of the OpenAI completion model to use.
+        prompt: The prompt template to query the model with. It is formatted with
+            `claim`, plus `context` when a truthy context is given.
+        temperature: Temperature to use for sampling questions. 0 represents
+            greedy decoding.
+        num_rounds: Number of times to sample questions.
+        context: Optional surrounding text for the claim.
+        num_retries: Maximum number of API attempts per round. A round whose
+            attempts all fail contributes no questions.
+    Returns:
+        questions: The unique questions, sorted.
+    """
+    template_fields = {"claim": claim}
+    if context:
+        template_fields = {"context": context, "claim": claim}
+    gpt3_input = prompt.format(**template_fields).strip()
+
+    unique_questions = set()
+    for _round in range(num_rounds):
+        for _attempt in range(num_retries):
+            try:
+                response = openai.completions.create(
+                    model=model,
+                    prompt=gpt3_input,
+                    temperature=temperature,
+                    max_tokens=256,
+                )
+            except openai.OpenAIError as exception:
+                print(f"{exception}. Retrying...")
+                time.sleep(1)
+                continue
+            # First successful call ends this round.
+            completion_text = response.choices[0].text.strip()
+            unique_questions.update(parse_api_response(completion_text))
+            break
+
+    return sorted(unique_questions)
diff --git a/src/openfactcheck/solvers/webservice/rarr_utils/search.py b/src/openfactcheck/solvers/webservice/rarr_utils/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..8351d03789bbbf1c626af22986d5603e9993a673
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_utils/search.py
@@ -0,0 +1,219 @@
+"""Utils for searching a query and returning top passages from search results."""
+import concurrent.futures
+import itertools
+import os
+import random
+from typing import Any, Dict, List, Tuple
+
+import bs4
+import requests
+import spacy
+import torch
+from sentence_transformers import CrossEncoder
+
+# Cross-encoder used by `run_search` to score (query, passage) pairs. It is
+# constructed at import time and pinned to the CPU.
+PASSAGE_RANKER = CrossEncoder(
+ "cross-encoder/ms-marco-MiniLM-L-6-v2",
+ max_length=512,
+ device="cpu",
+)
+# Endpoint queried by `search_bing`.
+SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search/"
+# NOTE(review): `search_bing` re-reads AZURE_SEARCH_KEY from the environment on
+# every call instead of using this constant -- confirm whether it is still needed.
+SUBSCRIPTION_KEY = os.getenv("AZURE_SEARCH_KEY")
+# spaCy pipeline used by `chunk_text` to split text into sentences. The parser is
+# not in the disable list, presumably so that `doc.sents` stays available -- confirm.
+TOKENIZER = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer"])
+
+
+def chunk_text(
+    text: str,
+    sentences_per_passage: int,
+    filter_sentence_len: int,
+    sliding_distance: int = None,
+) -> List[str]:
+    """Splits text into passages of consecutive sentences using a sliding window.
+
+    Args:
+        text: Text to chunk into passages. Only the first 500k characters are
+            tokenized.
+        sentences_per_passage: Number of sentences for each passage (window size).
+        filter_sentence_len: Sentences longer than this many characters are dropped
+            before windowing.
+        sliding_distance: Step between window starts; a smaller step makes passages
+            overlap. When falsy or larger than the window size, the window size
+            itself is used.
+    Returns:
+        passages: Chunked passages from the text; empty if tokenization hit a
+            Unicode error.
+    """
+    step = sliding_distance
+    if not step or step > sentences_per_passage:
+        step = sentences_per_passage
+    assert sentences_per_passage > 0 and step > 0
+
+    try:
+        # Take 500k chars to not break tokenization.
+        doc = TOKENIZER(text[:500000])
+        # Long sents are usually metadata.
+        kept_sentences = [
+            sent.text for sent in doc.sents if not len(sent.text) > filter_sentence_len
+        ]
+    except UnicodeEncodeError:  # Sometimes run into Unicode error when tokenizing.
+        print("Unicode error when using Spacy. Skipping text.")
+        return []
+
+    return [
+        " ".join(kept_sentences[start : start + sentences_per_passage])
+        for start in range(0, len(kept_sentences), step)
+    ]
+
+
+def is_tag_visible(element: bs4.element.PageElement) -> bool:
+    """Determines if an HTML element is visible.
+
+    Args:
+        element: A BeautifulSoup element to check the visibility of.
+    Returns:
+        Whether the element is visible: False for HTML comments and for nodes
+        whose parent is a non-rendered tag, True otherwise.
+    """
+    # Tags whose text content is never rendered on the page.
+    if element.parent.name in [
+        "style",
+        "script",
+        "head",
+        "title",
+        "meta",
+        "[document]",
+    ] or isinstance(element, bs4.element.Comment):
+        return False
+    return True
+
+
+def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+    """Scrapes a URL for all text information.
+
+    Args:
+        url: URL of webpage to scrape.
+        timeout: Timeout of the requests call.
+    Returns:
+        web_text: The visible text of the scraped URL with whitespace collapsed,
+            or None if the page could not be fetched or parsed.
+        url: URL input.
+    """
+    # Scrape the URL
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException:
+        return None, url
+
+    # Extract out all text from the tags
+    try:
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+        texts = soup.find_all(string=True)
+        # Filter out invisible text from the page. This is materialized here on
+        # purpose: a lazy `filter` would only run `is_tag_visible` during the join
+        # below, outside this try, so a failure on a single node would escape.
+        visible_text = [t for t in texts if is_tag_visible(t)]
+    except Exception:  # Best effort: any parsing failure means "nothing scraped".
+        return None, url
+
+    # Returns all the text concatenated as a string.
+    web_text = " ".join(t.strip() for t in visible_text).strip()
+    # Clean up spacing.
+    web_text = " ".join(web_text.split())
+    return web_text, url
+
+
+def search_bing(query: str, timeout: float = 3) -> List[str]:
+    """Searches the query using Bing.
+
+    Args:
+        query: Search query.
+        timeout: Timeout of the requests call.
+    Returns:
+        search_results: A list of the top URLs relevant to the query. Empty when
+            the response contains no web-page results.
+    Raises:
+        requests.exceptions.RequestException: If the request fails or the API
+            answers with an error status.
+    """
+    headers = {"Ocp-Apim-Subscription-Key": os.getenv("AZURE_SEARCH_KEY")}
+    params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
+    response = requests.get(SEARCH_URL, headers=headers, params=params, timeout=timeout)
+    response.raise_for_status()
+
+    payload = response.json()
+    # The "webPages" answer is absent when Bing has nothing to return for the
+    # query; treat that as "no results" instead of failing with a KeyError.
+    web_pages = payload.get("webPages") or {}
+    search_results = [r["url"] for r in web_pages.get("value", [])]
+    return search_results
+
+
+def run_search(
+    query: str,
+    cached_search_results: List[str] = None,
+    max_search_results_per_query: int = 3,
+    max_sentences_per_passage: int = 5,
+    sliding_distance: int = 1,
+    max_passages_per_search_result_to_return: int = 1,
+    timeout: float = 3,
+    randomize_num_sentences: bool = False,
+    filter_sentence_len: int = 250,
+    max_passages_per_search_result_to_score: int = 30,
+) -> List[Dict[str, Any]]:
+    """Searches the query on a search engine and returns the most relevant passages.
+
+    Args:
+        query: Search query.
+        cached_search_results: URLs to use instead of calling the search engine.
+            The search engine is only queried when this is None.
+        max_search_results_per_query: Maximum number of successfully scraped
+            search results to extract passages from.
+        max_sentences_per_passage: Maximum number of sentences for each passage.
+        sliding_distance: Sliding distance over the sentences of each search result.
+            Used to extract passages.
+        max_passages_per_search_result_to_return: Maximum number of passages to
+            return for each search result.
+        timeout: Timeout used for the search request and for each page scrape.
+        randomize_num_sentences: If set, the passage size for each search result is
+            drawn uniformly from 1 to `max_sentences_per_passage`.
+        filter_sentence_len: Maximum length of a sentence before being filtered.
+        max_passages_per_search_result_to_score: Maximum number of passages to
+            score for each search result.
+    Returns:
+        retrieved_passages: Top retrieved passages for the search query, sorted by
+            descending cross-encoder score. Each entry also carries a "score" that
+            is the softmax of the cross-encoder scores over all returned passages.
+    """
+    search_results = (
+        search_bing(query, timeout=timeout)
+        if cached_search_results is None
+        else cached_search_results
+    )
+
+    # Scrape search results in parallel
+    with concurrent.futures.ThreadPoolExecutor() as pool:
+        scrape_outcomes = pool.map(scrape_url, search_results, itertools.repeat(timeout))
+    # Drop pages that yielded no text as well as PDF links.
+    usable_pages = [
+        outcome for outcome in scrape_outcomes if outcome[0] and ".pdf" not in outcome[1]
+    ]
+
+    # Walk the scraped pages and keep the best passages of each one.
+    retrieved_passages = []
+    for webtext, url in usable_pages[:max_search_results_per_query]:
+        sents_per_passage = (
+            random.randint(1, max_sentences_per_passage)
+            if randomize_num_sentences
+            else max_sentences_per_passage
+        )
+
+        # Chunk the page text and cap how many passages get scored.
+        candidates = chunk_text(
+            text=webtext,
+            sentences_per_passage=sents_per_passage,
+            filter_sentence_len=filter_sentence_len,
+            sliding_distance=sliding_distance,
+        )[:max_passages_per_search_result_to_score]
+        if not candidates:
+            continue
+
+        # Score the passages by relevance to the query using a cross-encoder.
+        relevance = PASSAGE_RANKER.predict(
+            [(query, candidate) for candidate in candidates]
+        ).tolist()
+        ranked = sorted(
+            zip(candidates, relevance), key=lambda pair: pair[1], reverse=True
+        )
+
+        # Keep only the top-scoring passages of this page.
+        retrieved_passages.extend(
+            {
+                "text": passage,
+                "url": url,
+                "query": query,
+                "sents_per_passage": sents_per_passage,
+                "retrieval_score": score,  # Cross-encoder score as retr score
+            }
+            for passage, score in ranked[:max_passages_per_search_result_to_return]
+        )
+
+    if not retrieved_passages:
+        return retrieved_passages
+
+    # Order all retrieved passages by the retrieval score.
+    retrieved_passages = sorted(
+        retrieved_passages, key=lambda entry: entry["retrieval_score"], reverse=True
+    )
+
+    # Normalize the retrieval scores into probabilities
+    raw_scores = [entry["retrieval_score"] for entry in retrieved_passages]
+    probabilities = torch.nn.functional.softmax(
+        torch.Tensor(raw_scores), dim=-1
+    ).tolist()
+    for probability, entry in zip(probabilities, retrieved_passages):
+        entry["score"] = probability
+
+    return retrieved_passages
diff --git a/src/openfactcheck/solvers/webservice/rarr_vfr.py b/src/openfactcheck/solvers/webservice/rarr_vfr.py
new file mode 100644
index 0000000000000000000000000000000000000000..79d89ff8cd0e09cf952d09da33d312b451b396bb
--- /dev/null
+++ b/src/openfactcheck/solvers/webservice/rarr_vfr.py
@@ -0,0 +1,39 @@
+from openfactcheck.core.state import FactCheckerState
+from openfactcheck.core.solver import StandardTaskSolver, Solver
+
+from .rarr_utils.agreement_gate import run_agreement_gate
+from .rarr_utils.functional_prompt import AGREEMENT_GATE_PROMPT
+
+
+@Solver.register("rarr_verifier", "claims_with_evidences", "label")
+class RARRAgreementGate(StandardTaskSolver):
+    """Solver that runs the RARR agreement gate over each claim's evidence.
+
+    Reads a mapping of claim -> sequence of (query, evidence) pairs from the state,
+    runs `run_agreement_gate` on the first `max_evidences_per_question` pairs of
+    every claim, and stores an overall boolean under the output name plus the
+    per-claim results under "detail".
+
+    NOTE(review): the per-claim "factuality" is `all()` over the gate's `is_open`
+    flags. Confirm in `run_agreement_gate` whether a truthy `is_open` means the
+    evidence agrees or disagrees with the claim.
+    NOTE(review): a claim with no evidence, or an empty input mapping, is reported
+    as factual because `all()` of an empty sequence is True -- confirm that this
+    is intended.
+    """
+
+    def __init__(self, args):
+        super().__init__(args)
+        # Number of (query, evidence) pairs checked per claim.
+        self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
+        # Model name handed to the agreement gate.
+        self.model = self.global_config.get("rarr_model", "text-davinci-003")
+
+    def __call__(self, state: FactCheckerState, *args, **kwargs):
+        results = []
+        for claim, evidences in state.get(self.input_name).items():
+            kept_evidences = evidences[: self.max_evidences_per_question]
+            labels = [
+                run_agreement_gate(
+                    claim=claim,
+                    context=None,
+                    query=query,
+                    evidence=evidence,
+                    model=self.model,
+                    prompt=AGREEMENT_GATE_PROMPT,
+                )["is_open"]
+                for query, evidence in kept_evidences
+            ]
+            results.append(
+                {
+                    "claim": claim,
+                    "evidences": kept_evidences,
+                    "labels": labels,
+                    "factuality": all(labels),
+                }
+            )
+
+        overall = all(entry["factuality"] for entry in results)
+        state.set(self.output_name, overall)
+        state.set("detail", results)
+        return True, state