Liyan06
committed on
Commit
·
3fbb656
1
Parent(s):
113a57e
update retrieval and doc display ranking
Browse files
- handler.py +25 -4
- web_retrieval.py +3 -2
handler.py
CHANGED
|
@@ -51,6 +51,7 @@ class EndpointHandler():
|
|
| 51 |
def __init__(self, path="./"):
|
| 52 |
self.scorer = MiniCheck(path=path)
|
| 53 |
self.rouge = evaluate.load('rouge')
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
def __call__(self, data):
|
|
@@ -82,7 +83,7 @@ class EndpointHandler():
|
|
| 82 |
else:
|
| 83 |
assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
|
| 84 |
|
| 85 |
-
ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
|
| 86 |
|
| 87 |
span_to_highlight = []
|
| 88 |
for doc_chunk, score in zip(ranked_docs, scores):
|
|
@@ -104,7 +105,12 @@ class EndpointHandler():
|
|
| 104 |
return outputs
|
| 105 |
|
| 106 |
|
| 107 |
-
def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
search_results = search_google(claim, timeout=timeout)
|
| 110 |
|
|
@@ -133,9 +139,24 @@ class EndpointHandler():
|
|
| 133 |
num_chunks = len([item for items in used_chunk for item in items])
|
| 134 |
print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
|
| 135 |
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
-
|
| 139 |
|
| 140 |
|
| 141 |
def chunk_and_highest_rouge_score(self, doc, claim):
|
|
|
|
| 51 |
def __init__(self, path="./"):
|
| 52 |
self.scorer = MiniCheck(path=path)
|
| 53 |
self.rouge = evaluate.load('rouge')
|
| 54 |
+
self.tfidf_order = True
|
| 55 |
|
| 56 |
|
| 57 |
def __call__(self, data):
|
|
|
|
| 83 |
else:
|
| 84 |
assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
|
| 85 |
|
| 86 |
+
ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
|
| 87 |
|
| 88 |
span_to_highlight = []
|
| 89 |
for doc_chunk, score in zip(ranked_docs, scores):
|
|
|
|
| 105 |
return outputs
|
| 106 |
|
| 107 |
|
| 108 |
+
def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False, tfidf_order=False):
|
| 109 |
+
|
| 110 |
+
"""
|
| 111 |
+
if tfidf_order == True, then display the docs in the order of TF-IDF similarity with the claim, regardless of the entailment score
|
| 112 |
+
otherwise, display the docs in the order of the entailment score
|
| 113 |
+
"""
|
| 114 |
|
| 115 |
search_results = search_google(claim, timeout=timeout)
|
| 116 |
|
|
|
|
| 139 |
num_chunks = len([item for items in used_chunk for item in items])
|
| 140 |
print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
|
| 141 |
|
| 142 |
+
if tfidf_order:
|
| 143 |
+
tfidf_docs, scores = [], []
|
| 144 |
+
for used_c, support_prob_per_c in zip(used_chunk, support_prob_per_chunk):
|
| 145 |
+
# If the doc can support the claim, find the chunk with the
|
| 146 |
+
# highest entailment score; otherwise, use the first chunk
|
| 147 |
+
if max(support_prob_per_c) > 0.5:
|
| 148 |
+
tfidf_docs.append(used_c[np.argmax(support_prob_per_c)])
|
| 149 |
+
scores.append(max(support_prob_per_c))
|
| 150 |
+
else:
|
| 151 |
+
tfidf_docs.append(used_c[0])
|
| 152 |
+
scores.append(support_prob_per_c[0])
|
| 153 |
+
|
| 154 |
+
return tfidf_docs, scores, urls
|
| 155 |
+
|
| 156 |
+
else:
|
| 157 |
+
ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
|
| 158 |
|
| 159 |
+
return ranked_docs, scores, ranked_urls
|
| 160 |
|
| 161 |
|
| 162 |
def chunk_and_highest_rouge_score(self, doc, claim):
|
web_retrieval.py
CHANGED
|
@@ -82,7 +82,7 @@ def scrape_url(url: str, timeout=10) -> Tuple[str, str]:
|
|
| 82 |
return web_text, url
|
| 83 |
|
| 84 |
|
| 85 |
-
def search_google(query:str, num_web_pages:int=
|
| 86 |
"""Searches the query using Google.
|
| 87 |
Args:
|
| 88 |
query: Search query.
|
|
@@ -108,7 +108,8 @@ def search_google(query:str, num_web_pages:int=20, timeout:int=6, save_url:str='
|
|
| 108 |
for page in range(0, num_web_pages, 10):
|
| 109 |
# here page is google search's bottom page meaning, click 2 -> start=10
|
| 110 |
# url = "https://www.google.com/search?q={}&start={}".format(query, page)
|
| 111 |
-
url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
|
|
|
|
| 112 |
r = requests.get(url, headers=headers, timeout=timeout)
|
| 113 |
# collect all urls by regular expression
|
| 114 |
# how to do if I just want to have the returned top-k pages?
|
|
|
|
| 82 |
return web_text, url
|
| 83 |
|
| 84 |
|
| 85 |
+
def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='') -> List[str]:
|
| 86 |
"""Searches the query using Google.
|
| 87 |
Args:
|
| 88 |
query: Search query.
|
|
|
|
| 108 |
for page in range(0, num_web_pages, 10):
|
| 109 |
# here page is google search's bottom page meaning, click 2 -> start=10
|
| 110 |
# url = "https://www.google.com/search?q={}&start={}".format(query, page)
|
| 111 |
+
# url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
|
| 112 |
+
url = f"https://www.google.com/search?q={search_query}&start={page}"
|
| 113 |
r = requests.get(url, headers=headers, timeout=timeout)
|
| 114 |
# collect all urls by regular expression
|
| 115 |
# how to do if I just want to have the returned top-k pages?
|