Upload free_ask_internet.py
Browse files- free_ask_internet.py +76 -3
free_ask_internet.py
CHANGED
@@ -121,9 +121,17 @@ def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000,
|
|
121 |
}
|
122 |
if len(ref_content) > 0:
|
123 |
prompts = '''
|
124 |
-
You are a large language AI assistant. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference symbol
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
|
126 |
-
|
127 |
Here are the set of contexts:
|
128 |
''' + "\n\n" + "```"
|
129 |
|
@@ -278,4 +286,69 @@ def ask_internet(query:str, model:str, debug=False):
|
|
278 |
subdomain = tldextract.extract(url).subdomain
|
279 |
site_name = subdomain + "." + domain if subdomain else domain
|
280 |
yield f"*{count}. [{site_name}]({url})*\n"
|
281 |
-
count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
}
|
122 |
if len(ref_content) > 0:
|
123 |
prompts = '''
|
124 |
+
You are a large language AI assistant. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference symbol.
|
125 |
+
|
126 |
+
IMPORTANT INSTRUCTIONS FOR CITATIONS:
|
127 |
+
1. Use ONLY the exact reference symbols provided (like ¹, ², ³) to cite sources
|
128 |
+
2. Do not invent or combine citation numbers that weren't provided
|
129 |
+
3. Place the citation symbol immediately after the sentence or phrase it supports
|
130 |
+
4. Each citation should correspond to exactly one source
|
131 |
+
5. Do not use citation symbols that weren't defined in the context
|
132 |
+
|
133 |
Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
|
134 |
+
|
135 |
Here are the set of contexts:
|
136 |
''' + "\n\n" + "```"
|
137 |
|
|
|
286 |
subdomain = tldextract.extract(url).subdomain
|
287 |
site_name = subdomain + "." + domain if subdomain else domain
|
288 |
yield f"*{count}. [{site_name}]({url})*\n"
|
289 |
+
count += 1
def ask_internet(query:str, model:str, debug=False):
|
290 |
+
content_list = search_web_ref(query, debug=debug)
|
291 |
+
if debug:
|
292 |
+
print(content_list)
|
293 |
+
prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)
|
294 |
+
|
295 |
+
# 收集所有回答内容
|
296 |
+
response_content = ""
|
297 |
+
for token in chat(prompt=prompt, model=model):
|
298 |
+
if token:
|
299 |
+
response_content += token
|
300 |
+
|
301 |
+
# 改进的引用处理逻辑
|
302 |
+
if content_list:
|
303 |
+
# 定义 Unicode 上标数字映射
|
304 |
+
superscript_digits = {
|
305 |
+
'1': '¹', '2': '²', '3': '³', '4': '⁴', '5': '⁵',
|
306 |
+
'6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹', '0': '⁰'
|
307 |
+
}
|
308 |
+
|
309 |
+
# 创建上标符号到URL的映射
|
310 |
+
symbol_url_map = {}
|
311 |
+
for i in range(len(content_list)):
|
312 |
+
if i >= 10: # 限制最多10个引用,避免复杂引用
|
313 |
+
break
|
314 |
+
num = str(i + 1)
|
315 |
+
sup_num = ''.join(superscript_digits.get(c, c) for c in num)
|
316 |
+
url = content_list[i].get('url')
|
317 |
+
if url:
|
318 |
+
symbol_url_map[sup_num] = url
|
319 |
+
|
320 |
+
# 更精确的引用替换
|
321 |
+
modified_content = response_content
|
322 |
+
for symbol, url in symbol_url_map.items():
|
323 |
+
# 使用正则表达式确保只替换独立的上标字符,避免替换组合上标
|
324 |
+
import re
|
325 |
+
pattern = r'(?<!\[)' + re.escape(symbol) + r'(?!\])'
|
330 |
+
modified_content = re.sub(pattern, f'[{symbol}]({url})', modified_content)
|
331 |
+
|
332 |
+
yield modified_content
|
333 |
+
else:
|
334 |
+
yield response_content
|
335 |
+
|
336 |
+
# 参考资料部分
|
337 |
+
yield "\n\n---\n"
|
338 |
+
yield "参考资料:\n"
|
339 |
+
if content_list:
|
340 |
+
for i, url_content in enumerate(content_list):
|
341 |
+
if i >= 10: # 限制最多10个引用
|
342 |
+
break
|
343 |
+
url = url_content.get('url')
|
344 |
+
if url:
|
345 |
+
# 提取域名作为显示名称
|
346 |
+
domain = tldextract.extract(url).domain
|
347 |
+
subdomain = tldextract.extract(url).subdomain
|
348 |
+
site_name = subdomain + "." + domain if subdomain else domain
|
349 |
+
|
350 |
+
# 使用上标数字作为引用编号
|
351 |
+
num = str(i + 1)
|
352 |
+
sup_num = ''.join(superscript_digits.get(c, c) for c in num)
|
353 |
+
|
354 |
+
yield f"{sup_num} [{site_name}]({url})\n"
|