deeme committed on
Commit
0eb1947
·
verified ·
1 Parent(s): f8fb8ae

Upload free_ask_internet.py

Browse files
Files changed (1) hide show
  1. free_ask_internet.py +76 -3
free_ask_internet.py CHANGED
@@ -121,9 +121,17 @@ def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000,
121
  }
122
  if len(ref_content) > 0:
123
  prompts = '''
124
- You are a large language AI assistant. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference symbol like ¹. Please use the context and cite the context at the end of each sentence if applicable.
 
 
 
 
 
 
 
 
125
  Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
126
- Please cite the contexts with the reference symbols. If a sentence comes from multiple contexts, please list all applicable citations, like ³⁵. Other than code and specific names and citations, your answer must be written in the same language as the question.
127
  Here are the set of contexts:
128
  ''' + "\n\n" + "```"
129
 
@@ -278,4 +286,69 @@ def ask_internet(query:str, model:str, debug=False):
278
  subdomain = tldextract.extract(url).subdomain
279
  site_name = subdomain + "." + domain if subdomain else domain
280
  yield f"*{count}. [{site_name}]({url})*\n"
281
- count += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  }
122
  if len(ref_content) > 0:
123
  prompts = '''
124
+ You are a large language AI assistant. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference symbol.
125
+
126
+ IMPORTANT INSTRUCTIONS FOR CITATIONS:
127
+ 1. Use ONLY the exact reference symbols provided (like ¹, ², ³) to cite sources
128
+ 2. Do not invent or combine citation numbers that weren't provided
129
+ 3. Place the citation symbol immediately after the sentence or phrase it supports
130
+ 4. Each citation should correspond to exactly one source
131
+ 5. Do not use citation symbols that weren't defined in the context
132
+
133
  Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
134
+
135
  Here are the set of contexts:
136
  ''' + "\n\n" + "```"
137
 
 
286
  subdomain = tldextract.extract(url).subdomain
287
  site_name = subdomain + "." + domain if subdomain else domain
288
  yield f"*{count}. [{site_name}]({url})*\n"
289
+ count += 1
+ def ask_internet(query:str, model:str, debug=False):
290
+ content_list = search_web_ref(query, debug=debug)
291
+ if debug:
292
+ print(content_list)
293
+ prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)
294
+
295
+ # 收集所有回答内容
296
+ response_content = ""
297
+ for token in chat(prompt=prompt, model=model):
298
+ if token:
299
+ response_content += token
300
+
301
+ # 改进的引用处理逻辑
302
+ if content_list:
303
+ # 定义 Unicode 上标数字映射
304
+ superscript_digits = {
305
+ '1': '¹', '2': '²', '3': '³', '4': '⁴', '5': '⁵',
306
+ '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹', '0': '⁰'
307
+ }
308
+
309
+ # 创建上标符号到URL的映射
310
+ symbol_url_map = {}
311
+ for i in range(len(content_list)):
312
+ if i >= 10: # 限制最多10个引用,避免复杂引用
313
+ break
314
+ num = str(i + 1)
315
+ sup_num = ''.join(superscript_digits.get(c, c) for c in num)
316
+ url = content_list[i].get('url')
317
+ if url:
318
+ symbol_url_map[sup_num] = url
319
+
320
+ # 更精确的引用替换
321
+ modified_content = response_content
322
+ for symbol, url in symbol_url_map.items():
323
+ # 使用正则表达式确保只替换独立的上标字符,避免替换组合上标
324
+ import re
325
+ pattern = r'(?<!\[)' + re.escape(symbol) + r'(?!\])'
330
+ modified_content = re.sub(pattern, f'[{symbol}]({url})', modified_content)
331
+
332
+ yield modified_content
333
+ else:
334
+ yield response_content
335
+
336
+ # 参考资料部分
337
+ yield "\n\n---\n"
338
+ yield "参考资料:\n"
339
+ if content_list:
340
+ for i, url_content in enumerate(content_list):
341
+ if i >= 10: # 限制最多10个引用
342
+ break
343
+ url = url_content.get('url')
344
+ if url:
345
+ # 提取域名作为显示名称
346
+ domain = tldextract.extract(url).domain
347
+ subdomain = tldextract.extract(url).subdomain
348
+ site_name = subdomain + "." + domain if subdomain else domain
349
+
350
+ # 使用上标数字作为引用编号
351
+ num = str(i + 1)
352
+ sup_num = ''.join(superscript_digits.get(c, c) for c in num)
353
+
354
+ yield f"{sup_num} [{site_name}]({url})\n"