Spaces:

deeme
/

net

Running

App Files Files Community

deeme committed 10 days ago

Commit

0eb1947

verified ·

1 Parent(s): f8fb8ae

Upload free_ask_internet.py

Browse files

Files changed (1) hide show

free_ask_internet.py +76 -3

free_ask_internet.py CHANGED Viewed

@@ -121,9 +121,17 @@ def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000,
     }
     if len(ref_content) > 0:
         prompts = '''
-        You are a large language AI assistant. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference symbol like ¹. Please use the context and cite the context at the end of each sentence if applicable.
         Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
-        Please cite the contexts with the reference symbols. If a sentence comes from multiple contexts, please list all applicable citations, like ³⁵. Other than code and specific names and citations, your answer must be written in the same language as the question.
         Here are the set of contexts:
         '''  + "\n\n" + "```"
@@ -278,4 +286,69 @@ def ask_internet(query:str, model:str, debug=False):
             subdomain = tldextract.extract(url).subdomain
             site_name = subdomain + "." + domain if subdomain else domain
             yield f"*{count}. [{site_name}]({url})*\n"
-            count += 1

     }
     if len(ref_content) > 0:
         prompts = '''
+        You are a large language AI assistant. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference symbol.
+        IMPORTANT INSTRUCTIONS FOR CITATIONS:
+        1. Use ONLY the exact reference symbols provided (like ¹, ², ³) to cite sources
+        2. Do not invent or combine citation numbers that weren't provided
+        3. Place the citation symbol immediately after the sentence or phrase it supports
+        4. Each citation should correspond to exactly one source
+        5. Do not use citation symbols that weren't defined in the context
         Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
         Here are the set of contexts:
         '''  + "\n\n" + "```"
             subdomain = tldextract.extract(url).subdomain
             site_name = subdomain + "." + domain if subdomain else domain
             yield f"*{count}. [{site_name}]({url})*\n"
+            count += 1
+def ask_internet(query:str, model:str, debug=False):
+    content_list = search_web_ref(query, debug=debug)
+    if debug:
+        print(content_list)
+    prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)
+    # 收集所有回答内容
+    response_content = ""
+    for token in chat(prompt=prompt, model=model):
+        if token:
+            response_content += token
+    # 改进的引用处理逻辑
+    if content_list:
+        # 定义 Unicode 上标数字映射
+        superscript_digits = {
+            '1': '¹', '2': '²', '3': '³', '4': '⁴', '5': '⁵',
+            '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹', '0': '⁰'
+        }
+        # 创建上标符号到URL的映射
+        symbol_url_map = {}
+        for i in range(len(content_list)):
+            if i >= 10:  # 限制最多10个引用，避免复杂引用
+                break
+            num = str(i + 1)
+            sup_num = ''.join(superscript_digits.get(c, c) for c in num)
+            url = content_list[i].get('url')
+            if url:
+                symbol_url_map[sup_num] = url
+        # 更精确的引用替换
+        modified_content = response_content
+        for symbol, url in symbol_url_map.items():
+            # 使用正则表达式确保只替换独立的上标字符，避免替换组合上标
+            import re
+            pattern = r'(?<!\[)' + re.escape(symbol) + r'(?!\])'
+            modified_content = re.sub(pattern, f'[{symbol}]({url})', modified_content)
+        yield modified_content
+    else:
+        yield response_content
+    # 参考资料部分
+    yield "\n\n---\n"
+    yield "参考资料:\n"
+    if content_list:
+        for i, url_content in enumerate(content_list):
+            if i >= 10:  # 限制最多10个引用
+                break
+            url = url_content.get('url')
+            if url:
+                # 提取域名作为显示名称
+                domain = tldextract.extract(url).domain
+                subdomain = tldextract.extract(url).subdomain
+                site_name = subdomain + "." + domain if subdomain else domain
+                # 使用上标数字作为引用编号
+                num = str(i + 1)
+                sup_num = ''.join(superscript_digits.get(c, c) for c in num)
+                yield f"{sup_num} [{site_name}]({url})\n"