File size: 9,250 Bytes
68786bb
 
 
 
 
 
 
 
734d84d
68786bb
 
 
5e57195
68786bb
 
 
 
 
 
734d84d
 
68786bb
734d84d
 
68786bb
734d84d
68786bb
a90e364
 
68786bb
 
 
a90e364
26d27e8
68786bb
 
a90e364
68786bb
 
 
 
 
 
 
a90e364
68786bb
 
 
 
 
 
 
 
 
 
 
 
a90e364
68786bb
 
 
a90e364
 
 
 
 
68786bb
 
 
 
 
 
 
a90e364
68786bb
 
 
 
 
 
a90e364
68786bb
 
 
a90e364
68786bb
a90e364
68786bb
a90e364
68786bb
 
 
 
 
 
a90e364
68786bb
 
 
 
bc07d07
 
68786bb
 
 
 
bc07d07
68786bb
734d84d
68786bb
734d84d
68786bb
734d84d
68786bb
734d84d
bc07d07
 
68786bb
33cb929
8ee7966
e76904c
8ee7966
 
e76904c
8ee7966
 
 
 
 
 
 
 
 
33cb929
 
8ee7966
e76904c
8ee7966
bc07d07
68786bb
 
bc07d07
68786bb
 
734d84d
68786bb
 
bc07d07
734d84d
68786bb
 
 
734d84d
68786bb
 
 
 
734d84d
68786bb
 
734d84d
68786bb
734d84d
68786bb
 
 
 
 
734d84d
68786bb
 
 
 
 
 
734d84d
 
68786bb
734d84d
 
68786bb
734d84d
 
 
 
 
 
 
 
68786bb
26c1d7c
68786bb
 
 
 
 
734d84d
3621224
734d84d
68786bb
 
734d84d
68786bb
 
 
 
 
 
 
 
 
 
734d84d
68786bb
 
 
 
734d84d
68786bb
 
 
 
734d84d
68786bb
bc07d07
 
734d84d
6f36198
bc07d07
5260a86
 
bc07d07
6f36198
 
0eb1947
 
bc07d07
6f36198
bc07d07
59f800d
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import json
import os 
from pprint import pprint
import requests
import trafilatura
from trafilatura import bare_extraction
from concurrent.futures import ThreadPoolExecutor
import concurrent
import requests
import openai
import time 
from datetime import datetime
from urllib.parse import urlparse
import tldextract
import platform
import urllib.parse

 
def extract_url_content(url):
    """Download *url* and extract its main readable text.

    Args:
        url: Absolute URL to fetch.

    Returns:
        dict: ``{"url": url, "content": text}`` where ``content`` is
        ``None`` when the download or the extraction fails.
    """
    downloaded = trafilatura.fetch_url(url)
    # fetch_url returns None on network/HTTP failure; guard so a failed
    # download yields a None content instead of relying on extract(None).
    content = trafilatura.extract(downloaded) if downloaded else None

    return {"url": url, "content": content}


 

def search_web_ref(query:str, debug=False):
    """Query a SearXNG instance and fetch the page text of the top results.

    Args:
        query: Free-text search query.
        debug: When True, print the raw search response and each fetched URL.

    Returns:
        list[dict]: One dict per successfully extracted page with keys
        ``url``, ``content`` and ``length``. At most the first 9 search
        results are fetched concurrently.

    Raises:
        requests.HTTPError: When the SearXNG request returns an error status.
    """
    content_list = []

    try:
        # ":all !general" selects all engines outside the general category.
        safe_string = urllib.parse.quote_plus(":all !general " + query)

        searxng_url = os.environ.get('SEARXNG_URL')
        response = requests.get(searxng_url + '?q=' + safe_string + '&format=json')
        response.raise_for_status()
        search_results = response.json()

        if debug:
            print("JSON Response:")
            pprint(search_results)

        pending_urls = []
        # NOTE(review): conv_links is built but never consumed here — it
        # looks intended for a caller; kept for now, confirm before removing.
        conv_links = []

        if search_results.get('results'):
            for item in search_results.get('results')[0:9]:
                name = item.get('title')
                snippet = item.get('content')
                url = item.get('url')
                pending_urls.append(url)

                # Defaults guard against a result without a URL; the previous
                # version referenced unbound variables in that case.
                site_name = None
                icon_url = None
                if url:
                    url_parsed = urlparse(url)
                    icon_url = url_parsed.scheme + '://' + url_parsed.netloc + '/favicon.ico'
                    site_name = tldextract.extract(url).domain

                conv_links.append({
                    'site_name': site_name,
                    'icon_url': icon_url,
                    'title': name,
                    'url': url,
                    'snippet': snippet
                })

            results = []
            futures = []

            executor = ThreadPoolExecutor(max_workers=10)
            for url in pending_urls:
                futures.append(executor.submit(extract_url_content, url))
            try:
                for future in futures:
                    results.append(future.result(timeout=5))
                # Release the worker threads on the success path too
                # (previously the pool was only shut down on timeout).
                executor.shutdown(wait=False)
            except concurrent.futures.TimeoutError:
                print("任务执行超时")
                executor.shutdown(wait=False, cancel_futures=True)

            for content in results:
                if content and content.get('content'):
                    content_list.append({
                        "url": content.get('url'),
                        "content": content.get('content'),
                        "length": len(content.get('content'))
                    })
                if debug:
                    # Previously printed the stale loop variable `url` from
                    # the submission loop; report the fetched page's URL.
                    print("URL: {}".format(content.get('url') if content else None))
                    print("=================")

        return content_list
    except Exception:
        # Bare `raise` preserves the original traceback (was `raise ex`).
        raise


def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000, debug=False):
    """Assemble the final LLM prompt from a question and search contexts.

    Args:
        question: User question; truncated to fit the character budget.
        content_list: Dicts carrying a ``content`` key (see search_web_ref);
            entries with missing/empty content are skipped.
        lang: UI locale selecting the answer language; unknown locales fall
            back to Simplified Chinese.
        context_length_limit: Rough character budget; 2000 characters are
            reserved for the instruction scaffolding.
        debug: When True, print the generated prompt and its length.

    Returns:
        str: The full prompt, or just the (possibly truncated) question
        when no reference contexts were supplied.
    """
    limit_len = (context_length_limit - 2000)
    if len(question) > limit_len:
        question = question[0:limit_len]

    ref_content = [item.get("content") for item in content_list]

    # Locale -> language name spliced into the instructions; default keeps
    # the original fallback of Simplified Chinese.
    language_by_locale = {
        "zh-CN": ' Simplified Chinese ',
        "zh-TW": ' Traditional Chinese ',
        "en-US": ' English ',
    }
    answer_language = language_by_locale.get(lang, ' Simplified Chinese ')

    if len(ref_content) > 0:
        prompts = '''
        You are a large language AI assistant develop by nash_su. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question. Please use the context to provide accurate information.
        Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.

        Other than code and specific names, your answer must be written in the same language as the question.
        Here are the set of contexts:
        '''  + "\n\n" + "```"
        ref_index = 1

        for ref_text in ref_content:
            if not ref_text:
                # Skip absent contexts instead of crashing on None concat.
                continue
            prompts = prompts + "\n\n" + " [Context {}]  ".format(str(ref_index)) + ref_text
            ref_index += 1

        # Hard cap: trims the context section; the instruction tail below is
        # appended afterwards (same behavior as before).
        if len(prompts) >= limit_len:
            prompts = prompts[0:limit_len]
        prompts = prompts + '''
        ```
        Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Don't mention the context numbers in your response.
        Remember, don't blindly repeat the contexts verbatim. And here is the user question:
        ''' + question

    else:
        prompts = question

    if debug:
        print(prompts)
        print("总长度:"+ str(len(prompts)))
    return prompts


def defaultchat(message, model:str, stream=True, debug=False):
    """Yield chat-completion tokens for a prepared *message* list.

    Args:
        message: List of chat messages ({"role": ..., "content": ...}).
        model: Model identifier for the OpenAI-compatible endpoint.
        stream: When True (default) yield tokens as they arrive; when False
            perform one blocking request and yield the whole answer once.
            (Previously this flag was ignored and the call always streamed.)
        debug: When True, print the accumulated answer when done
            (mirrors chat() for consistency).

    Yields:
        str: Non-empty answer fragments.
    """
    openai.base_url = os.environ.get('OPENAI_BASE_URL')
    openai.api_key = os.environ.get('OPENAI_API_KEY')
    total_content = ""
    if stream:
        for chunk in openai.chat.completions.create(
            model=model,
            messages=message,
            stream=True,
            max_tokens=3072, temperature=0.2
        ):
            stream_resp = chunk.dict()
            token = stream_resp["choices"][0]["delta"].get("content", "")
            if token:
                total_content += token
                yield token
    else:
        # Non-streaming path: honor stream=False instead of silently
        # streaming anyway.
        resp = openai.chat.completions.create(
            model=model,
            messages=message,
            stream=False,
            max_tokens=3072, temperature=0.2
        )
        token = resp.choices[0].message.content or ""
        if token:
            total_content = token
            yield token
    if debug:
        print(total_content)

def ask_gpt(message, model_id, debug=False):
    """Relay answer tokens for *message* from model *model_id*.

    Thin streaming proxy over defaultchat(); empty fragments are dropped.
    """
    collected = []
    for fragment in defaultchat(message, model_id):
        if not fragment:
            continue
        collected.append(fragment)
        yield fragment
            
def summary_gpt(message, model:str, debug=False):
    """Rewrite a conversational question into a standalone, unambiguous one.

    Sends *message* (conversation context) with a fixed Chinese system
    prompt to an OpenAI-compatible ``chat/completions`` endpoint and
    returns the model's rewritten question.

    Args:
        message: Conversation context; stringified into the user turn.
        model: Model identifier.
        debug: Unused; kept for signature compatibility.

    Returns:
        str: The optimized question text.

    Raises:
        requests.HTTPError: When the HTTP request fails.
    """
    msgs = [
        {"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"等这种类似的代名词;如果提问中包含泛指的名词,请根据上下文明确的定语,补充具体的细节以提供完整的信息;最后,只允许输出经过你精确优化的问题,不要有任何多余的文字。举例说明,1-当提问者问:他在做什么?,你根据上下文你可以得知他是"小明",那么你优化问题后输出"小明在干什么?"2-当提问者问:他们乐队都有谁?,你根据上下文可以得知乐队是"小强乐队",那么你优化问题后输出"小强乐队都有谁?"'},
        {"role": "user", "content": str(message)},
    ]
    json_data = {
            "model": model,
            "messages": msgs,
            "temperature": 0.8,
            "max_tokens": 2560,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
            "stop": None
    }
    apiurl = os.environ.get('OPENAI_BASE_URL')
    pooltoken = os.environ.get('OPENAI_API_KEY')
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer {}'.format(pooltoken),
    }
    # NOTE(review): assumes OPENAI_BASE_URL ends with a '/' — confirm.
    response = requests.post(apiurl + 'chat/completions', headers=headers, json=json_data)
    # Fail loudly on HTTP errors instead of a confusing KeyError below.
    response.raise_for_status()
    res = response.json()['choices'][0]['message']['content']
    return res

def chat(prompt, model:str, stream=True, debug=False):
    """Yield chat-completion tokens for a single-prompt user message.

    Args:
        prompt: Full prompt text sent as one user message.
        model: Model identifier for the OpenAI-compatible endpoint.
        stream: When True (default) yield tokens as they arrive; when False
            perform one blocking request and yield the whole answer once.
            (Previously this flag was ignored and the call always streamed.)
        debug: When True, print the accumulated answer when done.

    Yields:
        str: Non-empty answer fragments.
    """
    openai.base_url = os.environ.get('OPENAI_BASE_URL')
    openai.api_key = os.environ.get('OPENAI_API_KEY')
    total_content = ""
    user_messages = [{
        "role": "user",
        "content": prompt
    }]
    if stream:
        for chunk in openai.chat.completions.create(
            model=model,
            messages=user_messages,
            stream=True,
            max_tokens=3072, temperature=0.2
        ):
            stream_resp = chunk.dict()
            token = stream_resp["choices"][0]["delta"].get("content", "")
            if token:
                total_content += token
                yield token
    else:
        # Non-streaming path: honor stream=False instead of silently
        # streaming anyway.
        resp = openai.chat.completions.create(
            model=model,
            messages=user_messages,
            stream=False,
            max_tokens=3072, temperature=0.2
        )
        token = resp.choices[0].message.content or ""
        if token:
            total_content = token
            yield token
    if debug:
        print(total_content)
 

 
    
def ask_internet(query:str, model:str, debug=False):
    """Search the web for *query* and stream an LLM answer plus references.

    Args:
        query: User question.
        model: Model identifier forwarded to chat().
        debug: When True, print intermediate search results and the prompt.

    Yields:
        str: Answer tokens, followed by a markdown reference list of the
        source URLs.
    """
    content_list = search_web_ref(query, debug=debug)
    if debug:
        print(content_list)
    prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)

    for token in chat(prompt=prompt, model=model):
        if token:
            yield token

    yield "\n\n"
    # Toggle for appending the reference list (was a hard-coded `if True:`).
    append_references = True
    if append_references:
        yield "---"
        yield "\nSearxng"
        yield "参考资料:\n"
        for count, url_content in enumerate(content_list, start=1):
            url = url_content.get('url')
            yield "*[{}. {}]({})*".format(str(count), url, url)
            yield "\n"