import gradio as gr
import spaces
import os
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer


# Read the Hugging Face access token from the environment (if set)
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# ZeroGPU sanity check: outside a @spaces.GPU-decorated function the tensor
# still reports 'cpu', because the GPU is only attached inside decorated calls.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔


model_id = 'FINGU-AI/Qwen-Orpo-v1'
# Load with SDPA attention (attn_implementation="flash_attention_2" could be used where supported)
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
model.to('cuda')

# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

# Decoding settings applied to every request
generation_params = {
    'max_new_tokens': 1000,
    'use_cache': True,
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.9,
    # 'top_k': 50,
}

@spaces.GPU
def inference(query):
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant for traders and investors."},
        {"role": "user", "content": f"{query}"},
    ]

    # Build the chat prompt with the model's template and generate on the GPU
    tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    outputs = model.generate(tokenized_chat, **generation_params)
    # Keep special tokens so the assistant turn can be isolated, then strip the markers
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
    assistant_response = decoded_outputs[0].split("<|im_start|>assistant\n")[-1].strip()
    response_ = assistant_response.replace('<|im_end|>', "")
    return response_
    # Streaming alternative:
    # outputs = model.generate(tokenized_chat, **generation_params, streamer=streamer)
    # return outputs
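
# Note: an equivalent way to recover just the assistant turn is to slice off the prompt
# tokens before decoding, which avoids matching on the '<|im_start|>assistant' marker:
#     new_tokens = outputs[:, tokenized_chat.shape[-1]:]
#     return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()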

# Example prompts shown in the Gradio chat interface
examples = ['How can options strategies such as straddles, strangles, and spreads be used to hedge against market volatility?',
           'How do changes in interest rates, inflation, and GDP growth impact stock and bond markets?',
           'What are the key components and strategies involved in developing an effective algorithmic trading system?',
           'How can investors integrate environmental, social, and governance (ESG) factors into their investment decisions to achieve both financial returns and social impact?',
           'How do geopolitical events such as trade wars, political instability, and international conflicts affect global financial markets?',
           'How does blockchain technology have the potential to disrupt financial markets and investment practices?']

def response(message, history):
    text = inference(message)
    return text
    # Simulated character-by-character streaming alternative:
    # for i in range(len(text)):
    #     time.sleep(0.01)
    #     yield text[: i + 1]

gr.ChatInterface(response, examples=examples).launch()
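
# Streaming sketch (not wired in above): with transformers' TextIteratorStreamer and a
# background thread, `response` could yield partial text so the chat updates live.
# Hypothetical variant, reusing the same model/tokenizer/generation_params; on ZeroGPU
# it would also need the @spaces.GPU decorator.
#
#     from threading import Thread
#     from transformers import TextIteratorStreamer
#
#     def response_streaming(message, history):
#         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#         chat = tokenizer.apply_chat_template(
#             [{"role": "user", "content": message}],
#             tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
#         Thread(target=model.generate, kwargs=dict(inputs=chat, streamer=streamer, **generation_params)).start()
#         partial = ""
#         for chunk in streamer:
#             partial += chunk
#             yield partial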