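"""Gradio chat demo for teknium/Replit-v2-CodeInstruct-3B, run locally with ctransformers."""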
import os
from dataclasses import dataclass, asdict

from ctransformers import AutoModelForCausalLM, AutoConfig
import gradio as gr


@dataclass
class GenerationConfig:
    temperature: float
    top_k: int
    top_p: float
    repetition_penalty: float
    max_new_tokens: int
    seed: int
    reset: bool
    stream: bool
    threads: int
    stop: list[str]


def format_prompt(user_prompt: str):
    """Wrap the user prompt in the instruction/response template the model expects."""
    return f"""### Instruction:
{user_prompt}
### Response:"""


def generate(
    llm: AutoModelForCausalLM,
    generation_config: GenerationConfig,
    user_prompt: str,
):
    """Run model inference; returns a generator of tokens when stream=True."""
    return llm(
        format_prompt(user_prompt),
        **asdict(generation_config),
    )


def generate_response(user_input):
    generator = generate(llm, generation_config, user_input.strip())
    # Accumulate the streamed tokens into a single string before returning.
    response = ""
    for token in generator:
        response += token
    return response
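# Load the quantized model weights and launch the Gradio UI when run as a script.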
if __name__ == "__main__":
    config = AutoConfig.from_pretrained(
        "teknium/Replit-v2-CodeInstruct-3B", context_length=2048
    )
    llm = AutoModelForCausalLM.from_pretrained(
        os.path.abspath("models/replit-v2-codeinstruct-3b.q4_1.bin"),
        model_type="replit",
        config=config,
    )
    generation_config = GenerationConfig(
        temperature=0.2,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=512,  # adjust as needed
        seed=42,
        reset=True,  # reset history (cache)
        stream=True,  # stream tokens as they are generated
        threads=max(1, (os.cpu_count() or 1) // 6),  # adjust for your CPU; keep at least one thread
        stop=["<|endoftext|>"],  # the model's end-of-text marker
    )

    user_prefix = "[user]: "
    assistant_prefix = "[assistant]: "

    iface = gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(label=user_prefix),
        outputs=gr.Textbox(label=assistant_prefix),
        title="Chat with Assistant",
        description="Ask any question and get a response from the Assistant!",
    )
    iface.launch()
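# launch() serves on http://127.0.0.1:7860 by default; pass share=True for a temporary public link.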