cris177 committed
Commit ab7670e · verified · 1 Parent(s): 772cf77

Update app.py

Files changed (1): app.py +19 -10
app.py CHANGED
@@ -1,10 +1,21 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 import gradio as gr
 
-model = AutoModelForCausalLM.from_pretrained("cris177/Qwen2-Simple-Arguments")
-tokenizer = AutoTokenizer.from_pretrained("cris177/Qwen2-Simple-Arguments")
+## Download the GGUF model
+model_name = "cris177/Qwen2-Simple-Arguments"
+model_file = "Qwen2_arguments.Q4_K_M.gguf"  # the specific file used here: a 4-bit quant; other quantization levels are available in the model repo
+model_path = hf_hub_download(model_name, filename=model_file)
+
+## Instantiate model from downloaded file
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2000,      # Context length to use
+    n_threads=2,     # Number of CPU threads to use
+    n_gpu_layers=0   # Number of model layers to offload to GPU
+)
+
 
-argument = "If it's wednesday it's cold, and it's cold, therefore it's wednesday."
 def analyze_argument(argument):
     instruction = 'Based on the following argument, identify the following elements: premises, conclusion, propositions, type of argument, negation of propositions and validity.'
     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
@@ -17,19 +28,17 @@ def analyze_argument(argument):
 
     ### Response:"""
     prompt = alpaca_prompt.format(instruction, argument)
-    input_ids = tokenizer(prompt, return_tensors="pt")
 
-    outputs = model.generate(**input_ids, max_length=1000, num_return_sequences=1)
-    output = tokenizer.decode(outputs[0])
-    # remove prompt from output
-    output = output.split("### Response:")[1].strip().split("<|endoftext|>")[0]
+    output = llm(prompt, max_tokens=1000)['choices'][0]['text'].strip()
     return output
 
 description = """This tool analyzes simple arguments, that is, arguments composed of at most two propositions.
 
 It applies the fine-tuned LLM from https://huggingface.co/cris177/Qwen2-Simple-Arguments
 
-It requires only 6 GB of RAM, and runs on just 2 vCPUs (which causes it to run somewhat slowly in this demo).
+For faster inference we use the 4-bit quantized model https://huggingface.co/cris177/Qwen2-Simple-Arguments/resolve/main/Qwen2_arguments.Q4_K_M.gguf.
+
+It requires only 3 GB of RAM and runs on just 2 vCPUs (which causes it to run somewhat slowly in this demo).
 """
 
 gr.Interface(analyze_argument, inputs="text", outputs="text",
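
For readers who want to try the new llama-cpp-python path outside the Space, here is a minimal standalone sketch. It assumes llama-cpp-python and huggingface_hub are installed; the full Alpaca template (the ### Instruction: / ### Input: headers) and the stop argument are assumptions, since the diff shows only the template's opening line and its ### Response: tail. The sample argument is the one removed from the old app.py.

    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    # Download the same 4-bit GGUF quant the Space uses
    model_path = hf_hub_download(
        "cris177/Qwen2-Simple-Arguments",
        filename="Qwen2_arguments.Q4_K_M.gguf",
    )

    # CPU-only instantiation, mirroring the Space's 2-vCPU configuration
    llm = Llama(model_path=model_path, n_ctx=2000, n_threads=2, n_gpu_layers=0)

    # Standard Alpaca template (assumed); the diff shows only its first and last lines
    prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    Based on the following argument, identify the following elements: premises, conclusion, propositions, type of argument, negation of propositions and validity.

    ### Input:
    If it's wednesday it's cold, and it's cold, therefore it's wednesday.

    ### Response:"""

    # max_tokens bounds the completion; stop is an assumed guard against run-on output
    result = llm(prompt, max_tokens=1000, stop=["### Instruction:"])
    print(result["choices"][0]["text"].strip())

Setting n_gpu_layers=0 keeps inference entirely on the CPU, matching the 2-vCPU hardware mentioned in the description, and the Q4_K_M quantization is what brings peak memory down to roughly the 3 GB quoted there.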