Leri777 committed on
Commit
7ecb022
1 Parent(s): 7096a95

Update app.py

Files changed (1)
  1. app.py +118 -124
app.py CHANGED
@@ -1,140 +1,134 @@
  import os
- import logging
- import time
- import random
- from logging.handlers import RotatingFileHandler
  import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, GemmaTokenizerFast, pipeline
- from langchain_huggingface import HuggingFacePipeline
- from langchain.prompts import PromptTemplate
- from langchain.chains import LLMChain

- # Logging setup
- log_file = '/tmp/app_debug.log'
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.DEBUG)
- file_handler = RotatingFileHandler(log_file, maxBytes=10*1024*1024, backupCount=5)
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
- logger.addHandler(file_handler)

- logger.debug("Application started")

  model_id = "google/gemma-2-9b-it"
  tokenizer = GemmaTokenizerFast.from_pretrained(model_id)

- # Function to load model with GPU availability check
- def load_model():
-     max_attempts = 5
-     attempts = 0
-     while attempts < max_attempts:
-         if torch.cuda.is_available():
-             logger.debug("GPU is available. Proceeding with GPU setup.")
-             try:
-                 return AutoModelForCausalLM.from_pretrained(
-                     model_id,
-                     device_map="auto",
-                     torch_dtype=torch.bfloat16,
-                 )
-             except Exception as e:
-                 logger.error(f"Error initializing model with GPU: {e}. Retrying...")
-                 attempts += 1
-                 time.sleep(random.uniform(20, 60)) # Wait before retrying
-         else:
-             logger.warning("GPU is not available. Retrying GPU initialization...")
-             attempts += 1
-             time.sleep(random.uniform(20, 60))
-
-     # If GPU is still not available, fall back to CPU
-     logger.warning("Falling back to CPU setup after multiple attempts.")
-     return AutoModelForCausalLM.from_pretrained(
-         model_id,
-         device_map="auto",
-         low_cpu_mem_usage=True,
-         token=os.getenv('HF_TOKEN'),
-     )

- # Retry logic to load model with random delay
- model = None
- while model is None:
-     try:
-         model = load_model()
-         model.eval()
-     except Exception as e:
-         retry_delay = random.uniform(30, 60) # Increased delay between retries
-         logger.error(f"Failed to load model: {e}. Retrying in {retry_delay:.2f} seconds...")
-         time.sleep(retry_delay)
-
- # Create Hugging Face pipeline
- pipe = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_length=2048,
-     temperature=0.7,
-     top_k=50,
-     top_p=0.9,
-     repetition_penalty=1.2,
- )

- # Initialize HuggingFacePipeline model for LangChain
- chat_model = HuggingFacePipeline(pipeline=pipe)
-
- # Define the conversation template for LangChain
- template = """<|im_start|>system
- {system_prompt}
- <|im_end|>
- {history}
- <|im_start|>user
- {human_input}
- <|im_end|>
- <|im_start|>assistant"""
-
- # Create LangChain prompt and chain
- prompt = PromptTemplate(
-     template=template, input_variables=["system_prompt", "history", "human_input"]
- )
- chain = prompt | chat_model

- # Prediction function using LangChain and model
- def predict(message, chat_history=[]):
-     formatted_history = "\n".join(
-         [f"<|im_start|>{entry['role']}\n{entry['content']}<|im_end|>" for entry in chat_history]
      )
-     system_prompt = "You are a helpful coding assistant."
-
-     try:
-         result = chain.run({
-             "system_prompt": system_prompt,
-             "history": formatted_history,
-             "human_input": message
-         })
-         return result
-     except Exception as e:
-         logger.exception(f"Error during prediction: {e}")
-         return "An error occurred."
-
- # Gradio UI
- interface = gr.Interface(
-     fn=predict,
-     inputs=[
-         gr.Textbox(label="User input")
      ],
-     outputs="text", allow_flagging='never',
-     live=True,
  )

- # Retry logic to launch interface with random delay
- max_retries = 5
- retry_count = 0
- while retry_count < max_retries:
-     try:
-         interface.launch()
-         break
-     except Exception as e:
-         retry_delay = random.uniform(60, 120) # Increased delay between retries
-         logger.error(f"Failed to launch interface: {e}. Retrying in {retry_delay:.2f} seconds...")
-         retry_count += 1
-         time.sleep(retry_delay)
-
- logger.debug("Chat interface initialized and launched")
 
  import os
+ from threading import Thread
+ from typing import Iterator
+
  import gradio as gr
+ import spaces
  import torch
+ from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
+
+ DESCRIPTION = """\
+ # Gemma 2 9B IT
+
+ Gemma 2 is Google's latest iteration of open LLMs.
+ This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
+ For more details, please check [our post](https://huggingface.co/blog/gemma2).
+
+ 👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it).
+ """

+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  model_id = "google/gemma-2-9b-it"
  tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+ )
+ model.config.sliding_window = 4096
+ model.eval()


+ @spaces.GPU(duration=90)
+ def generate(
+     message: str,
+     chat_history: list[dict],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ) -> Iterator[str]:
+     conversation = chat_history.copy()
+     conversation.append({"role": "user", "content": message})

+     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+     input_ids = input_ids.to(model.device)

+     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         {"input_ids": input_ids},
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
+         num_beams=1,
+         repetition_penalty=repetition_penalty,
      )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
+
+
+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=DEFAULT_MAX_NEW_TOKENS,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.6,
+         ),
+         gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+         ),
+     ],
+     stop_btn=None,
+     examples=[
+         ["Hello there! How are you doing?"],
+         ["Can you explain briefly to me what is the Python programming language?"],
+         ["Explain the plot of Cinderella in a sentence."],
+         ["How many hours does it take a man to eat a Helicopter?"],
+         ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
      ],
+     cache_examples=False,
+     type="messages",
  )

+ with gr.Blocks(css="style.css", fill_height=True) as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+     chat_interface.render()
+
+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch()