BryanBradfo committed on
Commit d198e0d · 1 Parent(s): 9b002fb

generate output as it comes

Files changed (1)
  1. app.py +68 -36
app.py CHANGED

@@ -19,7 +19,7 @@ st.title("✨ GemmaTextAppeal")
 st.markdown("""
 ### Interactive Demo of Google's Gemma 2-2B-IT Model
 This app demonstrates the text generation capabilities of Google's Gemma 2-2B-IT model.
-Enter a prompt below and see the model generate text!
+Enter a prompt below and see the model generate text in real-time!
 """)
 
 # Function to load model
@@ -141,7 +141,7 @@ user_input = st.text_area("Enter your prompt:",
                           height=100,
                           placeholder="e.g., Write a short story about a robot discovering emotions")
 
-def generate_text(prompt, max_new_tokens=300, temperature=0.7):
+def generate_text_streaming(prompt, max_new_tokens=300, temperature=0.7):
     if not tokenizer or not model:
         st.session_state.error_message = "Model not properly loaded. Please check your Hugging Face token."
         return None
@@ -150,35 +150,71 @@ def generate_text(prompt, max_new_tokens=300, temperature=0.7):
     # Format the prompt according to Gemma's expected format
     formatted_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
 
-    # Create the status indicator and output area
-    status_text = st.empty()
-    output_area = st.empty()
-    status_text.text("Generating response...")
+    # Create the output area
+    output_container = st.empty()
+    response_area = st.container()
+
+    with response_area:
+        st.markdown("**Generated Response:**")
+        response_text = st.empty()
 
     # Tokenize the input
-    with torch.no_grad():
-        encoding = tokenizer(formatted_prompt, return_tensors="pt")
-
-        # Move to the appropriate device
-        if torch.cuda.is_available():
-            encoding = {k: v.to("cuda") for k, v in encoding.items()}
-
-        # Generate the text - streamlined version
-        output_ids = model.generate(
-            **encoding,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            pad_token_id=tokenizer.eos_token_id
-        )
-
-        # Get only the generated part (exclude the prompt)
-        new_tokens = output_ids[0][encoding["input_ids"].shape[1]:]
-        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
-
-        # Display the result
-        output_area.markdown(f"**Generated Response:**\n\n{generated_text}")
-        status_text.text("Generation complete!")
+    encoding = tokenizer(formatted_prompt, return_tensors="pt")
+
+    # Move to the appropriate device
+    if torch.cuda.is_available():
+        encoding = {k: v.to("cuda") for k, v in encoding.items()}
+
+    # Store the length of the input to track new tokens
+    input_length = encoding["input_ids"].shape[1]
+
+    # Initialize generated text container
+    generated_text = ""
+
+    # Generate tokens with streaming
+    generated_ids = []
+
+    # Set up generation configuration
+    for _ in range(max_new_tokens):
+        with torch.no_grad():
+            if len(generated_ids) == 0:
+                # First token generation
+                outputs = model.generate(
+                    **encoding,
+                    max_new_tokens=1,
+                    do_sample=True,
+                    temperature=temperature,
+                    pad_token_id=tokenizer.eos_token_id,
+                    return_dict_in_generate=True,
+                    output_scores=False
+                )
+                next_token_id = outputs.sequences[0, input_length:input_length+1]
+            else:
+                # Subsequent tokens
+                current_input_ids = torch.cat([encoding["input_ids"], torch.tensor([generated_ids], device=encoding["input_ids"].device)], dim=1)
+                outputs = model.generate(
+                    input_ids=current_input_ids,
+                    max_new_tokens=1,
+                    do_sample=True,
+                    temperature=temperature,
+                    pad_token_id=tokenizer.eos_token_id,
+                    return_dict_in_generate=True,
+                    output_scores=False
+                )
+                next_token_id = outputs.sequences[0, -1].unsqueeze(0)
+
+        # Convert to Python list and append
+        next_token_id_list = next_token_id.tolist()
+        generated_ids.extend(next_token_id_list)
+
+        # Check for EOS token
+        if tokenizer.eos_token_id in next_token_id_list:
+            break
+
+        # Decode the tokens generated so far and update the displayed text
+        current_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
+        generated_text = current_text
+        response_text.markdown(generated_text)
 
     return generated_text
 
@@ -232,19 +268,15 @@ if st.button("Generate Text"):
         st.error("Hugging Face token is required! Please add your token as described above.")
     elif user_input:
         st.session_state.user_prompt = user_input
-        with st.spinner("Generating text..."):
-            result = generate_text(user_input, max_length, temperature)
-            if result is not None:  # Only set if no error occurred
-                st.session_state.generated_text = result
-                st.session_state.generation_complete = True
+        result = generate_text_streaming(user_input, max_length, temperature)
+        if result is not None:  # Only set if no error occurred
+            st.session_state.generated_text = result
+            st.session_state.generation_complete = True
     else:
         st.error("Please enter a prompt first!")
 
-# Display results
-if st.session_state.generation_complete and not st.session_state.error_message:
-    st.markdown("### Generated Text")
-    st.markdown(st.session_state.generated_text)
-
+# Analysis section (only show after generation is complete)
+if st.session_state.generation_complete and not st.session_state.error_message and st.session_state.generated_text:
     # Analysis section
     with st.expander("Text Analysis"):
         col1, col2 = st.columns(2)
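
A note on the approach: the loop added above streams output by calling model.generate once per token, re-feeding the full prompt plus every previously generated id on each iteration, so the whole prefix is re-encoded for every new token. transformers also ships TextIteratorStreamer for exactly this "output as it comes" pattern: a single generate call runs in a background thread while the main Streamlit thread drains decoded text as it becomes available. The snippet below is a sketch of what generate_text_streaming could look like with that helper; it is not part of this commit, and it assumes the same module-level tokenizer and model objects and the same Gemma prompt format used in the diff.

```python
# Sketch only - an alternative streaming implementation, not the code in this
# commit. Assumes `tokenizer` and `model` are the app's already-loaded Gemma
# tokenizer/model and that Streamlit is imported as `st`.
from threading import Thread

import streamlit as st
import torch
from transformers import TextIteratorStreamer


def generate_text_streaming(prompt, max_new_tokens=300, temperature=0.7):
    # Same Gemma chat format as the commit uses.
    formatted_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

    encoding = tokenizer(formatted_prompt, return_tensors="pt")
    if torch.cuda.is_available():
        encoding = {k: v.to("cuda") for k, v in encoding.items()}

    # The streamer decodes tokens as generate() emits them; skip_prompt hides the
    # input prompt and skip_special_tokens drops markers such as <end_of_turn>.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **encoding,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # generate() blocks until it finishes, so run it in a worker thread and
    # update the UI from the main thread as text arrives.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    st.markdown("**Generated Response:**")
    response_text = st.empty()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        response_text.markdown(generated_text)

    thread.join()
    return generated_text
```

Because everything happens inside one generate call, the model's key/value cache is reused from token to token, which should make long completions considerably cheaper than re-running generation for every token as the committed loop does.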