Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -13,8 +13,7 @@ from collections.abc import Iterator
 import csv
 
 # Increase CSV field size limit
-csv.field_size_limit(1000000)
-
+csv.field_size_limit(1000000)
 
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
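The raised limit matters because Python's csv module rejects any field longer than the current cap (131072 characters by default) with "_csv.Error: field larger than field limit". A minimal sketch of the failure mode and the fix; the file name here is illustrative, not from the commit:

import csv

csv.field_size_limit(1_000_000)  # raise the cap before parsing rows with very long free-text fields

# hypothetical CSV whose description column can exceed the default 131072-character limit
with open("issues.csv", newline="", encoding="utf-8") as f:
    for row in csv.reader(f):
        pass  # fields up to 1,000,000 characters now parse without raising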
@@ -63,14 +62,14 @@ quality_mapping = {
 
 # Pre-load models and tokenizer for quality prediction
 tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
-models = {path: AutoModelForSequenceClassification.from_pretrained(path) for path in model_paths}
+models = {path: AutoModelForSequenceClassification.from_pretrained(path) for path in model_paths}  # Load to CPU initially
 
 def get_quality_name(model_name):
     return quality_mapping.get(model_name.split('/')[-1], "Unknown Quality")
 
-
+
 def model_prediction(model, text, device):
-    model.to(device)
+    model.to(device)  # Move the *specific* model to the GPU
     model.eval()
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
     inputs = {k: v.to(device) for k, v in inputs.items()}
@@ -79,30 +78,26 @@ def model_prediction(model, text, device):
     logits = outputs.logits
     probs = softmax(logits.cpu().numpy(), axis=1)
     avg_prob = np.mean(probs[:, 1])
+    model.to("cpu")  # Move the model *back* to the CPU
     return avg_prob
 
 # --- Llama 3.2 3B Model Setup ---
 LLAMA_MAX_MAX_NEW_TOKENS = 2048
-LLAMA_DEFAULT_MAX_NEW_TOKENS =
-LLAMA_MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "
-llama_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #
+LLAMA_DEFAULT_MAX_NEW_TOKENS = 512  # Reduced for efficiency
+LLAMA_MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "2048"))  # Reduced
+llama_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # Explicit device
 llama_model_id = "meta-llama/Llama-3.2-3B-Instruct"
 llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_id)
 llama_model = AutoModelForCausalLM.from_pretrained(
     llama_model_id,
-    device_map="auto", #
+    device_map="auto",  # Let Transformers handle optimal device placement
     torch_dtype=torch.bfloat16,
 )
 llama_model.eval()
 
-# --- IMPORTANT: Set Pad Token ---
-# Llama3 does *not* have a default pad token. We *must* set one.
-# Using the EOS token as the PAD token is a common and recommended practice.
 if llama_tokenizer.pad_token is None:
     llama_tokenizer.pad_token = llama_tokenizer.eos_token
 
-
-@spaces.GPU(duration=150)
 def llama_generate(
     message: str,
     max_new_tokens: int = LLAMA_DEFAULT_MAX_NEW_TOKENS,
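The comments in this hunk describe the memory pattern the commit adopts for the classifiers: every model is loaded on the CPU at startup, moved to the GPU only for the duration of one prediction, and moved back afterwards so several fine-tuned checkpoints can share a single device. A standalone sketch of that pattern, using "distilroberta-base" as a stand-in for the Space's actual fine-tuned quality models:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained("distilroberta-base")
# placeholder checkpoints; the real Space loads one fine-tuned model per quality attribute
models = {
    "quality_a": AutoModelForSequenceClassification.from_pretrained("distilroberta-base"),
    "quality_b": AutoModelForSequenceClassification.from_pretrained("distilroberta-base"),
}  # all weights stay on the CPU until they are needed

def predict(model, text):
    model.to(device)   # borrow the accelerator for this one model only
    model.eval()
    enc = tok(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(**enc).logits, dim=-1)
    model.to("cpu")    # give the memory back before the next model runs
    return probs[0, 1].item()

for name, m in models.items():
    print(name, predict(m, "The login page must resist brute-force attacks."))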
@@ -113,7 +108,6 @@ def llama_generate(
 ) -> Iterator[str]:
 
     inputs = llama_tokenizer(message, return_tensors="pt", padding=True, truncation=True, max_length=LLAMA_MAX_INPUT_TOKEN_LENGTH).to(llama_model.device)
-    #The line above was changed to add attention mask
 
     if inputs.input_ids.shape[1] > LLAMA_MAX_INPUT_TOKEN_LENGTH:
         inputs.input_ids = inputs.input_ids[:, -LLAMA_MAX_INPUT_TOKEN_LENGTH:]
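Note that the hunk trims inputs.input_ids on its own, while the tokenizer call above also produced an attention_mask of the same length. A small helper sketch, not part of the commit, showing how both tensors can be clipped to the token budget together:

import torch

LLAMA_MAX_INPUT_TOKEN_LENGTH = 2048  # same budget as in the code above

def clip_to_budget(input_ids: torch.Tensor, attention_mask: torch.Tensor):
    # keep only the most recent tokens, trimming both tensors in lockstep
    if input_ids.shape[1] > LLAMA_MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -LLAMA_MAX_INPUT_TOKEN_LENGTH:]
        attention_mask = attention_mask[:, -LLAMA_MAX_INPUT_TOKEN_LENGTH:]
    return input_ids, attention_mask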
@@ -121,7 +115,7 @@ def llama_generate(
 
     streamer = TextIteratorStreamer(llama_tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        inputs,
+        inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
@@ -137,7 +131,7 @@ def llama_generate(
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
-
+    torch.cuda.empty_cache()  # Clear cache after each generation
 
 
 def generate_explanation(issue_text, top_qualities):
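The `for text in streamer:` loop above only works together with a background thread that runs generate() and feeds the streamer; that part of the function sits outside this diff's context lines. A condensed sketch of the usual TextIteratorStreamer pattern the function follows (the model id is the one used in the commit, but it is gated; any causal LM works for the pattern):

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

def stream_reply(prompt: str, max_new_tokens: int = 512):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True)
    Thread(target=model.generate, kwargs=kwargs).start()  # generate() runs in the background
    chunks = []
    for text in streamer:        # decoded text arrives as it is produced
        chunks.append(text)
        yield "".join(chunks)    # accumulate, as the diff's loop does
    torch.cuda.empty_cache()     # mirror the commit's post-generation cleanup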
@@ -156,14 +150,14 @@ def generate_explanation(issue_text, top_qualities):
     explanation = ""
     try:
         for chunk in llama_generate(prompt):
-            explanation += chunk
+            explanation += chunk
     except Exception as e:
         logging.error(f"Error during Llama generation: {e}")
         return "An error occurred while generating the explanation."
 
     return explanation
 
-
+@spaces.GPU(duration=180)  # Apply the GPU decorator *only* to the main interface
 def main_interface(text):
     if not text.strip():
         return "<div style='color: red;'>No text provided. Please enter a valid issue description.</div>", "", ""
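Moving `@spaces.GPU` from `llama_generate` onto `main_interface` follows the ZeroGPU convention of decorating the single entry point that Gradio calls, so a GPU is attached once per request instead of per helper. A schematic sketch of that arrangement; the duration value matches the commit, the body is illustrative:

import spaces

@spaces.GPU(duration=180)   # ZeroGPU attaches a GPU only while this call is running
def main_interface(text: str):
    # every CUDA-using helper invoked from here (the classifiers, llama_generate)
    # shares the same temporary GPU allocation
    ...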
@@ -171,25 +165,24 @@ def main_interface(text):
     if len(text) < 30:
         return "<div style='color: red;'>Text is less than 30 characters.</div>", "", ""
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cuda" if torch.cuda.is_available() else "cpu"  # Keep this for model_prediction
     results = []
     for model_path, model in models.items():
         quality_name = get_quality_name(model_path)
-        avg_prob = model_prediction(model, text, device)
+        avg_prob = model_prediction(model, text, device)  # Pass the device
         if avg_prob >= 0.95:
             results.append((quality_name, avg_prob))
         logging.info(f"Model: {model_path}, Quality: {quality_name}, Average Probability: {avg_prob:.3f}")
 
+
     if not results:
         return "<div style='color: red;'>No recommendation. Prediction probability is below the threshold. </div>", "", ""
 
     top_qualities = sorted(results, key=lambda x: x[1], reverse=True)[:3]
     output_html = render_html_output(top_qualities)
-
-    # Generate explanation using the top qualities and the original input text
     explanation = generate_explanation(text, top_qualities)
 
-    return output_html, "", explanation
+    return output_html, "", explanation
 
 def render_html_output(top_qualities):
     styles = """
@@ -244,7 +237,7 @@ interface = gr.Interface(
     outputs=[
         gr.HTML(label="Prediction Output"),
         gr.Textbox(label="Predictions", visible=False),
-        gr.Textbox(label="Explanation", lines=5)
+        gr.Textbox(label="Explanation", lines=5)
     ],
     title="QualityTagger",
     description="This tool classifies text into different quality domains such as Security, Usability, etc., and provides explanations.",
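For reference, the three outputs declared here line up positionally with the triple returned by `main_interface` (HTML result, hidden predictions textbox, explanation). A stripped-down, runnable sketch of that wiring; the input textbox and the stub body are assumptions, since the inputs list and function internals are outside this hunk:

import gradio as gr

def main_interface(text: str):
    # stub standing in for the real classifier + explanation pipeline
    return f"<b>{len(text)} characters received</b>", "", "explanation placeholder"

interface = gr.Interface(
    fn=main_interface,                                       # returns (html, predictions, explanation)
    inputs=gr.Textbox(lines=7, label="Issue description"),   # assumed input, not shown in the diff
    outputs=[
        gr.HTML(label="Prediction Output"),
        gr.Textbox(label="Predictions", visible=False),
        gr.Textbox(label="Explanation", lines=5),
    ],
    title="QualityTagger",
)
interface.launch()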