Spaces:

DuckyBlender
/

phi3-youtube-summarizer

Paused

DuckyBlender committed on Aug 14, 2024

Commit

a755228

1 Parent(s): 1bf6bb5

fixed cpu inference?

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,27 +11,34 @@ import torch
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
     import subprocess
     subprocess.run(
         "pip install flash_attn --no-build-isolation --break-system-packages",
         env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
         shell=True,
     )
 else:
     device = torch.device("cpu")
     print("Using CPU")
 # Uncomment and set your Hugging Face token if needed
 token = os.environ["HF_TOKEN"]
-# Configure 4-bit quantization for model loading
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-)
 # Load the Phi-3 model and tokenizer
 print("Loading model and tokenizer...")

 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+    # Install the Flash Attention library
+    print("Installing Flash Attention library...")
     import subprocess
     subprocess.run(
         "pip install flash_attn --no-build-isolation --break-system-packages",
         env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
         shell=True,
     )
+    print("Flash Attention library installed")
+    # Configure 4-bit quantization for model loading
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+    )
 else:
     device = torch.device("cpu")
+    bnb_config = None
     print("Using CPU")
 # Uncomment and set your Hugging Face token if needed
 token = os.environ["HF_TOKEN"]
 # Load the Phi-3 model and tokenizer
 print("Loading model and tokenizer...")