htigenai committed · Commit 10d28e9 · verified · 1 Parent(s): eaa9d4d

Update app.py

Files changed (1)
  1. app.py +32 -32
app.py CHANGED
@@ -26,7 +26,7 @@ def log_system_info():
     """Log system information for debugging"""
     logger.info(f"Python version: {sys.version}")
     logger.info(f"PyTorch version: {torch.__version__}")
-    logger.info(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
+    logger.info(f"Device: CPU")
 
 print("Starting application...")
 log_system_info()
@@ -35,53 +35,55 @@ try:
     print("Loading model and tokenizer...")
     model_id = "htigenai/finetune_test_2_4bit"
     base_model_id = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Original base model
-
+
     with timer("Loading tokenizer"):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+            tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
             tokenizer.pad_token = tokenizer.eos_token
             tokenizer.padding_side = "right"
         except Exception as e:
             logger.error(f"Error loading tokenizer: {str(e)}")
             raise
     logger.info("Tokenizer loaded successfully")
-
-    # Configure quantization - using 4-bit since the base model was 4-bit
+
+    # Adjust quantization config for CPU
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for better CPU support
         bnb_4bit_use_double_quant=True,
     )
-
+
     with timer("Loading model"):
         model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
-           device_map="auto",
+           device_map={"": "cpu"}, # Explicitly set to CPU
            trust_remote_code=True,
         )
         model.eval()
         logger.info("Model loaded successfully")
 
-    def generate_text(prompt, max_tokens=200, temperature=0.7):
+    def generate_text(prompt, max_tokens=100, temperature=0.7):
         """Generate text based on the input prompt."""
         try:
             logger.info(f"Starting generation for prompt: {prompt[:50]}...")
-
+
             with timer("Tokenization"):
                 inputs = tokenizer(
-                    prompt,
+                    prompt,
                     return_tensors="pt",
                     padding=True,
                     truncation=True,
                     max_length=256
-                ).to(model.device)
-
+                )
+                inputs = inputs.to("cpu") # Ensure inputs are on CPU
+
             with timer("Generation"):
-                with torch.inference_mode():
+                with torch.no_grad():
                     outputs = model.generate(
-                        **inputs,
+                        input_ids=inputs["input_ids"],
+                        attention_mask=inputs["attention_mask"],
                         max_new_tokens=max_tokens,
                         temperature=temperature,
                         top_p=0.95,
@@ -90,20 +92,18 @@ try:
                         eos_token_id=tokenizer.eos_token_id,
                         repetition_penalty=1.1,
                     )
-
+
             with timer("Decoding"):
                 generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
+
             logger.info("Text generation completed successfully")
-
+
             # Clean up
             with timer("Cleanup"):
                 gc.collect()
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
+
             return generated_text
-
+
         except Exception as e:
             logger.error(f"Error during generation: {str(e)}")
             return f"Error during generation: {str(e)}"
@@ -113,14 +113,14 @@ try:
         fn=generate_text,
         inputs=[
             gr.Textbox(
-                lines=3,
+                lines=3,
                 placeholder="Enter your prompt here...",
                 label="Input Prompt"
             ),
             gr.Slider(
-                minimum=50,
-                maximum=200,
-                value=100,
+                minimum=20,
+                maximum=100,
+                value=50,
                 step=10,
                 label="Max Tokens"
             ),
@@ -134,20 +134,20 @@ try:
         ],
         outputs=gr.Textbox(
             label="Generated Response",
-            lines=5
+            lines=10
         ),
         title="HTIGENAI Reflection Analyzer - Test",
         description="Enter a prompt to generate text. This model is fine-tuned from Llama 3.1 8B Instruct.",
         examples=[
-            ["What are your thoughts about cats?", 100, 0.7],
-            ["Write a short story about a magical forest", 150, 0.8],
-            ["Explain quantum computing to a 5-year-old", 75, 0.5],
+            ["What are your thoughts about cats?", 50, 0.7],
+            ["Write a short story about a magical forest", 60, 0.8],
+            ["Explain quantum computing to a 5-year-old", 40, 0.5],
         ]
     )
 
     # Launch the interface
-    iface.launch(server_name="0.0.0.0")
+    iface.launch()
 
 except Exception as e:
     logger.error(f"Application startup failed: {str(e)}")
-    raise
+    raise
 
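The script times every stage with a timer(...) context manager that is referenced throughout the diff but not defined in it. For reference only, a minimal sketch of what such a helper could look like; the name, signature, and log format here are assumptions, not the repository's actual implementation:

import time
import logging
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def timer(name):
    # Log how long the wrapped block takes; purely illustrative.
    start = time.perf_counter()
    try:
        yield
    finally:
        logger.info(f"{name} took {time.perf_counter() - start:.2f} seconds")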
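Before wiring generate_text into the Gradio interface, the CPU generation path can be exercised once from a Python shell. A usage sketch, assuming the model and tokenizer have already been loaded as above; the prompt and values simply mirror the first Gradio example in the diff:

# Hypothetical smoke test for the updated generate_text function.
reply = generate_text("What are your thoughts about cats?", max_tokens=50, temperature=0.7)
print(reply)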