Commit 9767b96 by zqhuang
Parent: 7181e0e

Upload UltravoxPipeline
README.md CHANGED

```diff
@@ -1,4 +1,12 @@
 ---
+datasets:
+- fixie-ai/librispeech_asr
+- fixie-ai/common_voice_17_0
+- fixie-ai/peoples_speech
+- fixie-ai/gigaspeech
+- fixie-ai/multilingual_librispeech
+- fixie-ai/wenetspeech
+- fixie-ai/covost2
 language:
 - ar
 - de
@@ -15,16 +23,8 @@ language:
 - tr
 - uk
 - zh
-license: mit
 library_name: transformers
-datasets:
-- fixie-ai/librispeech_asr
-- fixie-ai/common_voice_17_0
-- fixie-ai/peoples_speech
-- fixie-ai/gigaspeech
-- fixie-ai/multilingual_librispeech
-- fixie-ai/wenetspeech
-- fixie-ai/covost2
+license: mit
 metrics:
 - bleu
 ---
```
config.json CHANGED

```diff
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/ultravox-expts/artifacts/model-zhuang.2024-10-09-v0_4_1.stacking-4b.8c44a2e:v8",
+  "_name_or_path": "/Users/zhuang/expts/2024-10-09-v0_4_1/stacking-4b/ultravox/artifacts/model-zhuang.2024-10-09-v0_4_1.stacking-4b.8c44a2e:v8",
   "architectures": [
     "UltravoxModel"
   ],
@@ -28,6 +28,6 @@
   "stack_factor": 8,
   "text_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.46.1",
+  "transformers_version": "4.44.0",
   "vocab_size": 128256
 }
```
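The config registers the custom `UltravoxModel` architecture, whose implementation ships inside this repo rather than in `transformers`, so loading it requires `trust_remote_code=True`. A minimal loading sketch, assuming a hypothetical repo id (substitute the real model path for this checkpoint):

```python
# Minimal loading sketch; "fixie-ai/ultravox-example" is a hypothetical placeholder id.
import torch
import transformers

pipe = transformers.pipeline(
    model="fixie-ai/ultravox-example",  # hypothetical repo id, not taken from this commit
    trust_remote_code=True,             # UltravoxModel is defined by the repo's own code
    torch_dtype=torch.bfloat16,         # matches "torch_dtype": "bfloat16" in the config
)
```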
generation_config.json CHANGED

```diff
@@ -7,5 +7,5 @@
     128009
   ],
   "pad_token_id": 128009,
-  "transformers_version": "4.46.1"
+  "transformers_version": "4.44.0"
 }
```
tokenizer.json CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
-size 17209920
+oid sha256:79e3e522635f3171300913bb421464a87de6222182a0570b9b2ccba2a964b2b4
+size 9085657
```
ultravox_config.py CHANGED

```diff
@@ -19,8 +19,6 @@ class LoraConfigSimplified:
     target_modules: Optional[List[str]] = dataclasses.field(
         default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"]
     )
-    # A list of module names regex patterns to unfreeze. Only used if r == 0.
-    unfreeze_layers: Optional[List[str]] = None


 class LossFunction(str, Enum):
@@ -30,7 +28,7 @@ class LossFunction(str, Enum):

 @dataclasses.dataclass
 class LossConfig:
-    loss_function: LossFunction = LossFunction.CrossEntropy
+    loss_function: LossFunction = LossFunction.KL_Divergence
     kl_temperature: float = 2.0

     @property
```
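The default `loss_function` changes from `CrossEntropy` to `KL_Divergence`, with `kl_temperature` staying at 2.0. As a rough sketch of what a temperature-scaled KL distillation loss of this shape computes (the `student_logits`/`teacher_logits` names are illustrative; the consumer of `LossConfig` is not part of this diff, though ultravox_model.py below notes that its `alt_*` inputs exist for exactly this loss):

```python
# Sketch of a temperature-scaled KL-divergence loss; illustrative, not the repo's exact code.
import torch
import torch.nn.functional as F

def kl_divergence_loss(
    student_logits: torch.Tensor,  # e.g. logits from the speech-conditioned model
    teacher_logits: torch.Tensor,  # e.g. logits from a text-only teacher (the alt_* inputs)
    kl_temperature: float = 2.0,
) -> torch.Tensor:
    # Soften both distributions with the temperature, then match student to teacher.
    return F.kl_div(
        F.log_softmax(student_logits / kl_temperature, dim=-1),
        F.softmax(teacher_logits / kl_temperature, dim=-1),
        reduction="batchmean",
    )
```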
ultravox_model.py CHANGED

```diff
@@ -1,5 +1,4 @@
 import logging
-import re
 from typing import Any, Dict, Optional, Set, Tuple, Union

 import peft
@@ -35,14 +34,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

     config_class = UltravoxConfig
     config: UltravoxConfig  # for type hinting
-    # We minimize the weights in state_dict in order to reduce the size of the checkpoint
-    # The issue is that load_pretrained() uses state_dict() keys to know what keys are expected
-    # As such we have to tell is to ignore some keys that are not always in the model
-    _keys_to_ignore_on_load_unexpected = ["audio_tower.*", "language_model.*"]
-    # Usually we load encoder weights from a pretrained model, so we don't want to load the decoder weights
-    # Technically we never hit this issue because these keys are already removed from state_dict() however,
-    # but there's no harm in keeping it here for when we change that behavior.
-    _keys_to_ignore_on_load_missing = ["audio_tower.*"]
+    # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing
+    _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"]

     def __init__(self, config: UltravoxConfig):
         super().__init__(config)
@@ -155,6 +148,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
+        audio_len: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         # the alt_* fields are needed for KL divergence loss
@@ -196,7 +190,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

         # B x A/3200 x D
         audio_tower_output = self.audio_tower.forward(
-            audio_values.to(self.audio_tower.dtype)
+            audio_values.to(self.audio_tower.dtype),
+            audio_len=audio_len,
         ).last_hidden_state
         audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)

@@ -242,6 +237,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         audio_values: Optional[torch.FloatTensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
+        audio_len: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
@@ -270,6 +266,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 audio_token_start_idx - prefill_start_idx
             )
             model_input["audio_token_len"] = audio_token_len
+            model_input["audio_len"] = audio_len

         return model_input

@@ -373,6 +370,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

     def push_to_hub(self, *args, **kwargs):
         self.merge_and_unload()
+        self.to(self.language_model.dtype)
         return super().push_to_hub(*args, **kwargs)

     def save_pretrained(
@@ -424,7 +422,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         )


-# TODO: refactor common parts to a shared module
 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
 ) -> bool:
```
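The hunks above thread a new per-sample `audio_len` tensor from `forward()` through `prepare_inputs_for_generation()` and into the audio tower, alongside the existing `audio_token_start_idx`/`audio_token_len` bookkeeping. As a rough illustration of how start/length indices of this kind are typically used to splice projected audio embeddings into the text embedding sequence (a sketch only, not the repo's exact code):

```python
# Illustrative sketch of splicing audio embeddings into text embeddings
# using per-sample start indices and lengths; not the repo's exact code.
import torch

def splice_audio_embeds(
    inputs_embeds: torch.Tensor,          # B x T x D text embeddings
    audio_embeds: torch.Tensor,           # B x A x D projected audio embeddings
    audio_token_start_idx: torch.Tensor,  # B, start of each audio placeholder span
    audio_token_len: torch.Tensor,        # B, length of each audio placeholder span
) -> torch.Tensor:
    # Overwrite the placeholder token embeddings with the audio embeddings, in place.
    for i, (start, length) in enumerate(zip(audio_token_start_idx, audio_token_len)):
        inputs_embeds[i, start : start + length] = audio_embeds[i, :length]
    return inputs_embeds
```

The remaining hunks change the LoRA freezing path and the Whisper encoder itself: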
```diff
@@ -442,18 +439,12 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
     """
     Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
     """
-    unfreeze_layers = lora_config.pop("unfreeze_layers", None)
     lora_config = peft.LoraConfig(**lora_config or {})

     if lora_config.r == 0:
-        # freeze the model entirely, except for the specified layers
-        for name, param in model.named_parameters():
-            if not unfreeze_layers or not any(
-                re.match(layer, name) for layer in unfreeze_layers
-            ):
-                param.requires_grad = False
-            else:
-                logging.info(f"Unfreezing layer: {name} with #{param.numel()} params")
+        # freeze the model entirely
+        for param in model.parameters():
+            param.requires_grad = False
     else:
         model = peft.get_peft_model(model, lora_config)

@@ -521,7 +512,7 @@ class UltravoxProjector(nn.Sequential):
         return hidden_states


-class ModifiedWhisperEncoder(whisper.WhisperEncoder):
+class ModifiedWhisperEncoder(whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin):
     """
     Encoder portion of OpenAI's Whisper model.

@@ -540,7 +531,7 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
     def forward(
         self,
         input_features,
-        attention_mask=None,
+        audio_len=None,
         head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -583,6 +574,23 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

+        attention_mask = None
+        if audio_len is not None:
+            audio_feature_len = self._get_feat_extract_output_lengths(audio_len)
+            batch_size = hidden_states.shape[0]
+            max_seq_len = hidden_states.shape[1]
+            attention_mask = (
+                torch.arange(max_seq_len, device=hidden_states.device)[None, :]
+                .expand(batch_size, -1)
+                .lt(audio_feature_len.view(batch_size, 1))
+            )
+            attention_mask = self.get_extended_attention_mask(
+                attention_mask,
+                None,
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
+
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
             assert head_mask.size()[0] == (
@@ -606,14 +614,14 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
                 layer_outputs = self._gradient_checkpointing_func(
                     encoder_layer.__call__,
                     hidden_states,
-                    None,
+                    attention_mask,
                     (head_mask[idx] if head_mask is not None else None),
                     output_attentions,
                 )
             else:
                 layer_outputs = encoder_layer(
                     hidden_states,
-                    None,
+                    attention_mask,
                     layer_head_mask=(
                         head_mask[idx] if head_mask is not None else None
                     ),
```
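`ModifiedWhisperEncoder.forward()` now takes `audio_len` instead of a caller-supplied `attention_mask`, converts raw sample counts to post-convolution feature lengths with `_get_feat_extract_output_lengths`, and builds the padding mask itself; mixing in `transformers.modeling_utils.ModuleUtilsMixin` is what supplies `get_extended_attention_mask`, which turns the boolean mask into the additive float mask the attention layers expect. A standalone sketch of the boolean mask construction used above:

```python
# Standalone sketch of the length-to-padding-mask logic added in the diff above.
import torch

def lengths_to_padding_mask(feature_len: torch.Tensor, max_seq_len: int) -> torch.Tensor:
    # True at valid (non-padded) positions, False at padding.
    batch_size = feature_len.shape[0]
    positions = torch.arange(max_seq_len)[None, :].expand(batch_size, -1)
    return positions.lt(feature_len.view(batch_size, 1))

print(lengths_to_padding_mask(torch.tensor([3, 5]), max_seq_len=6))
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])
```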
 
ultravox_processing.py CHANGED

```diff
@@ -62,7 +62,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)

     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         config: UltravoxConfig = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, **kwargs
         )
@@ -154,12 +154,17 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 sampling_rate=sampling_rate,
                 padding="longest",
                 max_length=audio_len,
+                return_attention_mask=True,
                 **kwargs,
             )
             if "input_features" in x:
                 data["audio_values"] = x.input_features
             else:
                 data["audio_values"] = x.input_values
+            if self.audio_padding == "max_length":
+                data["audio_len"] = x.attention_mask.sum(-1) - 1
+            else:
+                data["audio_len"] = [data["audio_values"].shape[-1]]

         if text is not None:
             assert isinstance(
```
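With this change the processor emits an `audio_len` entry alongside `audio_values`; the model forwards it to the encoder so padded frames are masked out of attention. A hedged end-to-end sketch (the repo id and the `<|audio|>` placeholder convention are assumptions, not taken from this diff):

```python
# Illustrative usage only; the repo id and placeholder token are assumptions.
import numpy as np
from ultravox_processing import UltravoxProcessor

processor = UltravoxProcessor.from_pretrained("fixie-ai/ultravox-example")  # hypothetical id
audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz

inputs = processor(text="<|audio|>", audio=audio, sampling_rate=16000, return_tensors="pt")
print(inputs["audio_values"].shape, inputs["audio_len"])
```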