# ################################
# Model: Whisper + TLTR + Audio_Proj + LLaMa3
# Authors: Yingzhi Wang 2024
# ################################
# NOTE(review): the `!ref <...>` targets below were reconstructed — the
# angle-bracketed arguments were stripped in this copy of the file. Each
# target was matched by name to a key defined earlier in this file;
# verify against the original recipe before training.

# URL for the LLAMA3 model and its save folder
llama_hub: meta-llama/Meta-Llama-3-8B-Instruct # lmsys/vicuna-7b-v1.5
llama3_folder: llama3_checkpoint

# llama generation config
num_beams: 3
max_new_tokens: 400
top_k: 500
top_p: 0.95
temperature: 0.1
repetition_penalty: 1.1

# lora config
lora_dropout: 0.05
lora_alpha: 16
r: 8
bias: "none"
task_type: "CAUSAL_LM"
lora_target_modules: ["q_proj", "v_proj"]

# URL for whisper model.
whisper_hub: openai/whisper-large
whisper_folder: whisper_checkpoint
freeze_whisper: True
whisper_output_dim: 1280

# average pooling
pooling_kernel: 20

# Audio Tagging model
tltr_layers: 32
llama_hidden_size: 4096

# Masks
audio_padding_mask: !name:speechbrain.dataio.dataio.length_to_mask
text_padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask

# Frozen Whisper encoder; all hidden layers are exposed so the TLTR module
# can attend over the per-layer representations.
whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
    source: !ref <whisper_hub>
    freeze: !ref <freeze_whisper>
    save_path: !ref <whisper_folder>
    encoder_only: True
    output_all_hiddens: True

# Average pooling over time to shorten the Whisper feature sequence.
avg_pool: !new:speechbrain.nnet.pooling.Pooling1d
    pool_type: "avg"
    kernel_size: !ref <pooling_kernel>

# Time- and layer-wise transformer over the stacked Whisper hidden states.
tltr: !new:speechbrain.lobes.models.TLTR.AT_MODEL
    n_layer: !ref <tltr_layers>
    rep_dim: !ref <whisper_output_dim>
    freeze: True

# Projects audio features into the LLaMA embedding space.
audio_proj: !new:speechbrain.lobes.models.TLTR.AudioProjection
    input_size: !ref <whisper_output_dim>
    hidden_size: !ref <llama_hidden_size>

#LLAMA3 model
# llama3: null
llama3: !new:speechbrain.lobes.models.huggingface_transformers.llama2.LLAMA2
    source: !ref <llama_hub>
    freeze: True
    save_path: !ref <llama3_folder>
    max_new_tokens: !ref <max_new_tokens>
    num_beams: !ref <num_beams>
    top_k: !ref <top_k>
    top_p: !ref <top_p>
    temperature: !ref <temperature>
    repetition_penalty: !ref <repetition_penalty>
    with_peft: True
    lora_alpha: !ref <lora_alpha>
    lora_dropout: !ref <lora_dropout>
    r: !ref <r>
    bias: !ref <bias>
    task_type: !ref <task_type>
    lora_target_modules: !ref <lora_target_modules>

modules:
    tltr: !ref <tltr>
    audio_proj: !ref <audio_proj>
    llama3: !ref <llama3>

# NOTE(review): the two ModuleList members were stripped with the refs;
# <tltr> and <audio_proj> are the only trainable non-LLaMA modules above,
# matching the Pretrainer's separate `model` loadable — confirm upstream.
model: !new:torch.nn.ModuleList
    - [!ref <tltr>, !ref <audio_proj>]

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        llama3: !ref <llama3>
        model: !ref <model>