mrq
promoted ar+nar-tts+stt-llama-8 to replace ar+nar-llama-8 as the base model, since it pretty much surpasses it now
1ea585e
sample_rate: 24_000
audio_backend: "vocos"

models:
- name: "ar+nar-tts+stt"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama
  training: False
  version: 5
  attention: auto
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar"]
  experimental:
    p_rvq_levels: "auto"
    audio_embedding_sums: True
    unified_position_ids: False
    split_classifiers: True
    #
    causal_size: 1
    interleave: False
    rvq_level_range: []
    tie_classifier_to_embedding: False
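# Note on the model block above (reading the settings as given, not an
# authoritative description of the internals): a single full-size llama-arch
# AR+NAR model with 8 RVQ levels for both prompt and response codes, 9 task
# types, 4 languages, and one tone. loss_factors stays commented out, so the
# default loss weighting presumably applies.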
#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []
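# The commented-out loras block above is a template for attaching a LoRA
# (rank/alpha 128) instead of finetuning the full weights; it is disabled in
# this base config.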
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True
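# Hyperparameter notes: the effective batch size is 32 * 8 = 256 samples per
# optimizer step. Prodigy adapts its own step size, so learning_rate is left
# at the conventional 1.0; the empty scheduler string presumably disables any
# LR schedule (the ScheduleFree comment marks the alternative).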
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
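# Evaluation notes: runs every 250 training steps on 4 samples; the AR pass
# samples at temperature 1.0 while the NAR pass uses temperature 0.0, which
# presumably amounts to greedy (argmax) decoding.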
trainer:
  iterations: 1_000_000

  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True

  check_for_oom: False
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False

  load_webui: False
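# Trainer notes: checkpoints every 250 steps keeping the last 4, trains in
# bfloat16 with AMP and gradient checkpointing, and delegates the training
# loop to the DeepSpeed backend.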
inference:
  backend: local
  normalize: False

  weight_dtype: bfloat16
  amp: True
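# Inference mirrors training precision: local backend, bfloat16 weights with AMP.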
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
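# Optimization notes (best-effort reading of the flags): these appear to
# toggle optional module/optimizer replacements; the bitsandbytes,
# dadaptation, bitnet, and fp8 paths are all disabled here.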
dataset:
  use_hdf5: True
  hdf5_flag: r

  use_metadata: True
  validate: True

  workers: 1
  cache: True

  duration_range: [3.0, 12.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: [ "tts", "stt" ]

  training: []
  validation: []
  noise: []
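As a quick sanity check, the YAML above can be loaded with plain PyYAML to confirm the derived numbers (effective batch size, model shape) before launching a run. This is only an illustrative sketch, not the project's own config loader: the ./training/config.yaml path and the standalone script are assumptions.

# sanity_check_config.py -- illustrative sketch, not the project's loader.
# Assumes the YAML above was saved to ./training/config.yaml (hypothetical path).
import yaml  # pip install pyyaml

with open("./training/config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

model = cfg["models"][0]
hp = cfg["hyperparameters"]

# PyYAML (YAML 1.1) parses underscored integers such as 24_000 and 1_000_000.
print(f"model:             {model['name']} ({model['arch_type']}, {model['size']})")
print(f"RVQ levels:        {model['resp_levels']} resp / {model['prom_levels']} prom")
print(f"tasks/langs/tones: {model['tasks']}/{model['langs']}/{model['tones']}")

# Effective batch size per optimizer step: 32 * 8 = 256 samples.
effective_batch = hp["batch_size"] * hp["gradient_accumulation_steps"]
print(f"effective batch:   {effective_batch} samples/step")

# Prodigy is normally run with learning_rate = 1.0; flag anything else.
if hp["optimizer"].lower() == "prodigy" and hp["learning_rate"] != 1.0:
    print("warning: Prodigy normally expects learning_rate = 1.0")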