Update README.md

bugfix generation with speaker

README.md (CHANGED)
```diff
@@ -155,42 +155,24 @@ whisper_turbo_pipe = pipeline(
     device="cuda",
 )
 
-def extract_speech_ids(speech_tokens_str_list):
-    speech_ids = []
-    for token_str in speech_tokens_str_list:
-        if token_str.startswith("<|s_") and token_str.endswith("|>"):
-            num_str = token_str[4:-2]
-            try:
-                speech_ids.append(int(num_str))
-            except ValueError:
-                print("Error converting token:", token_str)
-        else:
-            print(f"Unexpected token: {token_str}")
-    return speech_ids
-
+def ids_to_speech_tokens(speech_ids):
+    speech_tokens_str = []
+    for speech_id in speech_ids:
+        speech_tokens_str.append(f"<|s_{speech_id}|>")
+    return speech_tokens_str
 
 waveform, sample_rate = torchaudio.load(sample_audio_path)
 
 max_secs = 15
-if waveform
-    print("Trimming audio to
-    waveform = waveform[:, : sample_rate *
-
-    waveform = torch.nn.functional.pad(
-        waveform, (0, int(sample_rate * 0.5)), "constant", 0
-    )
+if len(waveform[0]) / sample_rate > 15:
+    print("Warning: Trimming audio to first 15secs.")
+    waveform = waveform[:, : sample_rate * 15]
+    waveform = torch.nn.functional.pad( waveform, (0, int(sample_rate * 0.5)), "constant", 0)
 
-if waveform.
-    waveform =
+if waveform.size(0) > 1:
+    waveform = torch.mean(waveform, dim=0, keepdim=True)
 
-resampler = torchaudio.transforms.Resample(orig_freq=sample_rate,
-                                           new_freq=16000)
-waveform = resampler(waveform)
-sample_rate = 16000
+prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
 
 if sample_audio_text is None:
     print("Transcribing audio...")
```
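The new `ids_to_speech_tokens` helper is the inverse of the `extract_speech_ids` parser that the script still calls further down: one renders integer codec IDs as `<|s_N|>` strings from the model vocabulary, the other parses them back. A minimal round-trip sketch of the pair (the helper bodies mirror the README code; the sample IDs are made up):

```python
def ids_to_speech_tokens(speech_ids):
    # Render integer codec IDs as <|s_N|> tokens from the model vocabulary.
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    # Parse <|s_N|> tokens back to integer codec IDs; skip anything else.
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith("<|s_") and token_str.endswith("|>"):
            speech_ids.append(int(token_str[4:-2]))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

ids = [100, 2048, 65535]  # made-up codec IDs
assert ids_to_speech_tokens(ids) == ["<|s_100|>", "<|s_2048|>", "<|s_65535|>"]
assert extract_speech_ids(ids_to_speech_tokens(ids)) == ids
```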
```diff
@@ -208,44 +190,41 @@ elif len(target_text) > 500:
 
 input_text = transcription + " " + target_text
 
-chat = [
-    {"role": "user", "content": "Convert the text to speech:" + formatted_text},
-    {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
-]
-
-with torch.no_grad():
+with torch.no_grad():
+    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
+    vq_code_prompt = vq_code_prompt[0, 0, :]
+    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)
+
+    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
+
+    chat = [
+        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
+        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + "".join(speech_ids_prefix)}
+    ]
+
+    input_ids = tokenizer.apply_chat_template(chat, tokenize=True, return_tensors="pt", continue_final_message=True)
+    input_ids = input_ids.to("cuda")
+    speech_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
 
     outputs = model.generate(
         input_ids,
-        max_length=2048,
+        max_length=2048,
         eos_token_id=speech_end_id,
         do_sample=True,
         top_p=1,
         temperature=0.8,
+        min_new_tokens=4, # Fix so the model does not directly stop
     )
 
-generated_ids = outputs[0][input_ids.shape[1] : -1]
-
-raw_speech_tokens = tokenizer.batch_decode(generated_ids,
-                                           skip_special_tokens=True)
-speech_ids = extract_speech_ids(raw_speech_tokens)
-
-if len(speech_ids) == 0:
-    raise ValueError("No valid speech tokens were generated!")
-
-gen_wav = Codec_model.decode_code(speech_tokens_tensor).cpu().squeeze()
+    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix) : -1]
+
+    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+    speech_tokens = extract_speech_ids(speech_tokens)
+    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)
+
+    gen_wav = Codec_model.decode_code(speech_tokens)
+    gen_wav = gen_wav[:, :, prompt_wav.shape[1] :]
+
+sf.write(output_filename, gen_wav[0, 0, :].cpu().numpy(), 16000)
```
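The second hunk is the substance of the "generation with speaker" fix: the reference clip is encoded by the codec, rendered as `<|s_N|>` tokens, and pre-filled into the open assistant turn, so the model continues the reference speaker's speech instead of starting from nothing (`min_new_tokens=4` additionally stops it from emitting the end token right away). Because that prefix is now part of `input_ids`, the extraction slice backs up by `len(speech_ids_prefix)` so the codec decodes prompt and continuation together, and the re-synthesized prompt is trimmed off afterwards in the waveform domain. A toy check of the index arithmetic, with all lengths made up:

```python
# Slice under test: outputs[0][input_ids.shape[1] - len(speech_ids_prefix) : -1]
text_len = 30     # chat-template and text tokens (made up)
prefix_len = 200  # <|s_N|> tokens encoding the reference audio (made up)
new_len = 500     # speech tokens the model generates (made up)

input_len = text_len + prefix_len    # input_ids.shape[1]
total_len = input_len + new_len + 1  # +1 for <|SPEECH_GENERATION_END|>

start = input_len - prefix_len       # back up over the speech prefix
kept = total_len - 1 - start         # the trailing end token is dropped

assert start == text_len             # slice begins right after the text
assert kept == prefix_len + new_len  # prompt + continuation reach the codec
```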
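The reference-audio preparation from the first hunk also bundles naturally into a single function. A sketch with the same steps (15 s cap, 0.5 s silence pad, stereo-to-mono downmix, 16 kHz resample); the function name and defaults are ours, the operations come straight from the diff:

```python
import torch
import torchaudio

def load_prompt_wav(path, max_secs=15, target_sr=16000):
    """Prepare a reference clip the way the example builds prompt_wav."""
    waveform, sample_rate = torchaudio.load(path)

    # Cap the prompt length, then pad 0.5 s of silence after the cut
    # so the codec sees a clean boundary.
    if waveform.shape[1] / sample_rate > max_secs:
        print(f"Warning: Trimming audio to first {max_secs} secs.")
        waveform = waveform[:, : sample_rate * max_secs]
        waveform = torch.nn.functional.pad(
            waveform, (0, int(sample_rate * 0.5)), "constant", 0
        )

    # Downmix stereo to mono by averaging the channels.
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # The codec consumes 16 kHz audio.
    resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
    return resample(waveform)
```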