googlefan committed on
Commit d15621b (1 parent: 1ea48bb)

Create README.md

Files changed (1):
  1. README.md +85 -0
README.md ADDED
---
language:
- ja
base_model:
- google/gemma-2-9b-it
pipeline_tag: any-to-any
license: gemma
datasets:
- fixie-ai/common_voice_17_0
---
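
A minimal usage example: load the model and processor with `trust_remote_code=True`, read a 16 kHz audio clip with librosa, and generate a response from a single audio turn.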
```py
import transformers
import librosa
import torch
import numpy as np
from typing import Dict, Any

model = transformers.AutoModel.from_pretrained(
    "neody/ultravox-gemma-2-9b-it", trust_remote_code=True
)
model.to("cuda", dtype=torch.bfloat16)
processor = transformers.AutoProcessor.from_pretrained(
    "neody/ultravox-gemma-2-9b-it", trust_remote_code=True
)
path = "record.wav"
# Load the audio at 16 kHz, the sampling rate the audio encoder expects.
audio, sr = librosa.load(path, sr=16000)


def preprocess(inputs: Dict[str, Any], device, dtype):
    """Build model inputs from an audio clip and/or a list of chat turns."""
    turns: list = inputs.get("turns", [])

    audio = inputs.get("audio", None)
    # Convert to float32 if needed.
    if isinstance(audio, np.ndarray):
        if audio.dtype == np.float64:
            audio = audio.astype(np.float32)
        elif audio.dtype == np.int16:
            audio = audio.astype(np.float32) / np.float32(32768.0)
        elif audio.dtype == np.int32:
            audio = audio.astype(np.float32) / np.float32(2147483648.0)

    # If audio is provided and the last turn is not a user turn,
    # add a user turn containing the <|audio|> placeholder.
    if audio is not None and (len(turns) == 0 or turns[-1]["role"] != "user"):
        prompt = inputs.get("prompt", "<|audio|>")
        if "<|audio|>" not in prompt:
            print(
                "Prompt does not contain '<|audio|>', appending '<|audio|>' to the end of the prompt."
            )
            prompt += " <|audio|>"
        turns.append({"role": "user", "content": prompt})

    text = processor.tokenizer.apply_chat_template(
        turns, add_generation_prompt=True, tokenize=False
    )

    if "sampling_rate" not in inputs and audio is not None:
        print(
            "No sampling rate provided, using default of 16kHz. We highly recommend providing the correct sampling rate."
        )

    output = processor(
        text=text,
        audio=audio,
        sampling_rate=inputs.get("sampling_rate", 16000),
    )
    if "audio_values" in output:
        output["audio_values"] = output["audio_values"].to(device, dtype)
    return output.to(device, dtype)


turns = []
print(
    processor.tokenizer.decode(
        model.generate(
            **preprocess(
                {"audio": audio, "turns": turns, "sampling_rate": sr},
                "cuda",
                torch.bfloat16,
            ),
            max_new_tokens=300,
        ).squeeze(),
        skip_special_tokens=True,
    )
)
```
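
One possible way to continue the conversation past the first audio turn is sketched below: re-run the first turn while keeping its output, append the decoded reply to `turns` as an assistant turn, and run a text-only follow-up. This is not part of the original card; it assumes the remote-code model also accepts inputs without audio and that `generate` returns the prompt tokens followed by the new tokens (the usual behavior for decoder-only Transformers models). The `reply` variable and the follow-up question are only illustrative.

```py
# Multi-turn follow-up (sketch; assumes text-only inputs are supported).
model_inputs = preprocess(
    {"audio": audio, "turns": turns, "sampling_rate": sr}, "cuda", torch.bfloat16
)
generated = model.generate(**model_inputs, max_new_tokens=300)
# Keep only the newly generated tokens as the assistant reply.
reply = processor.tokenizer.decode(
    generated[0, model_inputs["input_ids"].shape[1]:], skip_special_tokens=True
)

# preprocess() already appended the user audio turn to `turns`,
# so only the assistant reply and the next question are added here.
turns.append({"role": "assistant", "content": reply})
turns.append({"role": "user", "content": "今の回答を一文で要約してください。"})

follow_up = model.generate(
    **preprocess({"audio": None, "turns": turns}, "cuda", torch.bfloat16),
    max_new_tokens=300,
)
print(processor.tokenizer.decode(follow_up.squeeze(), skip_special_tokens=True))
```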