zeroMN committed
Commit b744e9c
1 Parent(s): 6916e52

Upload 9 files

Files changed (9)
  1. AutoModel.pth +3 -0
  2. config.json +28 -0
  3. main.py +0 -0
  4. model.py +212 -0
  5. requirements.txt +4 -0
  6. run_local.py +63 -0
  7. sky.py +71 -0
  8. tokenizer.json +0 -0
  9. vocab.txt +0 -0
AutoModel.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3045413c560025a975f3a5d5ec93adb33adaaabf67603379a8e0c096d94b998
+ size 3237240570
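
AutoModel.pth is stored as a Git LFS pointer, so the oid above is the SHA-256 of the actual weights blob. A minimal sketch (file name and hash taken from the pointer above) to check a downloaded copy against it:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so the ~3.2 GB checkpoint is never fully in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

expected = "d3045413c560025a975f3a5d5ec93adb33adaaabf67603379a8e0c096d94b998"
assert sha256_of("AutoModel.pth") == expected, "checksum mismatch"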
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+     "model_name": "AutoModel",
+     "hidden_size": 768,
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12,
+     "intermediate_size": 3072,
+     "hidden_dropout_prob": 0.1,
+     "attention_probs_dropout_prob": 0.1,
+     "image_size": 224,
+     "image_channels": 3,
+     "patch_size": 16,
+     "max_position_embeddings": 512,
+     "vocab_size": 30522,
+     "type_vocab_size": 2,
+     "audio_sample_rate": 16000,
+     "audio_frame_size": 1024,
+     "audio_hop_size": 512,
+     "enable_vqa": true,
+     "enable_caption": true,
+     "enable_retrieval": true,
+     "enable_asr": true,
+     "enable_realtime_asr": true,
+     "batch_size": 32,
+     "learning_rate": 0.0001,
+     "weight_decay": 0.01,
+     "warmup_steps": 10000,
+     "max_steps": 100000
+ }
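
These values mirror the defaults hard-coded in Config in model.py. A few sanity checks one might run against the file; note that patch_size is declared here but the conv-based ImageEncoder in model.py never consumes it:

import json

with open("config.json") as f:
    cfg = json.load(f)

# Attention head dimension must divide evenly: 768 / 12 = 64
assert cfg["hidden_size"] % cfg["num_attention_heads"] == 0

# Patch grid implied by image_size / patch_size: (224 / 16) ** 2 = 196 patches
num_patches = (cfg["image_size"] // cfg["patch_size"]) ** 2

# Audio frames per second at this hop size: 16000 / 512 = 31.25
frames_per_second = cfg["audio_sample_rate"] / cfg["audio_hop_size"]

print(num_patches, frames_per_second)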
main.py ADDED
File without changes
model.py ADDED
@@ -0,0 +1,212 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import os
+ 
+ # Configuration class
+ class Config:
+     def __init__(self):
+         # Model architecture parameters
+         self.hidden_size = 768
+         self.num_attention_heads = 12
+         self.num_hidden_layers = 12
+         self.intermediate_size = 3072
+         self.hidden_dropout_prob = 0.1
+         self.attention_probs_dropout_prob = 0.1
+ 
+         # Image settings
+         self.image_size = 224
+         self.image_channels = 3
+         self.patch_size = 16
+ 
+         # Text settings
+         self.max_position_embeddings = 512
+         self.vocab_size = 30522
+         self.type_vocab_size = 2
+ 
+         # Audio settings
+         self.audio_sample_rate = 16000
+         self.audio_frame_size = 1024
+         self.audio_hop_size = 512
+ 
+         # Task switches
+         self.enable_vqa = True
+         self.enable_caption = True
+         self.enable_retrieval = True
+         self.enable_asr = True  # speech recognition
+         self.enable_realtime_asr = True  # real-time speech recognition
+ 
+         # Training parameters
+         self.batch_size = 32
+         self.learning_rate = 1e-4
+         self.weight_decay = 0.01
+         self.warmup_steps = 10000
+         self.max_steps = 100000
+ 
+ # Model component definitions
+ class ImageEncoder(nn.Module):
+     def __init__(self, config):
+         super(ImageEncoder, self).__init__()
+         self.config = config
+         self.encoder_layer = nn.Sequential(
+             nn.Conv2d(3, 64, kernel_size=3),  # 224 -> 222 (no padding)
+             nn.ReLU(),
+             nn.MaxPool2d(2, 2),               # 222 -> 111
+             nn.Flatten(),
+             nn.Linear(64 * 111 * 111, config.hidden_size)
+         )
+ 
+     def forward(self, image):
+         image_features = self.encoder_layer(image)
+         return image_features
+ 
+ class TextEncoder(nn.Module):
+     def __init__(self, config):
+         super(TextEncoder, self).__init__()
+         self.config = config
+         self.transformer_layer = nn.TransformerEncoderLayer(
+             d_model=config.hidden_size,
+             nhead=config.num_attention_heads,
+             batch_first=True
+         )
+         self.transformer_encoder = nn.TransformerEncoder(
+             self.transformer_layer,
+             num_layers=config.num_hidden_layers
+         )
+ 
+     def forward(self, text):
+         # Expects pre-embedded text (batch, seq_len, hidden_size); mean-pools over positions
+         text_features = self.transformer_encoder(text).mean(dim=1)
+         return text_features
+ 
+ class AudioEncoder(nn.Module):
+     def __init__(self, config):
+         super(AudioEncoder, self).__init__()
+         self.config = config
+         self.encoder_layer = nn.Sequential(
+             nn.Linear(config.audio_sample_rate, config.hidden_size),
+             nn.ReLU(),
+             nn.Linear(config.hidden_size, config.hidden_size)
+         )
+ 
+     def forward(self, audio):
+         audio_features = self.encoder_layer(audio)
+         return audio_features
+ 
+ class FusionLayer(nn.Module):
+     def __init__(self, config):
+         super(FusionLayer, self).__init__()
+         self.config = config
+         self.fusion_layer = nn.Linear(config.hidden_size * 3, config.hidden_size)
+ 
+     def forward(self, image_features, text_features, audio_features):
+         fused_features = torch.cat((image_features, text_features, audio_features), dim=1)
+         fused_features = self.fusion_layer(fused_features)
+         return fused_features
+ 
+ class VQALayer(nn.Module):
+     def __init__(self, config):
+         super(VQALayer, self).__init__()
+         self.config = config
+         self.vqa_layer = nn.Linear(config.hidden_size, config.vocab_size)
+ 
+     def forward(self, fused_features):
+         vqa_output = self.vqa_layer(fused_features)
+         return vqa_output
+ 
+ class CaptionLayer(nn.Module):
+     def __init__(self, config):
+         super(CaptionLayer, self).__init__()
+         self.config = config
+         self.caption_layer = nn.Linear(config.hidden_size, config.vocab_size)
+ 
+     def forward(self, fused_features):
+         caption_output = self.caption_layer(fused_features)
+         return caption_output
+ 
+ class RetrievalLayer(nn.Module):
+     def __init__(self, config):
+         super(RetrievalLayer, self).__init__()
+         self.config = config
+         self.retrieval_layer = nn.Linear(config.hidden_size, config.vocab_size)
+ 
+     def forward(self, fused_features):
+         retrieval_output = self.retrieval_layer(fused_features)
+         return retrieval_output
+ 
+ class ASRLayer(nn.Module):
+     def __init__(self, config):
+         super(ASRLayer, self).__init__()
+         self.config = config
+         self.asr_layer = nn.Linear(config.hidden_size, config.vocab_size)
+ 
+     def forward(self, fused_features):
+         asr_output = self.asr_layer(fused_features)
+         return asr_output
+ 
+ class RealtimeASRLayer(nn.Module):
+     def __init__(self, config):
+         super(RealtimeASRLayer, self).__init__()
+         self.config = config
+         self.realtime_asr_layer = nn.Linear(config.hidden_size, config.vocab_size)
+ 
+     def forward(self, fused_features):
+         realtime_asr_output = self.realtime_asr_layer(fused_features)
+         return realtime_asr_output
+ 
+ # Main model
+ class AutoModel(nn.Module):
+     def __init__(self, config):
+         super(AutoModel, self).__init__()
+         self.config = config
+         self.image_encoder = ImageEncoder(config)
+         self.text_encoder = TextEncoder(config)
+         self.audio_encoder = AudioEncoder(config)
+         self.fusion_layer = FusionLayer(config)
+         self.vqa_layer = VQALayer(config)
+         self.caption_layer = CaptionLayer(config)
+         self.retrieval_layer = RetrievalLayer(config)
+         self.asr_layer = ASRLayer(config)
+         self.realtime_asr_layer = RealtimeASRLayer(config)
+ 
+     def forward(self, image, text, audio):
+         image_features = self.image_encoder(image)
+         text_features = self.text_encoder(text)
+         audio_features = self.audio_encoder(audio)
+         fused_features = self.fusion_layer(image_features, text_features, audio_features)
+         vqa_output = self.vqa_layer(fused_features)
+         caption_output = self.caption_layer(fused_features)
+         retrieval_output = self.retrieval_layer(fused_features)
+         asr_output = self.asr_layer(fused_features)
+         realtime_asr_output = self.realtime_asr_layer(fused_features)
+         return vqa_output, caption_output, retrieval_output, asr_output, realtime_asr_output
+ 
+ # Smoke test (guarded so that importing this module, e.g. from run_local.py,
+ # does not re-run it and overwrite the saved weights)
+ if __name__ == "__main__":
+     config = Config()
+     model = AutoModel(config)
+     image = torch.randn(1, 3, 224, 224)
+     text = torch.randn(1, config.max_position_embeddings, config.hidden_size)
+     audio = torch.randn(1, config.audio_sample_rate)
+     vqa_output, caption_output, retrieval_output, asr_output, realtime_asr_output = model(image, text, audio)
+ 
+     # Print results
+     print("VQA output shape:", vqa_output.shape)
+     print("Caption output shape:", caption_output.shape)
+     print("Retrieval output shape:", retrieval_output.shape)
+     print("ASR output shape:", asr_output.shape)
+     print("Realtime ASR output shape:", realtime_asr_output.shape)
+ 
+     # Print total parameter count
+     total_params = sum(p.numel() for p in model.parameters())
+     print(f"\nTotal parameters: {total_params}")
+ 
+     # Save model weights to the current directory
+     save_dir = "./"
+     os.makedirs(save_dir, exist_ok=True)
+     save_path = os.path.join(save_dir, "AutoModel.pth")
+     torch.save(model.state_dict(), save_path)
+     print(f"Model weights saved to: {save_path}")
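
The 64 * 111 * 111 input to the ImageEncoder's Linear follows from the conv arithmetic: an unpadded 3x3 conv maps 224 -> 222, and the 2x2 max-pool halves that to 111. A short sketch tracing the shape each encoder produces (assumes model.py is importable; note the flattening Linear alone holds roughly 600M parameters, so this allocates a few GB):

import torch
from model import Config, ImageEncoder, TextEncoder, AudioEncoder, FusionLayer

config = Config()
image = torch.randn(2, 3, 224, 224)                                        # NCHW images
text = torch.randn(2, config.max_position_embeddings, config.hidden_size)  # pre-embedded text
audio = torch.randn(2, config.audio_sample_rate)                           # 1 s of 16 kHz audio

img_f = ImageEncoder(config)(image)   # (2, 768)
txt_f = TextEncoder(config)(text)     # (2, 768), mean-pooled over 512 positions
aud_f = AudioEncoder(config)(audio)   # (2, 768)
fused = FusionLayer(config)(img_f, txt_f, aud_f)  # (2, 768)
print(img_f.shape, txt_f.shape, aud_f.shape, fused.shape)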
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch>=1.9.0
+ transformers>=4.10.0
+ numpy>=1.21.0
+ gradio>=3.0.0
run_local.py ADDED
@@ -0,0 +1,63 @@
+ import json
+ import os
+ import torch
+ from model import AutoModel, Config
+ 
+ def load_model(model_path, config_path):
+     """
+     Load model weights and configuration.
+     """
+     # Load the configuration
+     if not os.path.exists(config_path):
+         raise FileNotFoundError(f"Config file not found: {config_path}")
+     print(f"Loading config file: {config_path}")
+     config = Config()
+     with open(config_path) as f:
+         # Apply the values from config.json on top of the hard-coded defaults
+         for key, value in json.load(f).items():
+             setattr(config, key, value)
+ 
+     # Initialize the model
+     model = AutoModel(config)
+ 
+     # Load the weights
+     if not os.path.exists(model_path):
+         raise FileNotFoundError(f"Model file not found: {model_path}")
+     print(f"Loading model weights: {model_path}")
+     state_dict = torch.load(model_path, map_location=torch.device("cpu"))
+     model.load_state_dict(state_dict)
+     model.eval()
+     print("Model loaded successfully and set to eval mode.")
+ 
+     return model, config
+ 
+ 
+ def run_inference(model, config):
+     """
+     Run inference with the model.
+     """
+     # Simulated example inputs
+     image = torch.randn(1, 3, 224, 224)  # image input
+     text = torch.randn(1, config.max_position_embeddings, config.hidden_size)  # text input (pre-embedded)
+     audio = torch.randn(1, config.audio_sample_rate)  # audio input
+ 
+     # Model inference
+     outputs = model(image, text, audio)
+     vqa_output, caption_output, retrieval_output, asr_output, realtime_asr_output = outputs
+ 
+     # Print results
+     print("\nInference results:")
+     print(f"VQA output shape: {vqa_output.shape}")
+     print(f"Caption output shape: {caption_output.shape}")
+     print(f"Retrieval output shape: {retrieval_output.shape}")
+     print(f"ASR output shape: {asr_output.shape}")
+     print(f"Realtime ASR output shape: {realtime_asr_output.shape}")
+ 
+ if __name__ == "__main__":
+     # File paths
+     model_path = "AutoModel.pth"
+     config_path = "config.json"
+ 
+     try:
+         # Load the model, then run inference
+         model, config = load_model(model_path, config_path)
+         run_inference(model, config)
+     except Exception as e:
+         print(f"Run failed: {e}")
sky.py ADDED
@@ -0,0 +1,71 @@
+ import json
+ import os
+ 
+ # Define the configuration parameters
+ config_data = {
+     "hidden_size": 768,
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12,
+     "intermediate_size": 3072,
+     "hidden_dropout_prob": 0.1,
+     "attention_probs_dropout_prob": 0.1,
+     "image_size": 224,
+     "image_channels": 3,
+     "patch_size": 16,
+     "max_position_embeddings": 512,
+     "vocab_size": 30522,
+     "type_vocab_size": 2,
+     "audio_sample_rate": 16000,
+     "audio_frame_size": 1024,
+     "audio_hop_size": 512,
+     "enable_vqa": True,
+     "enable_caption": True,
+     "enable_retrieval": True,
+     "enable_asr": True,
+     "enable_realtime_asr": True,
+     "batch_size": 32,
+     "learning_rate": 0.0001,
+     "weight_decay": 0.01,
+     "warmup_steps": 10000,
+     "max_steps": 100000
+ }
+ 
+ # File path
+ config_path = r"C:\Users\baby7\Desktop\zero_sg-pytorch-zero-v4\config.json"
+ 
+ # Save the config file
+ os.makedirs(os.path.dirname(config_path), exist_ok=True)
+ with open(config_path, "w") as f:
+     json.dump(config_data, f, indent=4)
+ 
+ print(f"Config file saved to: {config_path}")
+ 
+ from transformers import BertTokenizer
+ 
+ # Initialize the tokenizer
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ 
+ # Save the tokenizer to the target path
+ tokenizer_path = r"C:\Users\baby7\Desktop\zero_sg-pytorch-zero-v4\tokenizer"
+ os.makedirs(tokenizer_path, exist_ok=True)
+ tokenizer.save_pretrained(tokenizer_path)
+ 
+ print(f"Tokenizer saved to: {tokenizer_path}")
+ 
+ # Load the config file back
+ from model import Config  # the Config class from model.py
+ 
+ config_file = r"C:\Users\baby7\Desktop\zero_sg-pytorch-zero-v4\config.json"
+ config = Config()  # Config() takes no arguments; apply the saved values on top of the defaults
+ with open(config_file) as f:
+     for key, value in json.load(f).items():
+         setattr(config, key, value)
+ print("Loaded config:", config.__dict__)
+ 
+ tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+ text = "Hello, how are you?"
+ encoded_input = tokenizer(text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
+ 
+ print("Tokenizer output:", encoded_input["input_ids"].shape)
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.txt ADDED
The diff for this file is too large to render. See raw diff