ohayonguy commited on
Commit
b7f3942
·
1 Parent(s): 1b8b226

first commit fixed

Browse files
app.py CHANGED
@@ -24,17 +24,12 @@ if not os.path.exists(realesr_model_path):
24
  os.system(
25
  "wget https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth -O experiments/pretrained_models/RealESRGAN_x4plus.pth")
26
 
27
- pmrf_model_path = 'blind_face_restoration_pmrf.ckpt'
28
-
29
  # background enhancer with RealESRGAN
30
  model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
31
  half = True if torch.cuda.is_available() else False
32
  upsampler = RealESRGANer(scale=4, model_path=realesr_model_path, model=model, tile=0, tile_pad=10, pre_pad=0, half=half)
33
 
34
- pmrf = MMSERectifiedFlow.load_from_checkpoint('./blind_face_restoration_pmrf.ckpt',
35
- mmse_model_arch='swinir_L',
36
- mmse_model_ckpt_path=None,
37
- map_location='cpu').to(device)
38
 
39
  os.makedirs('output', exist_ok=True)
40
 
 
24
  os.system(
25
  "wget https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth -O experiments/pretrained_models/RealESRGAN_x4plus.pth")
26
 
 
 
27
  # background enhancer with RealESRGAN
28
  model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
29
  half = True if torch.cuda.is_available() else False
30
  upsampler = RealESRGANer(scale=4, model_path=realesr_model_path, model=model, tile=0, tile_pad=10, pre_pad=0, half=half)
31
 
32
+ pmrf = MMSERectifiedFlow.from_pretrained('ohayonguy/PMRF_blind_face_image_restoration').to(device)
 
 
 
33
 
34
  os.makedirs('output', exist_ok=True)
35
 
arch/hourglass/__init__.py ADDED
File without changes
arch/hourglass/axial_rope.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """k-diffusion transformer diffusion models, version 2.
2
+ Codes adopted from https://github.com/crowsonkb/k-diffusion
3
+ """
4
+
5
+ import math
6
+
7
+ import torch
8
+ import torch._dynamo
9
+ from torch import nn
10
+
11
+ from . import flags
12
+
13
+ if flags.get_use_compile():
14
+ torch._dynamo.config.suppress_errors = True
15
+
16
+
17
+ def rotate_half(x):
18
+ x1, x2 = x[..., 0::2], x[..., 1::2]
19
+ x = torch.stack((-x2, x1), dim=-1)
20
+ *shape, d, r = x.shape
21
+ return x.view(*shape, d * r)
22
+
23
+
24
+ @flags.compile_wrap
25
+ def apply_rotary_emb(freqs, t, start_index=0, scale=1.0):
26
+ freqs = freqs.to(t)
27
+ rot_dim = freqs.shape[-1]
28
+ end_index = start_index + rot_dim
29
+ assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
30
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
31
+ t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
32
+ return torch.cat((t_left, t, t_right), dim=-1)
33
+
34
+
35
+ def centers(start, stop, num, dtype=None, device=None):
36
+ edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
37
+ return (edges[:-1] + edges[1:]) / 2
38
+
39
+
40
+ def make_grid(h_pos, w_pos):
41
+ grid = torch.stack(torch.meshgrid(h_pos, w_pos, indexing='ij'), dim=-1)
42
+ h, w, d = grid.shape
43
+ return grid.view(h * w, d)
44
+
45
+
46
+ def bounding_box(h, w, pixel_aspect_ratio=1.0):
47
+ # Adjusted dimensions
48
+ w_adj = w
49
+ h_adj = h * pixel_aspect_ratio
50
+
51
+ # Adjusted aspect ratio
52
+ ar_adj = w_adj / h_adj
53
+
54
+ # Determine bounding box based on the adjusted aspect ratio
55
+ y_min, y_max, x_min, x_max = -1.0, 1.0, -1.0, 1.0
56
+ if ar_adj > 1:
57
+ y_min, y_max = -1 / ar_adj, 1 / ar_adj
58
+ elif ar_adj < 1:
59
+ x_min, x_max = -ar_adj, ar_adj
60
+
61
+ return y_min, y_max, x_min, x_max
62
+
63
+
64
+ def make_axial_pos(h, w, pixel_aspect_ratio=1.0, align_corners=False, dtype=None, device=None):
65
+ y_min, y_max, x_min, x_max = bounding_box(h, w, pixel_aspect_ratio)
66
+ if align_corners:
67
+ h_pos = torch.linspace(y_min, y_max, h, dtype=dtype, device=device)
68
+ w_pos = torch.linspace(x_min, x_max, w, dtype=dtype, device=device)
69
+ else:
70
+ h_pos = centers(y_min, y_max, h, dtype=dtype, device=device)
71
+ w_pos = centers(x_min, x_max, w, dtype=dtype, device=device)
72
+ return make_grid(h_pos, w_pos)
73
+
74
+
75
+ def freqs_pixel(max_freq=10.0):
76
+ def init(shape):
77
+ freqs = torch.linspace(1.0, max_freq / 2, shape[-1]) * math.pi
78
+ return freqs.log().expand(shape)
79
+ return init
80
+
81
+
82
+ def freqs_pixel_log(max_freq=10.0):
83
+ def init(shape):
84
+ log_min = math.log(math.pi)
85
+ log_max = math.log(max_freq * math.pi / 2)
86
+ return torch.linspace(log_min, log_max, shape[-1]).expand(shape)
87
+ return init
88
+
89
+
90
+ class AxialRoPE(nn.Module):
91
+ def __init__(self, dim, n_heads, start_index=0, freqs_init=freqs_pixel_log(max_freq=10.0)):
92
+ super().__init__()
93
+ self.n_heads = n_heads
94
+ self.start_index = start_index
95
+ log_freqs = freqs_init((n_heads, dim // 4))
96
+ self.freqs_h = nn.Parameter(log_freqs.clone())
97
+ self.freqs_w = nn.Parameter(log_freqs.clone())
98
+
99
+ def extra_repr(self):
100
+ dim = (self.freqs_h.shape[-1] + self.freqs_w.shape[-1]) * 2
101
+ return f"dim={dim}, n_heads={self.n_heads}, start_index={self.start_index}"
102
+
103
+ def get_freqs(self, pos):
104
+ if pos.shape[-1] != 2:
105
+ raise ValueError("input shape must be (..., 2)")
106
+ freqs_h = pos[..., None, None, 0] * self.freqs_h.exp()
107
+ freqs_w = pos[..., None, None, 1] * self.freqs_w.exp()
108
+ freqs = torch.cat((freqs_h, freqs_w), dim=-1).repeat_interleave(2, dim=-1)
109
+ return freqs.transpose(-2, -3)
110
+
111
+ def forward(self, x, pos):
112
+ freqs = self.get_freqs(pos)
113
+ return apply_rotary_emb(freqs, x, self.start_index)
arch/hourglass/flags.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """k-diffusion transformer diffusion models, version 2.
2
+ Codes adopted from https://github.com/crowsonkb/k-diffusion
3
+ """
4
+
5
+ from contextlib import contextmanager
6
+ from functools import update_wrapper
7
+ import os
8
+ import threading
9
+
10
+ import torch
11
+
12
+
13
+ def get_use_compile():
14
+ return os.environ.get("K_DIFFUSION_USE_COMPILE", "1") == "1"
15
+
16
+
17
+ def get_use_flash_attention_2():
18
+ return os.environ.get("K_DIFFUSION_USE_FLASH_2", "1") == "1"
19
+
20
+
21
+ state = threading.local()
22
+ state.checkpointing = False
23
+
24
+
25
+ @contextmanager
26
+ def checkpointing(enable=True):
27
+ try:
28
+ old_checkpointing, state.checkpointing = state.checkpointing, enable
29
+ yield
30
+ finally:
31
+ state.checkpointing = old_checkpointing
32
+
33
+
34
+ def get_checkpointing():
35
+ return getattr(state, "checkpointing", False)
36
+
37
+
38
+ class compile_wrap:
39
+ def __init__(self, function, *args, **kwargs):
40
+ self.function = function
41
+ self.args = args
42
+ self.kwargs = kwargs
43
+ self._compiled_function = None
44
+ update_wrapper(self, function)
45
+
46
+ @property
47
+ def compiled_function(self):
48
+ if self._compiled_function is not None:
49
+ return self._compiled_function
50
+ if get_use_compile():
51
+ try:
52
+ self._compiled_function = torch.compile(self.function, *self.args, **self.kwargs)
53
+ except RuntimeError:
54
+ self._compiled_function = self.function
55
+ else:
56
+ self._compiled_function = self.function
57
+ return self._compiled_function
58
+
59
+ def __call__(self, *args, **kwargs):
60
+ return self.compiled_function(*args, **kwargs)
arch/hourglass/flops.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """k-diffusion transformer diffusion models, version 2.
2
+ Codes adopted from https://github.com/crowsonkb/k-diffusion
3
+ """
4
+
5
+ from contextlib import contextmanager
6
+ import math
7
+ import threading
8
+
9
+
10
+ state = threading.local()
11
+ state.flop_counter = None
12
+
13
+
14
+ @contextmanager
15
+ def flop_counter(enable=True):
16
+ try:
17
+ old_flop_counter = state.flop_counter
18
+ state.flop_counter = FlopCounter() if enable else None
19
+ yield state.flop_counter
20
+ finally:
21
+ state.flop_counter = old_flop_counter
22
+
23
+
24
+ class FlopCounter:
25
+ def __init__(self):
26
+ self.ops = []
27
+
28
+ def op(self, op, *args, **kwargs):
29
+ self.ops.append((op, args, kwargs))
30
+
31
+ @property
32
+ def flops(self):
33
+ flops = 0
34
+ for op, args, kwargs in self.ops:
35
+ flops += op(*args, **kwargs)
36
+ return flops
37
+
38
+
39
+ def op(op, *args, **kwargs):
40
+ if getattr(state, "flop_counter", None):
41
+ state.flop_counter.op(op, *args, **kwargs)
42
+
43
+
44
+ def op_linear(x, weight):
45
+ return math.prod(x) * weight[0]
46
+
47
+
48
+ def op_attention(q, k, v):
49
+ *b, s_q, d_q = q
50
+ *b, s_k, d_k = k
51
+ *b, s_v, d_v = v
52
+ return math.prod(b) * s_q * s_k * (d_q + d_v)
53
+
54
+
55
+ def op_natten(q, k, v, kernel_size):
56
+ *q_rest, d_q = q
57
+ *_, d_v = v
58
+ return math.prod(q_rest) * (d_q + d_v) * kernel_size**2
arch/hourglass/image_transformer_v2.py ADDED
@@ -0,0 +1,772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """k-diffusion transformer diffusion models, version 2.
2
+ Codes adopted from https://github.com/crowsonkb/k-diffusion
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from functools import lru_cache, reduce
7
+ import math
8
+ from typing import Union
9
+
10
+ from einops import rearrange
11
+ import torch
12
+ from torch import nn
13
+ import torch._dynamo
14
+ from torch.nn import functional as F
15
+
16
+ from . import flags, flops
17
+ from .axial_rope import make_axial_pos
18
+
19
+
20
+ try:
21
+ import natten
22
+ except ImportError:
23
+ natten = None
24
+
25
+ try:
26
+ import flash_attn
27
+ except ImportError:
28
+ flash_attn = None
29
+
30
+
31
+ if flags.get_use_compile():
32
+ torch._dynamo.config.cache_size_limit = max(64, torch._dynamo.config.cache_size_limit)
33
+ torch._dynamo.config.suppress_errors = True
34
+
35
+
36
+ # Helpers
37
+
38
+ def zero_init(layer):
39
+ nn.init.zeros_(layer.weight)
40
+ if layer.bias is not None:
41
+ nn.init.zeros_(layer.bias)
42
+ return layer
43
+
44
+
45
+ def checkpoint(function, *args, **kwargs):
46
+ if flags.get_checkpointing():
47
+ kwargs.setdefault("use_reentrant", True)
48
+ return torch.utils.checkpoint.checkpoint(function, *args, **kwargs)
49
+ else:
50
+ return function(*args, **kwargs)
51
+
52
+
53
+ def downscale_pos(pos):
54
+ pos = rearrange(pos, "... (h nh) (w nw) e -> ... h w (nh nw) e", nh=2, nw=2)
55
+ return torch.mean(pos, dim=-2)
56
+
57
+
58
+ # Param tags
59
+
60
+ def tag_param(param, tag):
61
+ if not hasattr(param, "_tags"):
62
+ param._tags = set([tag])
63
+ else:
64
+ param._tags.add(tag)
65
+ return param
66
+
67
+
68
+ def tag_module(module, tag):
69
+ for param in module.parameters():
70
+ tag_param(param, tag)
71
+ return module
72
+
73
+
74
+ def apply_wd(module):
75
+ for name, param in module.named_parameters():
76
+ if name.endswith("weight"):
77
+ tag_param(param, "wd")
78
+ return module
79
+
80
+
81
+ def filter_params(function, module):
82
+ for param in module.parameters():
83
+ tags = getattr(param, "_tags", set())
84
+ if function(tags):
85
+ yield param
86
+
87
+
88
+ # Kernels
89
+
90
+ @flags.compile_wrap
91
+ def linear_geglu(x, weight, bias=None):
92
+ x = x @ weight.mT
93
+ if bias is not None:
94
+ x = x + bias
95
+ x, gate = x.chunk(2, dim=-1)
96
+ return x * F.gelu(gate)
97
+
98
+
99
+ @flags.compile_wrap
100
+ def rms_norm(x, scale, eps):
101
+ dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32))
102
+ mean_sq = torch.mean(x.to(dtype)**2, dim=-1, keepdim=True)
103
+ scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps)
104
+ return x * scale.to(x.dtype)
105
+
106
+
107
+ @flags.compile_wrap
108
+ def scale_for_cosine_sim(q, k, scale, eps):
109
+ dtype = reduce(torch.promote_types, (q.dtype, k.dtype, scale.dtype, torch.float32))
110
+ sum_sq_q = torch.sum(q.to(dtype)**2, dim=-1, keepdim=True)
111
+ sum_sq_k = torch.sum(k.to(dtype)**2, dim=-1, keepdim=True)
112
+ sqrt_scale = torch.sqrt(scale.to(dtype))
113
+ scale_q = sqrt_scale * torch.rsqrt(sum_sq_q + eps)
114
+ scale_k = sqrt_scale * torch.rsqrt(sum_sq_k + eps)
115
+ return q * scale_q.to(q.dtype), k * scale_k.to(k.dtype)
116
+
117
+
118
+ @flags.compile_wrap
119
+ def scale_for_cosine_sim_qkv(qkv, scale, eps):
120
+ q, k, v = qkv.unbind(2)
121
+ q, k = scale_for_cosine_sim(q, k, scale[:, None], eps)
122
+ return torch.stack((q, k, v), dim=2)
123
+
124
+
125
+ # Layers
126
+
127
+ class Linear(nn.Linear):
128
+ def forward(self, x):
129
+ flops.op(flops.op_linear, x.shape, self.weight.shape)
130
+ return super().forward(x)
131
+
132
+
133
+ class LinearGEGLU(nn.Linear):
134
+ def __init__(self, in_features, out_features, bias=True):
135
+ super().__init__(in_features, out_features * 2, bias=bias)
136
+ self.out_features = out_features
137
+
138
+ def forward(self, x):
139
+ flops.op(flops.op_linear, x.shape, self.weight.shape)
140
+ return linear_geglu(x, self.weight, self.bias)
141
+
142
+
143
+ class FourierFeatures(nn.Module):
144
+ def __init__(self, in_features, out_features, std=1.):
145
+ super().__init__()
146
+ assert out_features % 2 == 0
147
+ self.register_buffer('weight', torch.randn([out_features // 2, in_features]) * std)
148
+
149
+ def forward(self, input):
150
+ f = 2 * math.pi * input @ self.weight.T
151
+ return torch.cat([f.cos(), f.sin()], dim=-1)
152
+
153
+ class RMSNorm(nn.Module):
154
+ def __init__(self, shape, eps=1e-6):
155
+ super().__init__()
156
+ self.eps = eps
157
+ self.scale = nn.Parameter(torch.ones(shape))
158
+
159
+ def extra_repr(self):
160
+ return f"shape={tuple(self.scale.shape)}, eps={self.eps}"
161
+
162
+ def forward(self, x):
163
+ return rms_norm(x, self.scale, self.eps)
164
+
165
+
166
+ class AdaRMSNorm(nn.Module):
167
+ def __init__(self, features, cond_features, eps=1e-6):
168
+ super().__init__()
169
+ self.eps = eps
170
+ self.linear = apply_wd(zero_init(Linear(cond_features, features, bias=False)))
171
+ tag_module(self.linear, "mapping")
172
+
173
+ def extra_repr(self):
174
+ return f"eps={self.eps},"
175
+
176
+ def forward(self, x, cond):
177
+ return rms_norm(x, self.linear(cond)[:, None, None, :] + 1, self.eps)
178
+
179
+
180
+ # Rotary position embeddings
181
+
182
+ @flags.compile_wrap
183
+ def apply_rotary_emb(x, theta, conj=False):
184
+ out_dtype = x.dtype
185
+ dtype = reduce(torch.promote_types, (x.dtype, theta.dtype, torch.float32))
186
+ d = theta.shape[-1]
187
+ assert d * 2 <= x.shape[-1]
188
+ x1, x2, x3 = x[..., :d], x[..., d : d * 2], x[..., d * 2 :]
189
+ x1, x2, theta = x1.to(dtype), x2.to(dtype), theta.to(dtype)
190
+ cos, sin = torch.cos(theta), torch.sin(theta)
191
+ sin = -sin if conj else sin
192
+ y1 = x1 * cos - x2 * sin
193
+ y2 = x2 * cos + x1 * sin
194
+ y1, y2 = y1.to(out_dtype), y2.to(out_dtype)
195
+ return torch.cat((y1, y2, x3), dim=-1)
196
+
197
+
198
+ @flags.compile_wrap
199
+ def _apply_rotary_emb_inplace(x, theta, conj):
200
+ dtype = reduce(torch.promote_types, (x.dtype, theta.dtype, torch.float32))
201
+ d = theta.shape[-1]
202
+ assert d * 2 <= x.shape[-1]
203
+ x1, x2 = x[..., :d], x[..., d : d * 2]
204
+ x1_, x2_, theta = x1.to(dtype), x2.to(dtype), theta.to(dtype)
205
+ cos, sin = torch.cos(theta), torch.sin(theta)
206
+ sin = -sin if conj else sin
207
+ y1 = x1_ * cos - x2_ * sin
208
+ y2 = x2_ * cos + x1_ * sin
209
+ x1.copy_(y1)
210
+ x2.copy_(y2)
211
+
212
+
213
+ class ApplyRotaryEmbeddingInplace(torch.autograd.Function):
214
+ @staticmethod
215
+ def forward(x, theta, conj):
216
+ _apply_rotary_emb_inplace(x, theta, conj=conj)
217
+ return x
218
+
219
+ @staticmethod
220
+ def setup_context(ctx, inputs, output):
221
+ _, theta, conj = inputs
222
+ ctx.save_for_backward(theta)
223
+ ctx.conj = conj
224
+
225
+ @staticmethod
226
+ def backward(ctx, grad_output):
227
+ theta, = ctx.saved_tensors
228
+ _apply_rotary_emb_inplace(grad_output, theta, conj=not ctx.conj)
229
+ return grad_output, None, None
230
+
231
+
232
+ def apply_rotary_emb_(x, theta):
233
+ return ApplyRotaryEmbeddingInplace.apply(x, theta, False)
234
+
235
+
236
+ class AxialRoPE(nn.Module):
237
+ def __init__(self, dim, n_heads):
238
+ super().__init__()
239
+ log_min = math.log(math.pi)
240
+ log_max = math.log(10.0 * math.pi)
241
+ freqs = torch.linspace(log_min, log_max, n_heads * dim // 4 + 1)[:-1].exp()
242
+ self.register_buffer("freqs", freqs.view(dim // 4, n_heads).T.contiguous())
243
+
244
+ def extra_repr(self):
245
+ return f"dim={self.freqs.shape[1] * 4}, n_heads={self.freqs.shape[0]}"
246
+
247
+ def forward(self, pos):
248
+ theta_h = pos[..., None, 0:1] * self.freqs.to(pos.dtype)
249
+ theta_w = pos[..., None, 1:2] * self.freqs.to(pos.dtype)
250
+ return torch.cat((theta_h, theta_w), dim=-1)
251
+
252
+
253
+ # Shifted window attention
254
+
255
+ def window(window_size, x):
256
+ *b, h, w, c = x.shape
257
+ x = torch.reshape(
258
+ x,
259
+ (*b, h // window_size, window_size, w // window_size, window_size, c),
260
+ )
261
+ x = torch.permute(
262
+ x,
263
+ (*range(len(b)), -5, -3, -4, -2, -1),
264
+ )
265
+ return x
266
+
267
+
268
+ def unwindow(x):
269
+ *b, h, w, wh, ww, c = x.shape
270
+ x = torch.permute(x, (*range(len(b)), -5, -3, -4, -2, -1))
271
+ x = torch.reshape(x, (*b, h * wh, w * ww, c))
272
+ return x
273
+
274
+
275
+ def shifted_window(window_size, window_shift, x):
276
+ x = torch.roll(x, shifts=(window_shift, window_shift), dims=(-2, -3))
277
+ windows = window(window_size, x)
278
+ return windows
279
+
280
+
281
+ def shifted_unwindow(window_shift, x):
282
+ x = unwindow(x)
283
+ x = torch.roll(x, shifts=(-window_shift, -window_shift), dims=(-2, -3))
284
+ return x
285
+
286
+
287
+ @lru_cache
288
+ def make_shifted_window_masks(n_h_w, n_w_w, w_h, w_w, shift, device=None):
289
+ ph_coords = torch.arange(n_h_w, device=device)
290
+ pw_coords = torch.arange(n_w_w, device=device)
291
+ h_coords = torch.arange(w_h, device=device)
292
+ w_coords = torch.arange(w_w, device=device)
293
+ patch_h, patch_w, q_h, q_w, k_h, k_w = torch.meshgrid(
294
+ ph_coords,
295
+ pw_coords,
296
+ h_coords,
297
+ w_coords,
298
+ h_coords,
299
+ w_coords,
300
+ indexing="ij",
301
+ )
302
+ is_top_patch = patch_h == 0
303
+ is_left_patch = patch_w == 0
304
+ q_above_shift = q_h < shift
305
+ k_above_shift = k_h < shift
306
+ q_left_of_shift = q_w < shift
307
+ k_left_of_shift = k_w < shift
308
+ m_corner = (
309
+ is_left_patch
310
+ & is_top_patch
311
+ & (q_left_of_shift == k_left_of_shift)
312
+ & (q_above_shift == k_above_shift)
313
+ )
314
+ m_left = is_left_patch & ~is_top_patch & (q_left_of_shift == k_left_of_shift)
315
+ m_top = ~is_left_patch & is_top_patch & (q_above_shift == k_above_shift)
316
+ m_rest = ~is_left_patch & ~is_top_patch
317
+ m = m_corner | m_left | m_top | m_rest
318
+ return m
319
+
320
+
321
+ def apply_window_attention(window_size, window_shift, q, k, v, scale=None):
322
+ # prep windows and masks
323
+ q_windows = shifted_window(window_size, window_shift, q)
324
+ k_windows = shifted_window(window_size, window_shift, k)
325
+ v_windows = shifted_window(window_size, window_shift, v)
326
+ b, heads, h, w, wh, ww, d_head = q_windows.shape
327
+ mask = make_shifted_window_masks(h, w, wh, ww, window_shift, device=q.device)
328
+ q_seqs = torch.reshape(q_windows, (b, heads, h, w, wh * ww, d_head))
329
+ k_seqs = torch.reshape(k_windows, (b, heads, h, w, wh * ww, d_head))
330
+ v_seqs = torch.reshape(v_windows, (b, heads, h, w, wh * ww, d_head))
331
+ mask = torch.reshape(mask, (h, w, wh * ww, wh * ww))
332
+
333
+ # do the attention here
334
+ flops.op(flops.op_attention, q_seqs.shape, k_seqs.shape, v_seqs.shape)
335
+ qkv = F.scaled_dot_product_attention(q_seqs, k_seqs, v_seqs, mask, scale=scale)
336
+
337
+ # unwindow
338
+ qkv = torch.reshape(qkv, (b, heads, h, w, wh, ww, d_head))
339
+ return shifted_unwindow(window_shift, qkv)
340
+
341
+
342
+ # Transformer layers
343
+
344
+
345
+ def use_flash_2(x):
346
+ if not flags.get_use_flash_attention_2():
347
+ return False
348
+ if flash_attn is None:
349
+ return False
350
+ if x.device.type != "cuda":
351
+ return False
352
+ if x.dtype not in (torch.float16, torch.bfloat16):
353
+ return False
354
+ return True
355
+
356
+
357
+ class SelfAttentionBlock(nn.Module):
358
+ def __init__(self, d_model, d_head, cond_features, dropout=0.0):
359
+ super().__init__()
360
+ self.d_head = d_head
361
+ self.n_heads = d_model // d_head
362
+ self.norm = AdaRMSNorm(d_model, cond_features)
363
+ self.qkv_proj = apply_wd(Linear(d_model, d_model * 3, bias=False))
364
+ self.scale = nn.Parameter(torch.full([self.n_heads], 10.0))
365
+ self.pos_emb = AxialRoPE(d_head // 2, self.n_heads)
366
+ self.dropout = nn.Dropout(dropout)
367
+ self.out_proj = apply_wd(zero_init(Linear(d_model, d_model, bias=False)))
368
+
369
+ def extra_repr(self):
370
+ return f"d_head={self.d_head},"
371
+
372
+ def forward(self, x, pos, cond):
373
+ skip = x
374
+ x = self.norm(x, cond)
375
+ qkv = self.qkv_proj(x)
376
+ pos = rearrange(pos, "... h w e -> ... (h w) e").to(qkv.dtype)
377
+ theta = self.pos_emb(pos)
378
+ if use_flash_2(qkv):
379
+ qkv = rearrange(qkv, "n h w (t nh e) -> n (h w) t nh e", t=3, e=self.d_head)
380
+ qkv = scale_for_cosine_sim_qkv(qkv, self.scale, 1e-6)
381
+ theta = torch.stack((theta, theta, torch.zeros_like(theta)), dim=-3)
382
+ qkv = apply_rotary_emb_(qkv, theta)
383
+ flops_shape = qkv.shape[-5], qkv.shape[-2], qkv.shape[-4], qkv.shape[-1]
384
+ flops.op(flops.op_attention, flops_shape, flops_shape, flops_shape)
385
+ x = flash_attn.flash_attn_qkvpacked_func(qkv, softmax_scale=1.0)
386
+ x = rearrange(x, "n (h w) nh e -> n h w (nh e)", h=skip.shape[-3], w=skip.shape[-2])
387
+ else:
388
+ q, k, v = rearrange(qkv, "n h w (t nh e) -> t n nh (h w) e", t=3, e=self.d_head)
389
+ q, k = scale_for_cosine_sim(q, k, self.scale[:, None, None], 1e-6)
390
+ theta = theta.movedim(-2, -3)
391
+ q = apply_rotary_emb_(q, theta)
392
+ k = apply_rotary_emb_(k, theta)
393
+ flops.op(flops.op_attention, q.shape, k.shape, v.shape)
394
+ x = F.scaled_dot_product_attention(q, k, v, scale=1.0)
395
+ x = rearrange(x, "n nh (h w) e -> n h w (nh e)", h=skip.shape[-3], w=skip.shape[-2])
396
+ x = self.dropout(x)
397
+ x = self.out_proj(x)
398
+ return x + skip
399
+
400
+
401
+ class NeighborhoodSelfAttentionBlock(nn.Module):
402
+ def __init__(self, d_model, d_head, cond_features, kernel_size, dropout=0.0):
403
+ super().__init__()
404
+ self.d_head = d_head
405
+ self.n_heads = d_model // d_head
406
+ self.kernel_size = kernel_size
407
+ self.norm = AdaRMSNorm(d_model, cond_features)
408
+ self.qkv_proj = apply_wd(Linear(d_model, d_model * 3, bias=False))
409
+ self.scale = nn.Parameter(torch.full([self.n_heads], 10.0))
410
+ self.pos_emb = AxialRoPE(d_head // 2, self.n_heads)
411
+ self.dropout = nn.Dropout(dropout)
412
+ self.out_proj = apply_wd(zero_init(Linear(d_model, d_model, bias=False)))
413
+
414
+ def extra_repr(self):
415
+ return f"d_head={self.d_head}, kernel_size={self.kernel_size}"
416
+
417
+ def forward(self, x, pos, cond):
418
+ skip = x
419
+ x = self.norm(x, cond)
420
+ qkv = self.qkv_proj(x)
421
+ if natten is None:
422
+ raise ModuleNotFoundError("natten is required for neighborhood attention")
423
+ if natten.has_fused_na():
424
+ q, k, v = rearrange(qkv, "n h w (t nh e) -> t n h w nh e", t=3, e=self.d_head)
425
+ q, k = scale_for_cosine_sim(q, k, self.scale[:, None], 1e-6)
426
+ theta = self.pos_emb(pos)
427
+ q = apply_rotary_emb_(q, theta)
428
+ k = apply_rotary_emb_(k, theta)
429
+ flops.op(flops.op_natten, q.shape, k.shape, v.shape, self.kernel_size)
430
+ x = natten.functional.na2d(q, k, v, self.kernel_size, scale=1.0)
431
+ x = rearrange(x, "n h w nh e -> n h w (nh e)")
432
+ else:
433
+ q, k, v = rearrange(qkv, "n h w (t nh e) -> t n nh h w e", t=3, e=self.d_head)
434
+ q, k = scale_for_cosine_sim(q, k, self.scale[:, None, None, None], 1e-6)
435
+ theta = self.pos_emb(pos).movedim(-2, -4)
436
+ q = apply_rotary_emb_(q, theta)
437
+ k = apply_rotary_emb_(k, theta)
438
+ flops.op(flops.op_natten, q.shape, k.shape, v.shape, self.kernel_size)
439
+ qk = natten.functional.na2d_qk(q, k, self.kernel_size)
440
+ a = torch.softmax(qk, dim=-1).to(v.dtype)
441
+ x = natten.functional.na2d_av(a, v, self.kernel_size)
442
+ x = rearrange(x, "n nh h w e -> n h w (nh e)")
443
+ x = self.dropout(x)
444
+ x = self.out_proj(x)
445
+ return x + skip
446
+
447
+
448
+ class ShiftedWindowSelfAttentionBlock(nn.Module):
449
+ def __init__(self, d_model, d_head, cond_features, window_size, window_shift, dropout=0.0):
450
+ super().__init__()
451
+ self.d_head = d_head
452
+ self.n_heads = d_model // d_head
453
+ self.window_size = window_size
454
+ self.window_shift = window_shift
455
+ self.norm = AdaRMSNorm(d_model, cond_features)
456
+ self.qkv_proj = apply_wd(Linear(d_model, d_model * 3, bias=False))
457
+ self.scale = nn.Parameter(torch.full([self.n_heads], 10.0))
458
+ self.pos_emb = AxialRoPE(d_head // 2, self.n_heads)
459
+ self.dropout = nn.Dropout(dropout)
460
+ self.out_proj = apply_wd(zero_init(Linear(d_model, d_model, bias=False)))
461
+
462
+ def extra_repr(self):
463
+ return f"d_head={self.d_head}, window_size={self.window_size}, window_shift={self.window_shift}"
464
+
465
+ def forward(self, x, pos, cond):
466
+ skip = x
467
+ x = self.norm(x, cond)
468
+ qkv = self.qkv_proj(x)
469
+ q, k, v = rearrange(qkv, "n h w (t nh e) -> t n nh h w e", t=3, e=self.d_head)
470
+ q, k = scale_for_cosine_sim(q, k, self.scale[:, None, None, None], 1e-6)
471
+ theta = self.pos_emb(pos).movedim(-2, -4)
472
+ q = apply_rotary_emb_(q, theta)
473
+ k = apply_rotary_emb_(k, theta)
474
+ x = apply_window_attention(self.window_size, self.window_shift, q, k, v, scale=1.0)
475
+ x = rearrange(x, "n nh h w e -> n h w (nh e)")
476
+ x = self.dropout(x)
477
+ x = self.out_proj(x)
478
+ return x + skip
479
+
480
+
481
+ class FeedForwardBlock(nn.Module):
482
+ def __init__(self, d_model, d_ff, cond_features, dropout=0.0):
483
+ super().__init__()
484
+ self.norm = AdaRMSNorm(d_model, cond_features)
485
+ self.up_proj = apply_wd(LinearGEGLU(d_model, d_ff, bias=False))
486
+ self.dropout = nn.Dropout(dropout)
487
+ self.down_proj = apply_wd(zero_init(Linear(d_ff, d_model, bias=False)))
488
+
489
+ def forward(self, x, cond):
490
+ skip = x
491
+ x = self.norm(x, cond)
492
+ x = self.up_proj(x)
493
+ x = self.dropout(x)
494
+ x = self.down_proj(x)
495
+ return x + skip
496
+
497
+
498
+ class GlobalTransformerLayer(nn.Module):
499
+ def __init__(self, d_model, d_ff, d_head, cond_features, dropout=0.0):
500
+ super().__init__()
501
+ self.self_attn = SelfAttentionBlock(d_model, d_head, cond_features, dropout=dropout)
502
+ self.ff = FeedForwardBlock(d_model, d_ff, cond_features, dropout=dropout)
503
+
504
+ def forward(self, x, pos, cond):
505
+ x = checkpoint(self.self_attn, x, pos, cond)
506
+ x = checkpoint(self.ff, x, cond)
507
+ return x
508
+
509
+
510
+ class NeighborhoodTransformerLayer(nn.Module):
511
+ def __init__(self, d_model, d_ff, d_head, cond_features, kernel_size, dropout=0.0):
512
+ super().__init__()
513
+ self.self_attn = NeighborhoodSelfAttentionBlock(d_model, d_head, cond_features, kernel_size, dropout=dropout)
514
+ self.ff = FeedForwardBlock(d_model, d_ff, cond_features, dropout=dropout)
515
+
516
+ def forward(self, x, pos, cond):
517
+ x = checkpoint(self.self_attn, x, pos, cond)
518
+ x = checkpoint(self.ff, x, cond)
519
+ return x
520
+
521
+
522
+ class ShiftedWindowTransformerLayer(nn.Module):
523
+ def __init__(self, d_model, d_ff, d_head, cond_features, window_size, index, dropout=0.0):
524
+ super().__init__()
525
+ window_shift = window_size // 2 if index % 2 == 1 else 0
526
+ self.self_attn = ShiftedWindowSelfAttentionBlock(d_model, d_head, cond_features, window_size, window_shift, dropout=dropout)
527
+ self.ff = FeedForwardBlock(d_model, d_ff, cond_features, dropout=dropout)
528
+
529
+ def forward(self, x, pos, cond):
530
+ x = checkpoint(self.self_attn, x, pos, cond)
531
+ x = checkpoint(self.ff, x, cond)
532
+ return x
533
+
534
+
535
+ class NoAttentionTransformerLayer(nn.Module):
536
+ def __init__(self, d_model, d_ff, cond_features, dropout=0.0):
537
+ super().__init__()
538
+ self.ff = FeedForwardBlock(d_model, d_ff, cond_features, dropout=dropout)
539
+
540
+ def forward(self, x, pos, cond):
541
+ x = checkpoint(self.ff, x, cond)
542
+ return x
543
+
544
+
545
+ class Level(nn.ModuleList):
546
+ def forward(self, x, *args, **kwargs):
547
+ for layer in self:
548
+ x = layer(x, *args, **kwargs)
549
+ return x
550
+
551
+
552
+ # Mapping network
553
+
554
+ class MappingFeedForwardBlock(nn.Module):
555
+ def __init__(self, d_model, d_ff, dropout=0.0):
556
+ super().__init__()
557
+ self.norm = RMSNorm(d_model)
558
+ self.up_proj = apply_wd(LinearGEGLU(d_model, d_ff, bias=False))
559
+ self.dropout = nn.Dropout(dropout)
560
+ self.down_proj = apply_wd(zero_init(Linear(d_ff, d_model, bias=False)))
561
+
562
+ def forward(self, x):
563
+ skip = x
564
+ x = self.norm(x)
565
+ x = self.up_proj(x)
566
+ x = self.dropout(x)
567
+ x = self.down_proj(x)
568
+ return x + skip
569
+
570
+
571
+ class MappingNetwork(nn.Module):
572
+ def __init__(self, n_layers, d_model, d_ff, dropout=0.0):
573
+ super().__init__()
574
+ self.in_norm = RMSNorm(d_model)
575
+ self.blocks = nn.ModuleList([MappingFeedForwardBlock(d_model, d_ff, dropout=dropout) for _ in range(n_layers)])
576
+ self.out_norm = RMSNorm(d_model)
577
+
578
+ def forward(self, x):
579
+ x = self.in_norm(x)
580
+ for block in self.blocks:
581
+ x = block(x)
582
+ x = self.out_norm(x)
583
+ return x
584
+
585
+
586
+ # Token merging and splitting
587
+
588
+ class TokenMerge(nn.Module):
589
+ def __init__(self, in_features, out_features, patch_size=(2, 2)):
590
+ super().__init__()
591
+ self.h = patch_size[0]
592
+ self.w = patch_size[1]
593
+ self.proj = apply_wd(Linear(in_features * self.h * self.w, out_features, bias=False))
594
+
595
+ def forward(self, x):
596
+ x = rearrange(x, "... (h nh) (w nw) e -> ... h w (nh nw e)", nh=self.h, nw=self.w)
597
+ return self.proj(x)
598
+
599
+
600
+ class TokenSplitWithoutSkip(nn.Module):
601
+ def __init__(self, in_features, out_features, patch_size=(2, 2)):
602
+ super().__init__()
603
+ self.h = patch_size[0]
604
+ self.w = patch_size[1]
605
+ self.proj = apply_wd(Linear(in_features, out_features * self.h * self.w, bias=False))
606
+
607
+ def forward(self, x):
608
+ x = self.proj(x)
609
+ return rearrange(x, "... h w (nh nw e) -> ... (h nh) (w nw) e", nh=self.h, nw=self.w)
610
+
611
+
612
+ class TokenSplit(nn.Module):
613
+ def __init__(self, in_features, out_features, patch_size=(2, 2)):
614
+ super().__init__()
615
+ self.h = patch_size[0]
616
+ self.w = patch_size[1]
617
+ self.proj = apply_wd(Linear(in_features, out_features * self.h * self.w, bias=False))
618
+ self.fac = nn.Parameter(torch.ones(1) * 0.5)
619
+
620
+ def forward(self, x, skip):
621
+ x = self.proj(x)
622
+ x = rearrange(x, "... h w (nh nw e) -> ... (h nh) (w nw) e", nh=self.h, nw=self.w)
623
+ return torch.lerp(skip, x, self.fac.to(x.dtype))
624
+
625
+
626
+ # Configuration
627
+
628
+ @dataclass
629
+ class GlobalAttentionSpec:
630
+ d_head: int
631
+
632
+
633
+ @dataclass
634
+ class NeighborhoodAttentionSpec:
635
+ d_head: int
636
+ kernel_size: int
637
+
638
+
639
+ @dataclass
640
+ class ShiftedWindowAttentionSpec:
641
+ d_head: int
642
+ window_size: int
643
+
644
+
645
+ @dataclass
646
+ class NoAttentionSpec:
647
+ pass
648
+
649
+
650
+ @dataclass
651
+ class LevelSpec:
652
+ depth: int
653
+ width: int
654
+ d_ff: int
655
+ self_attn: Union[GlobalAttentionSpec, NeighborhoodAttentionSpec, ShiftedWindowAttentionSpec, NoAttentionSpec]
656
+ dropout: float
657
+
658
+
659
+ @dataclass
660
+ class MappingSpec:
661
+ depth: int
662
+ width: int
663
+ d_ff: int
664
+ dropout: float
665
+
666
+
667
+ # Model class
668
+
669
+ class ImageTransformerDenoiserModelV2(nn.Module):
670
+ def __init__(self, levels, mapping, in_channels, out_channels, patch_size, num_classes=0, mapping_cond_dim=0, degradation_params_dim=None):
671
+ super().__init__()
672
+ self.num_classes = num_classes
673
+ self.patch_in = TokenMerge(in_channels, levels[0].width, patch_size)
674
+ self.mapping_width = mapping.width
675
+ self.time_emb = FourierFeatures(1, mapping.width)
676
+ self.time_in_proj = Linear(mapping.width, mapping.width, bias=False)
677
+ self.aug_emb = FourierFeatures(9, mapping.width)
678
+ self.aug_in_proj = Linear(mapping.width, mapping.width, bias=False)
679
+ self.degradation_proj = Linear(degradation_params_dim, mapping.width, bias=False) if degradation_params_dim else None
680
+ self.class_emb = nn.Embedding(num_classes, mapping.width) if num_classes else None
681
+ self.mapping_cond_in_proj = Linear(mapping_cond_dim, mapping.width, bias=False) if mapping_cond_dim else None
682
+ self.mapping = tag_module(MappingNetwork(mapping.depth, mapping.width, mapping.d_ff, dropout=mapping.dropout), "mapping")
683
+
684
+ self.down_levels, self.up_levels = nn.ModuleList(), nn.ModuleList()
685
+ for i, spec in enumerate(levels):
686
+ if isinstance(spec.self_attn, GlobalAttentionSpec):
687
+ layer_factory = lambda _: GlobalTransformerLayer(spec.width, spec.d_ff, spec.self_attn.d_head, mapping.width, dropout=spec.dropout)
688
+ elif isinstance(spec.self_attn, NeighborhoodAttentionSpec):
689
+ layer_factory = lambda _: NeighborhoodTransformerLayer(spec.width, spec.d_ff, spec.self_attn.d_head, mapping.width, spec.self_attn.kernel_size, dropout=spec.dropout)
690
+ elif isinstance(spec.self_attn, ShiftedWindowAttentionSpec):
691
+ layer_factory = lambda i: ShiftedWindowTransformerLayer(spec.width, spec.d_ff, spec.self_attn.d_head, mapping.width, spec.self_attn.window_size, i, dropout=spec.dropout)
692
+ elif isinstance(spec.self_attn, NoAttentionSpec):
693
+ layer_factory = lambda _: NoAttentionTransformerLayer(spec.width, spec.d_ff, mapping.width, dropout=spec.dropout)
694
+ else:
695
+ raise ValueError(f"unsupported self attention spec {spec.self_attn}")
696
+
697
+ if i < len(levels) - 1:
698
+ self.down_levels.append(Level([layer_factory(i) for i in range(spec.depth)]))
699
+ self.up_levels.append(Level([layer_factory(i + spec.depth) for i in range(spec.depth)]))
700
+ else:
701
+ self.mid_level = Level([layer_factory(i) for i in range(spec.depth)])
702
+
703
+ self.merges = nn.ModuleList([TokenMerge(spec_1.width, spec_2.width) for spec_1, spec_2 in zip(levels[:-1], levels[1:])])
704
+ self.splits = nn.ModuleList([TokenSplit(spec_2.width, spec_1.width) for spec_1, spec_2 in zip(levels[:-1], levels[1:])])
705
+
706
+ self.out_norm = RMSNorm(levels[0].width)
707
+ self.patch_out = TokenSplitWithoutSkip(levels[0].width, out_channels, patch_size)
708
+ nn.init.zeros_(self.patch_out.proj.weight)
709
+
710
+ def param_groups(self, base_lr=5e-4, mapping_lr_scale=1 / 3):
711
+ wd = filter_params(lambda tags: "wd" in tags and "mapping" not in tags, self)
712
+ no_wd = filter_params(lambda tags: "wd" not in tags and "mapping" not in tags, self)
713
+ mapping_wd = filter_params(lambda tags: "wd" in tags and "mapping" in tags, self)
714
+ mapping_no_wd = filter_params(lambda tags: "wd" not in tags and "mapping" in tags, self)
715
+ groups = [
716
+ {"params": list(wd), "lr": base_lr},
717
+ {"params": list(no_wd), "lr": base_lr, "weight_decay": 0.0},
718
+ {"params": list(mapping_wd), "lr": base_lr * mapping_lr_scale},
719
+ {"params": list(mapping_no_wd), "lr": base_lr * mapping_lr_scale, "weight_decay": 0.0}
720
+ ]
721
+ return groups
722
+
723
+ def forward(self, x, sigma=None, aug_cond=None, class_cond=None, mapping_cond=None, degradation_params=None):
724
+ # Patching
725
+ x = x.movedim(-3, -1)
726
+ x = self.patch_in(x)
727
+ # TODO: pixel aspect ratio for nonsquare patches
728
+ pos = make_axial_pos(x.shape[-3], x.shape[-2], device=x.device).view(x.shape[-3], x.shape[-2], 2)
729
+
730
+ # Mapping network
731
+ if class_cond is None and self.class_emb is not None:
732
+ raise ValueError("class_cond must be specified if num_classes > 0")
733
+ if mapping_cond is None and self.mapping_cond_in_proj is not None:
734
+ raise ValueError("mapping_cond must be specified if mapping_cond_dim > 0")
735
+
736
+ # c_noise = torch.log(sigma) / 4
737
+ # c_noise = (sigma * 2.0 - 1.0)
738
+ # c_noise = sigma * 2 - 1
739
+ if sigma is not None:
740
+ time_emb = self.time_in_proj(self.time_emb(sigma[..., None]))
741
+ else:
742
+ time_emb = self.time_in_proj(torch.ones(1, 1, device=x.device, dtype=x.dtype).expand(x.shape[0], self.mapping_width))
743
+ # time_emb = self.time_in_proj(sigma[..., None])
744
+
745
+ aug_cond = x.new_zeros([x.shape[0], 9]) if aug_cond is None else aug_cond
746
+ aug_emb = self.aug_in_proj(self.aug_emb(aug_cond))
747
+ class_emb = self.class_emb(class_cond) if self.class_emb is not None else 0
748
+ mapping_emb = self.mapping_cond_in_proj(mapping_cond) if self.mapping_cond_in_proj is not None else 0
749
+ degradation_emb = self.degradation_proj(degradation_params) if degradation_params is not None else 0
750
+ cond = self.mapping(time_emb + aug_emb + class_emb + mapping_emb + degradation_emb)
751
+
752
+ # Hourglass transformer
753
+ skips, poses = [], []
754
+ for down_level, merge in zip(self.down_levels, self.merges):
755
+ x = down_level(x, pos, cond)
756
+ skips.append(x)
757
+ poses.append(pos)
758
+ x = merge(x)
759
+ pos = downscale_pos(pos)
760
+
761
+ x = self.mid_level(x, pos, cond)
762
+
763
+ for up_level, split, skip, pos in reversed(list(zip(self.up_levels, self.splits, skips, poses))):
764
+ x = split(x, skip)
765
+ x = up_level(x, pos, cond)
766
+
767
+ # Unpatching
768
+ x = self.out_norm(x)
769
+ x = self.patch_out(x)
770
+ x = x.movedim(-1, -3)
771
+
772
+ return x
arch/swinir/__init__.py ADDED
File without changes
arch/swinir/swinir.py ADDED
@@ -0,0 +1,904 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -----------------------------------------------------------------------------------
2
+ # SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
3
+ # Originally Written by Ze Liu, Modified by Jingyun Liang.
4
+ # -----------------------------------------------------------------------------------
5
+ # Borrowed from DifFace (https://github.com/zsyOAOA/DifFace/blob/master/models/swinir.py)
6
+
7
+ import math
8
+ from typing import Set
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import torch.utils.checkpoint as checkpoint
14
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
19
+ super().__init__()
20
+ out_features = out_features or in_features
21
+ hidden_features = hidden_features or in_features
22
+ self.fc1 = nn.Linear(in_features, hidden_features)
23
+ self.act = act_layer()
24
+ self.fc2 = nn.Linear(hidden_features, out_features)
25
+ self.drop = nn.Dropout(drop)
26
+
27
+ def forward(self, x):
28
+ x = self.fc1(x)
29
+ x = self.act(x)
30
+ x = self.drop(x)
31
+ x = self.fc2(x)
32
+ x = self.drop(x)
33
+ return x
34
+
35
+
36
+ def window_partition(x, window_size):
37
+ """
38
+ Args:
39
+ x: (B, H, W, C)
40
+ window_size (int): window size
41
+
42
+ Returns:
43
+ windows: (num_windows*B, window_size, window_size, C)
44
+ """
45
+ B, H, W, C = x.shape
46
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
47
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
48
+ return windows
49
+
50
+
51
+ def window_reverse(windows, window_size, H, W):
52
+ """
53
+ Args:
54
+ windows: (num_windows*B, window_size, window_size, C)
55
+ window_size (int): Window size
56
+ H (int): Height of image
57
+ W (int): Width of image
58
+
59
+ Returns:
60
+ x: (B, H, W, C)
61
+ """
62
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
63
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
64
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
65
+ return x
66
+
67
+
68
+ class WindowAttention(nn.Module):
69
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
70
+ It supports both of shifted and non-shifted window.
71
+
72
+ Args:
73
+ dim (int): Number of input channels.
74
+ window_size (tuple[int]): The height and width of the window.
75
+ num_heads (int): Number of attention heads.
76
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
77
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
78
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
79
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
80
+ """
81
+
82
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
83
+
84
+ super().__init__()
85
+ self.dim = dim
86
+ self.window_size = window_size # Wh, Ww
87
+ self.num_heads = num_heads
88
+ head_dim = dim // num_heads
89
+ self.scale = qk_scale or head_dim ** -0.5
90
+
91
+ # define a parameter table of relative position bias
92
+ self.relative_position_bias_table = nn.Parameter(
93
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
94
+
95
+ # get pair-wise relative position index for each token inside the window
96
+ coords_h = torch.arange(self.window_size[0])
97
+ coords_w = torch.arange(self.window_size[1])
98
+ # coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
99
+ # Fix: Pass indexing="ij" to avoid warning
100
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij")) # 2, Wh, Ww
101
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
102
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
103
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
104
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
105
+ relative_coords[:, :, 1] += self.window_size[1] - 1
106
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
107
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
108
+ self.register_buffer("relative_position_index", relative_position_index)
109
+
110
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
111
+ self.attn_drop = nn.Dropout(attn_drop)
112
+ self.proj = nn.Linear(dim, dim)
113
+
114
+ self.proj_drop = nn.Dropout(proj_drop)
115
+
116
+ trunc_normal_(self.relative_position_bias_table, std=.02)
117
+ self.softmax = nn.Softmax(dim=-1)
118
+
119
+ def forward(self, x, mask=None):
120
+ """
121
+ Args:
122
+ x: input features with shape of (num_windows*B, N, C)
123
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
124
+ """
125
+ B_, N, C = x.shape
126
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
127
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
128
+
129
+ q = q * self.scale
130
+ attn = (q @ k.transpose(-2, -1))
131
+
132
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
133
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
134
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
135
+ attn = attn + relative_position_bias.unsqueeze(0)
136
+
137
+ if mask is not None:
138
+ nW = mask.shape[0]
139
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
140
+ attn = attn.view(-1, self.num_heads, N, N)
141
+ attn = self.softmax(attn)
142
+ else:
143
+ attn = self.softmax(attn)
144
+
145
+ attn = self.attn_drop(attn)
146
+
147
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
148
+ x = self.proj(x)
149
+ x = self.proj_drop(x)
150
+ return x
151
+
152
+ def extra_repr(self) -> str:
153
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
154
+
155
+ def flops(self, N):
156
+ # calculate flops for 1 window with token length of N
157
+ flops = 0
158
+ # qkv = self.qkv(x)
159
+ flops += N * self.dim * 3 * self.dim
160
+ # attn = (q @ k.transpose(-2, -1))
161
+ flops += self.num_heads * N * (self.dim // self.num_heads) * N
162
+ # x = (attn @ v)
163
+ flops += self.num_heads * N * N * (self.dim // self.num_heads)
164
+ # x = self.proj(x)
165
+ flops += N * self.dim * self.dim
166
+ return flops
167
+
168
+
169
+ class SwinTransformerBlock(nn.Module):
170
+ r""" Swin Transformer Block.
171
+
172
+ Args:
173
+ dim (int): Number of input channels.
174
+ input_resolution (tuple[int]): Input resulotion.
175
+ num_heads (int): Number of attention heads.
176
+ window_size (int): Window size.
177
+ shift_size (int): Shift size for SW-MSA.
178
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
179
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
180
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
181
+ drop (float, optional): Dropout rate. Default: 0.0
182
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
183
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
184
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
185
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
186
+ """
187
+
188
+ def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
189
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
190
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
191
+ super().__init__()
192
+ self.dim = dim
193
+ self.input_resolution = input_resolution
194
+ self.num_heads = num_heads
195
+ self.window_size = window_size
196
+ self.shift_size = shift_size
197
+ self.mlp_ratio = mlp_ratio
198
+ if min(self.input_resolution) <= self.window_size:
199
+ # if window size is larger than input resolution, we don't partition windows
200
+ self.shift_size = 0
201
+ self.window_size = min(self.input_resolution)
202
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
203
+
204
+ self.norm1 = norm_layer(dim)
205
+ self.attn = WindowAttention(
206
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
207
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
208
+
209
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
210
+ self.norm2 = norm_layer(dim)
211
+ mlp_hidden_dim = int(dim * mlp_ratio)
212
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
213
+
214
+ if self.shift_size > 0:
215
+ attn_mask = self.calculate_mask(self.input_resolution)
216
+ else:
217
+ attn_mask = None
218
+
219
+ self.register_buffer("attn_mask", attn_mask)
220
+
221
+ def calculate_mask(self, x_size):
222
+ # calculate attention mask for SW-MSA
223
+ H, W = x_size
224
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
225
+ h_slices = (slice(0, -self.window_size),
226
+ slice(-self.window_size, -self.shift_size),
227
+ slice(-self.shift_size, None))
228
+ w_slices = (slice(0, -self.window_size),
229
+ slice(-self.window_size, -self.shift_size),
230
+ slice(-self.shift_size, None))
231
+ cnt = 0
232
+ for h in h_slices:
233
+ for w in w_slices:
234
+ img_mask[:, h, w, :] = cnt
235
+ cnt += 1
236
+
237
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
238
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
239
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
240
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
241
+
242
+ return attn_mask
243
+
244
+ def forward(self, x, x_size):
245
+ H, W = x_size
246
+ B, L, C = x.shape
247
+ # assert L == H * W, "input feature has wrong size"
248
+
249
+ shortcut = x
250
+ x = self.norm1(x)
251
+ x = x.view(B, H, W, C)
252
+
253
+ # cyclic shift
254
+ if self.shift_size > 0:
255
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
256
+ else:
257
+ shifted_x = x
258
+
259
+ # partition windows
260
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
261
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
262
+
263
+ # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
264
+ if self.input_resolution == x_size:
265
+ attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
266
+ else:
267
+ attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
268
+
269
+ # merge windows
270
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
271
+ shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
272
+
273
+ # reverse cyclic shift
274
+ if self.shift_size > 0:
275
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
276
+ else:
277
+ x = shifted_x
278
+ x = x.view(B, H * W, C)
279
+
280
+ # FFN
281
+ x = shortcut + self.drop_path(x)
282
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
283
+
284
+ return x
285
+
286
+ def extra_repr(self) -> str:
287
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
288
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
289
+
290
+ def flops(self):
291
+ flops = 0
292
+ H, W = self.input_resolution
293
+ # norm1
294
+ flops += self.dim * H * W
295
+ # W-MSA/SW-MSA
296
+ nW = H * W / self.window_size / self.window_size
297
+ flops += nW * self.attn.flops(self.window_size * self.window_size)
298
+ # mlp
299
+ flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
300
+ # norm2
301
+ flops += self.dim * H * W
302
+ return flops
303
+
304
+
305
+ class PatchMerging(nn.Module):
306
+ r""" Patch Merging Layer.
307
+
308
+ Args:
309
+ input_resolution (tuple[int]): Resolution of input feature.
310
+ dim (int): Number of input channels.
311
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
312
+ """
313
+
314
+ def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
315
+ super().__init__()
316
+ self.input_resolution = input_resolution
317
+ self.dim = dim
318
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
319
+ self.norm = norm_layer(4 * dim)
320
+
321
+ def forward(self, x):
322
+ """
323
+ x: B, H*W, C
324
+ """
325
+ H, W = self.input_resolution
326
+ B, L, C = x.shape
327
+ assert L == H * W, "input feature has wrong size"
328
+ assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
329
+
330
+ x = x.view(B, H, W, C)
331
+
332
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
333
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
334
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
335
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
336
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
337
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
338
+
339
+ x = self.norm(x)
340
+ x = self.reduction(x)
341
+
342
+ return x
343
+
344
+ def extra_repr(self) -> str:
345
+ return f"input_resolution={self.input_resolution}, dim={self.dim}"
346
+
347
+ def flops(self):
348
+ H, W = self.input_resolution
349
+ flops = H * W * self.dim
350
+ flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
351
+ return flops
352
+
353
+
354
+ class BasicLayer(nn.Module):
355
+ """ A basic Swin Transformer layer for one stage.
356
+
357
+ Args:
358
+ dim (int): Number of input channels.
359
+ input_resolution (tuple[int]): Input resolution.
360
+ depth (int): Number of blocks.
361
+ num_heads (int): Number of attention heads.
362
+ window_size (int): Local window size.
363
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
364
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
365
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
366
+ drop (float, optional): Dropout rate. Default: 0.0
367
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
368
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
369
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
370
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
371
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
372
+ """
373
+
374
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size,
375
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
376
+ drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
377
+
378
+ super().__init__()
379
+ self.dim = dim
380
+ self.input_resolution = input_resolution
381
+ self.depth = depth
382
+ self.use_checkpoint = use_checkpoint
383
+
384
+ # build blocks
385
+ self.blocks = nn.ModuleList([
386
+ SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
387
+ num_heads=num_heads, window_size=window_size,
388
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
389
+ mlp_ratio=mlp_ratio,
390
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
391
+ drop=drop, attn_drop=attn_drop,
392
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
393
+ norm_layer=norm_layer)
394
+ for i in range(depth)])
395
+
396
+ # patch merging layer
397
+ if downsample is not None:
398
+ self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
399
+ else:
400
+ self.downsample = None
401
+
402
+ def forward(self, x, x_size):
403
+ for blk in self.blocks:
404
+ if self.use_checkpoint:
405
+ x = checkpoint.checkpoint(blk, x, x_size)
406
+ else:
407
+ x = blk(x, x_size)
408
+ if self.downsample is not None:
409
+ x = self.downsample(x)
410
+ return x
411
+
412
+ def extra_repr(self) -> str:
413
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
414
+
415
+ def flops(self):
416
+ flops = 0
417
+ for blk in self.blocks:
418
+ flops += blk.flops()
419
+ if self.downsample is not None:
420
+ flops += self.downsample.flops()
421
+ return flops
422
+
423
+
424
+ class RSTB(nn.Module):
425
+ """Residual Swin Transformer Block (RSTB).
426
+
427
+ Args:
428
+ dim (int): Number of input channels.
429
+ input_resolution (tuple[int]): Input resolution.
430
+ depth (int): Number of blocks.
431
+ num_heads (int): Number of attention heads.
432
+ window_size (int): Local window size.
433
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
434
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
435
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
436
+ drop (float, optional): Dropout rate. Default: 0.0
437
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
438
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
439
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
440
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
441
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
442
+ img_size: Input image size.
443
+ patch_size: Patch size.
444
+ resi_connection: The convolutional block before residual connection.
445
+ """
446
+
447
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size,
448
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
449
+ drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
450
+ img_size=224, patch_size=4, resi_connection='1conv'):
451
+ super(RSTB, self).__init__()
452
+
453
+ self.dim = dim
454
+ self.input_resolution = input_resolution
455
+
456
+ self.residual_group = BasicLayer(dim=dim,
457
+ input_resolution=input_resolution,
458
+ depth=depth,
459
+ num_heads=num_heads,
460
+ window_size=window_size,
461
+ mlp_ratio=mlp_ratio,
462
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
463
+ drop=drop, attn_drop=attn_drop,
464
+ drop_path=drop_path,
465
+ norm_layer=norm_layer,
466
+ downsample=downsample,
467
+ use_checkpoint=use_checkpoint)
468
+
469
+ if resi_connection == '1conv':
470
+ self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
471
+ elif resi_connection == '3conv':
472
+ # to save parameters and memory
473
+ self.conv = nn.Sequential(nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
474
+ nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
475
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
476
+ nn.Conv2d(dim // 4, dim, 3, 1, 1))
477
+
478
+ self.patch_embed = PatchEmbed(
479
+ img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
480
+ norm_layer=None)
481
+
482
+ self.patch_unembed = PatchUnEmbed(
483
+ img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
484
+ norm_layer=None)
485
+
486
+ def forward(self, x, x_size):
487
+ return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
488
+
489
+ def flops(self):
490
+ flops = 0
491
+ flops += self.residual_group.flops()
492
+ H, W = self.input_resolution
493
+ flops += H * W * self.dim * self.dim * 9
494
+ flops += self.patch_embed.flops()
495
+ flops += self.patch_unembed.flops()
496
+
497
+ return flops
498
+
499
+
500
+ class PatchEmbed(nn.Module):
501
+ r""" Image to Patch Embedding
502
+
503
+ Args:
504
+ img_size (int): Image size. Default: 224.
505
+ patch_size (int): Patch token size. Default: 4.
506
+ in_chans (int): Number of input image channels. Default: 3.
507
+ embed_dim (int): Number of linear projection output channels. Default: 96.
508
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
509
+ """
510
+
511
+ def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
512
+ super().__init__()
513
+ img_size = to_2tuple(img_size)
514
+ patch_size = to_2tuple(patch_size)
515
+ patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
516
+ self.img_size = img_size
517
+ self.patch_size = patch_size
518
+ self.patches_resolution = patches_resolution
519
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
520
+
521
+ self.in_chans = in_chans
522
+ self.embed_dim = embed_dim
523
+
524
+ if norm_layer is not None:
525
+ self.norm = norm_layer(embed_dim)
526
+ else:
527
+ self.norm = None
528
+
529
+ def forward(self, x):
530
+ x = x.flatten(2).transpose(1, 2) # B Ph*Pw C
531
+ if self.norm is not None:
532
+ x = self.norm(x)
533
+ return x
534
+
535
+ def flops(self):
536
+ flops = 0
537
+ H, W = self.img_size
538
+ if self.norm is not None:
539
+ flops += H * W * self.embed_dim
540
+ return flops
541
+
542
+
543
+ class PatchUnEmbed(nn.Module):
544
+ r""" Image to Patch Unembedding
545
+
546
+ Args:
547
+ img_size (int): Image size. Default: 224.
548
+ patch_size (int): Patch token size. Default: 4.
549
+ in_chans (int): Number of input image channels. Default: 3.
550
+ embed_dim (int): Number of linear projection output channels. Default: 96.
551
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
552
+ """
553
+
554
+ def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
555
+ super().__init__()
556
+ img_size = to_2tuple(img_size)
557
+ patch_size = to_2tuple(patch_size)
558
+ patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
559
+ self.img_size = img_size
560
+ self.patch_size = patch_size
561
+ self.patches_resolution = patches_resolution
562
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
563
+
564
+ self.in_chans = in_chans
565
+ self.embed_dim = embed_dim
566
+
567
+ def forward(self, x, x_size):
568
+ B, HW, C = x.shape
569
+ x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1]) # B Ph*Pw C
570
+ return x
571
+
572
+ def flops(self):
573
+ flops = 0
574
+ return flops
575
+
576
+
577
+ class Upsample(nn.Sequential):
578
+ """Upsample module.
579
+
580
+ Args:
581
+ scale (int): Scale factor. Supported scales: 2^n and 3.
582
+ num_feat (int): Channel number of intermediate features.
583
+ """
584
+
585
+ def __init__(self, scale, num_feat):
586
+ m = []
587
+ if (scale & (scale - 1)) == 0: # scale = 2^n
588
+ for _ in range(int(math.log(scale, 2))):
589
+ m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
590
+ m.append(nn.PixelShuffle(2))
591
+ elif scale == 3:
592
+ m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
593
+ m.append(nn.PixelShuffle(3))
594
+ else:
595
+ raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
596
+ super(Upsample, self).__init__(*m)
597
+
598
+
599
+ class UpsampleOneStep(nn.Sequential):
600
+ """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
601
+ Used in lightweight SR to save parameters.
602
+
603
+ Args:
604
+ scale (int): Scale factor. Supported scales: 2^n and 3.
605
+ num_feat (int): Channel number of intermediate features.
606
+
607
+ """
608
+
609
+ def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
610
+ self.num_feat = num_feat
611
+ self.input_resolution = input_resolution
612
+ m = []
613
+ m.append(nn.Conv2d(num_feat, (scale ** 2) * num_out_ch, 3, 1, 1))
614
+ m.append(nn.PixelShuffle(scale))
615
+ super(UpsampleOneStep, self).__init__(*m)
616
+
617
+ def flops(self):
618
+ H, W = self.input_resolution
619
+ flops = H * W * self.num_feat * 3 * 9
620
+ return flops
621
+
622
+
623
+ class SwinIR(nn.Module):
624
+ r""" SwinIR
625
+ A PyTorch impl of : `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer.
626
+
627
+ Args:
628
+ img_size (int | tuple(int)): Input image size. Default 64
629
+ patch_size (int | tuple(int)): Patch size. Default: 1
630
+ in_chans (int): Number of input image channels. Default: 3
631
+ embed_dim (int): Patch embedding dimension. Default: 96
632
+ depths (tuple(int)): Depth of each Swin Transformer layer.
633
+ num_heads (tuple(int)): Number of attention heads in different layers.
634
+ window_size (int): Window size. Default: 7
635
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
636
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
637
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
638
+ drop_rate (float): Dropout rate. Default: 0
639
+ attn_drop_rate (float): Attention dropout rate. Default: 0
640
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1
641
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
642
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
643
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True
644
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
645
+ sf: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
646
+ img_range: Image range. 1. or 255.
647
+ upsampler: The reconstruction reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
648
+ resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
649
+ """
650
+
651
+ def __init__(
652
+ self,
653
+ img_size=64,
654
+ patch_size=1,
655
+ in_chans=3,
656
+ num_out_ch=3,
657
+ embed_dim=96,
658
+ depths=[6, 6, 6, 6],
659
+ num_heads=[6, 6, 6, 6],
660
+ window_size=7,
661
+ mlp_ratio=4.,
662
+ qkv_bias=True,
663
+ qk_scale=None,
664
+ drop_rate=0.,
665
+ attn_drop_rate=0.,
666
+ drop_path_rate=0.1,
667
+ norm_layer=nn.LayerNorm,
668
+ ape=False,
669
+ patch_norm=True,
670
+ use_checkpoint=False,
671
+ sf=4,
672
+ img_range=1.,
673
+ upsampler='',
674
+ resi_connection='1conv',
675
+ unshuffle=False,
676
+ unshuffle_scale=None,
677
+ hq_key: str = "jpg",
678
+ lq_key: str = "hint",
679
+ learning_rate: float = None,
680
+ weight_decay: float = None
681
+ ) -> "SwinIR":
682
+ super(SwinIR, self).__init__()
683
+ num_in_ch = in_chans * (unshuffle_scale ** 2) if unshuffle else in_chans
684
+ num_feat = 64
685
+ self.img_range = img_range
686
+ if in_chans == 3:
687
+ rgb_mean = (0.4488, 0.4371, 0.4040)
688
+ self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
689
+ else:
690
+ self.mean = torch.zeros(1, 1, 1, 1)
691
+ self.upscale = sf
692
+ self.upsampler = upsampler
693
+ self.window_size = window_size
694
+ self.unshuffle_scale = unshuffle_scale
695
+ self.unshuffle = unshuffle
696
+
697
+ #####################################################################################################
698
+ ################################### 1, shallow feature extraction ###################################
699
+ if unshuffle:
700
+ assert unshuffle_scale is not None
701
+ self.conv_first = nn.Sequential(
702
+ nn.PixelUnshuffle(sf),
703
+ nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1),
704
+ )
705
+ else:
706
+ self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
707
+
708
+ #####################################################################################################
709
+ ################################### 2, deep feature extraction ######################################
710
+ self.num_layers = len(depths)
711
+ self.embed_dim = embed_dim
712
+ self.ape = ape
713
+ self.patch_norm = patch_norm
714
+ self.num_features = embed_dim
715
+ self.mlp_ratio = mlp_ratio
716
+
717
+ # split image into non-overlapping patches
718
+ self.patch_embed = PatchEmbed(
719
+ img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
720
+ norm_layer=norm_layer if self.patch_norm else None
721
+ )
722
+ num_patches = self.patch_embed.num_patches
723
+ patches_resolution = self.patch_embed.patches_resolution
724
+ self.patches_resolution = patches_resolution
725
+
726
+ # merge non-overlapping patches into image
727
+ self.patch_unembed = PatchUnEmbed(
728
+ img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
729
+ norm_layer=norm_layer if self.patch_norm else None
730
+ )
731
+
732
+ # absolute position embedding
733
+ if self.ape:
734
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
735
+ trunc_normal_(self.absolute_pos_embed, std=.02)
736
+
737
+ self.pos_drop = nn.Dropout(p=drop_rate)
738
+
739
+ # stochastic depth
740
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
741
+
742
+ # build Residual Swin Transformer blocks (RSTB)
743
+ self.layers = nn.ModuleList()
744
+ for i_layer in range(self.num_layers):
745
+ layer = RSTB(
746
+ dim=embed_dim,
747
+ input_resolution=(patches_resolution[0], patches_resolution[1]),
748
+ depth=depths[i_layer],
749
+ num_heads=num_heads[i_layer],
750
+ window_size=window_size,
751
+ mlp_ratio=self.mlp_ratio,
752
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
753
+ drop=drop_rate, attn_drop=attn_drop_rate,
754
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], # no impact on SR results
755
+ norm_layer=norm_layer,
756
+ downsample=None,
757
+ use_checkpoint=use_checkpoint,
758
+ img_size=img_size,
759
+ patch_size=patch_size,
760
+ resi_connection=resi_connection
761
+ )
762
+ self.layers.append(layer)
763
+ self.norm = norm_layer(self.num_features)
764
+
765
+ # build the last conv layer in deep feature extraction
766
+ if resi_connection == '1conv':
767
+ self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
768
+ elif resi_connection == '3conv':
769
+ # to save parameters and memory
770
+ self.conv_after_body = nn.Sequential(
771
+ nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
772
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
773
+ nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
774
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
775
+ nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1)
776
+ )
777
+
778
+ #####################################################################################################
779
+ ################################ 3, high quality image reconstruction ################################
780
+ if self.upsampler == 'pixelshuffle':
781
+ # for classical SR
782
+ self.conv_before_upsample = nn.Sequential(
783
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
784
+ nn.LeakyReLU(inplace=True)
785
+ )
786
+ self.upsample = Upsample(sf, num_feat)
787
+ self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
788
+ elif self.upsampler == 'pixelshuffledirect':
789
+ # for lightweight SR (to save parameters)
790
+ self.upsample = UpsampleOneStep(
791
+ sf, embed_dim, num_out_ch,
792
+ (patches_resolution[0], patches_resolution[1])
793
+ )
794
+ elif self.upsampler == 'nearest+conv':
795
+ # for real-world SR (less artifacts)
796
+ self.conv_before_upsample = nn.Sequential(
797
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
798
+ nn.LeakyReLU(inplace=True)
799
+ )
800
+ self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
801
+ if self.upscale == 4:
802
+ self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
803
+ elif self.upscale == 8:
804
+ self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
805
+ self.conv_up3 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
806
+ self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
807
+ self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
808
+ self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
809
+ else:
810
+ # for image denoising and JPEG compression artifact reduction
811
+ self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
812
+
813
+ self.apply(self._init_weights)
814
+
815
+ def _init_weights(self, m: nn.Module) -> None:
816
+ if isinstance(m, nn.Linear):
817
+ trunc_normal_(m.weight, std=.02)
818
+ if isinstance(m, nn.Linear) and m.bias is not None:
819
+ nn.init.constant_(m.bias, 0)
820
+ elif isinstance(m, nn.LayerNorm):
821
+ nn.init.constant_(m.bias, 0)
822
+ nn.init.constant_(m.weight, 1.0)
823
+
824
+ # TODO: What's this ?
825
+ @torch.jit.ignore
826
+ def no_weight_decay(self) -> Set[str]:
827
+ return {'absolute_pos_embed'}
828
+
829
+ @torch.jit.ignore
830
+ def no_weight_decay_keywords(self) -> Set[str]:
831
+ return {'relative_position_bias_table'}
832
+
833
+ def check_image_size(self, x: torch.Tensor) -> torch.Tensor:
834
+ _, _, h, w = x.size()
835
+ mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
836
+ mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
837
+ x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
838
+ return x
839
+
840
+ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
841
+ x_size = (x.shape[2], x.shape[3])
842
+ x = self.patch_embed(x)
843
+ if self.ape:
844
+ x = x + self.absolute_pos_embed
845
+ x = self.pos_drop(x)
846
+
847
+ for layer in self.layers:
848
+ x = layer(x, x_size)
849
+
850
+ x = self.norm(x) # B L C
851
+ x = self.patch_unembed(x, x_size)
852
+
853
+ return x
854
+
855
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
856
+ H, W = x.shape[2:]
857
+ x = self.check_image_size(x)
858
+
859
+ self.mean = self.mean.type_as(x)
860
+ x = (x - self.mean) * self.img_range
861
+
862
+ if self.upsampler == 'pixelshuffle':
863
+ # for classical SR
864
+ x = self.conv_first(x)
865
+ x = self.conv_after_body(self.forward_features(x)) + x
866
+ x = self.conv_before_upsample(x)
867
+ x = self.conv_last(self.upsample(x))
868
+ elif self.upsampler == 'pixelshuffledirect':
869
+ # for lightweight SR
870
+ x = self.conv_first(x)
871
+ x = self.conv_after_body(self.forward_features(x)) + x
872
+ x = self.upsample(x)
873
+ elif self.upsampler == 'nearest+conv':
874
+ # for real-world SR
875
+ x = self.conv_first(x)
876
+ x = self.conv_after_body(self.forward_features(x)) + x
877
+ x = self.conv_before_upsample(x)
878
+ x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
879
+ if self.upscale == 4:
880
+ x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
881
+ elif self.upscale == 8:
882
+ x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
883
+ x = self.lrelu(self.conv_up3(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
884
+ x = self.conv_last(self.lrelu(self.conv_hr(x)))
885
+ else:
886
+ # for image denoising and JPEG compression artifact reduction
887
+ x_first = self.conv_first(x)
888
+ res = self.conv_after_body(self.forward_features(x_first)) + x_first
889
+ x = x + self.conv_last(res)
890
+
891
+ x = x / self.img_range + self.mean
892
+
893
+ return x[:, :, :H * self.upscale, :W * self.upscale]
894
+
895
+ def flops(self) -> int:
896
+ flops = 0
897
+ H, W = self.patches_resolution
898
+ flops += H * W * 3 * self.embed_dim * 9
899
+ flops += self.patch_embed.flops()
900
+ for i, layer in enumerate(self.layers):
901
+ flops += layer.flops()
902
+ flops += H * W * 3 * self.embed_dim * self.embed_dim
903
+ flops += self.upsample.flops()
904
+ return flops
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ffmpeg
2
+ libsm6
3
+ libxext6
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.2.2
2
+ facexlib==0.2.5
3
+ realesrgan==0.2.5
4
+ numpy
5
+ opencv-python
6
+ torchvision
7
+ pytorch-lightning==2.4.0
8
+ scipy
9
+ tqdm
10
+ lmdb
11
+ pyyaml
12
+ basicsr==1.4.2
13
+ yapf
14
+ dctorch
15
+ einops
16
+ torch-ema==0.3
17
+ huggingface_hub==0.24.5
18
+ natten==0.17.1
19
+ wandb
20
+ timm
21
+ huggingface_hub==0.24.5
utils/__init__.py ADDED
File without changes
utils/basicsr_custom.py ADDED
@@ -0,0 +1,954 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/data/degradations.py
2
+ # Copyright (c) OpenMMLab. All rights reserved.
3
+ # https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
4
+
5
+ import math
6
+ import random
7
+ import re
8
+ from abc import ABCMeta, abstractmethod
9
+ from pathlib import Path
10
+ from typing import List, Dict
11
+ from typing import Mapping, Any
12
+ from typing import Optional, Union
13
+
14
+ import cv2
15
+ import numpy as np
16
+ import torch
17
+ from PIL import Image
18
+ from scipy import special
19
+ from scipy.stats import multivariate_normal
20
+ from torch import Tensor
21
+ # from torchvision.transforms.functional_tensor import rgb_to_grayscale
22
+ from torchvision.transforms._functional_tensor import rgb_to_grayscale
23
+
24
+
25
+ # -------------------------------------------------------------------- #
26
+ # --------------------------- blur kernels --------------------------- #
27
+ # -------------------------------------------------------------------- #
28
+
29
+
30
+ # --------------------------- util functions --------------------------- #
31
+ def sigma_matrix2(sig_x, sig_y, theta):
32
+ """Calculate the rotated sigma matrix (two dimensional matrix).
33
+
34
+ Args:
35
+ sig_x (float):
36
+ sig_y (float):
37
+ theta (float): Radian measurement.
38
+
39
+ Returns:
40
+ ndarray: Rotated sigma matrix.
41
+ """
42
+ d_matrix = np.array([[sig_x ** 2, 0], [0, sig_y ** 2]])
43
+ u_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
44
+ return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
45
+
46
+
47
+ def mesh_grid(kernel_size):
48
+ """Generate the mesh grid, centering at zero.
49
+
50
+ Args:
51
+ kernel_size (int):
52
+
53
+ Returns:
54
+ xy (ndarray): with the shape (kernel_size, kernel_size, 2)
55
+ xx (ndarray): with the shape (kernel_size, kernel_size)
56
+ yy (ndarray): with the shape (kernel_size, kernel_size)
57
+ """
58
+ ax = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
59
+ xx, yy = np.meshgrid(ax, ax)
60
+ xy = np.hstack((xx.reshape((kernel_size * kernel_size, 1)), yy.reshape(kernel_size * kernel_size,
61
+ 1))).reshape(kernel_size, kernel_size, 2)
62
+ return xy, xx, yy
63
+
64
+
65
+ def pdf2(sigma_matrix, grid):
66
+ """Calculate PDF of the bivariate Gaussian distribution.
67
+
68
+ Args:
69
+ sigma_matrix (ndarray): with the shape (2, 2)
70
+ grid (ndarray): generated by :func:`mesh_grid`,
71
+ with the shape (K, K, 2), K is the kernel size.
72
+
73
+ Returns:
74
+ kernel (ndarrray): un-normalized kernel.
75
+ """
76
+ inverse_sigma = np.linalg.inv(sigma_matrix)
77
+ kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2))
78
+ return kernel
79
+
80
+
81
+ def cdf2(d_matrix, grid):
82
+ """Calculate the CDF of the standard bivariate Gaussian distribution.
83
+ Used in skewed Gaussian distribution.
84
+
85
+ Args:
86
+ d_matrix (ndarrasy): skew matrix.
87
+ grid (ndarray): generated by :func:`mesh_grid`,
88
+ with the shape (K, K, 2), K is the kernel size.
89
+
90
+ Returns:
91
+ cdf (ndarray): skewed cdf.
92
+ """
93
+ rv = multivariate_normal([0, 0], [[1, 0], [0, 1]])
94
+ grid = np.dot(grid, d_matrix)
95
+ cdf = rv.cdf(grid)
96
+ return cdf
97
+
98
+
99
+ def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True):
100
+ """Generate a bivariate isotropic or anisotropic Gaussian kernel.
101
+
102
+ In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` is ignored.
103
+
104
+ Args:
105
+ kernel_size (int):
106
+ sig_x (float):
107
+ sig_y (float):
108
+ theta (float): Radian measurement.
109
+ grid (ndarray, optional): generated by :func:`mesh_grid`,
110
+ with the shape (K, K, 2), K is the kernel size. Default: None
111
+ isotropic (bool):
112
+
113
+ Returns:
114
+ kernel (ndarray): normalized kernel.
115
+ """
116
+ if grid is None:
117
+ grid, _, _ = mesh_grid(kernel_size)
118
+ if isotropic:
119
+ sigma_matrix = np.array([[sig_x ** 2, 0], [0, sig_x ** 2]])
120
+ else:
121
+ sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
122
+ kernel = pdf2(sigma_matrix, grid)
123
+ kernel = kernel / np.sum(kernel)
124
+ return kernel
125
+
126
+
127
+ def bivariate_generalized_Gaussian(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
128
+ """Generate a bivariate generalized Gaussian kernel.
129
+
130
+ ``Paper: Parameter Estimation For Multivariate Generalized Gaussian Distributions``
131
+
132
+ In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` is ignored.
133
+
134
+ Args:
135
+ kernel_size (int):
136
+ sig_x (float):
137
+ sig_y (float):
138
+ theta (float): Radian measurement.
139
+ beta (float): shape parameter, beta = 1 is the normal distribution.
140
+ grid (ndarray, optional): generated by :func:`mesh_grid`,
141
+ with the shape (K, K, 2), K is the kernel size. Default: None
142
+
143
+ Returns:
144
+ kernel (ndarray): normalized kernel.
145
+ """
146
+ if grid is None:
147
+ grid, _, _ = mesh_grid(kernel_size)
148
+ if isotropic:
149
+ sigma_matrix = np.array([[sig_x ** 2, 0], [0, sig_x ** 2]])
150
+ else:
151
+ sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
152
+ inverse_sigma = np.linalg.inv(sigma_matrix)
153
+ kernel = np.exp(-0.5 * np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta))
154
+ kernel = kernel / np.sum(kernel)
155
+ return kernel
156
+
157
+
158
+ def bivariate_plateau(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
159
+ """Generate a plateau-like anisotropic kernel.
160
+
161
+ 1 / (1+x^(beta))
162
+
163
+ Reference: https://stats.stackexchange.com/questions/203629/is-there-a-plateau-shaped-distribution
164
+
165
+ In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` is ignored.
166
+
167
+ Args:
168
+ kernel_size (int):
169
+ sig_x (float):
170
+ sig_y (float):
171
+ theta (float): Radian measurement.
172
+ beta (float): shape parameter, beta = 1 is the normal distribution.
173
+ grid (ndarray, optional): generated by :func:`mesh_grid`,
174
+ with the shape (K, K, 2), K is the kernel size. Default: None
175
+
176
+ Returns:
177
+ kernel (ndarray): normalized kernel.
178
+ """
179
+ if grid is None:
180
+ grid, _, _ = mesh_grid(kernel_size)
181
+ if isotropic:
182
+ sigma_matrix = np.array([[sig_x ** 2, 0], [0, sig_x ** 2]])
183
+ else:
184
+ sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
185
+ inverse_sigma = np.linalg.inv(sigma_matrix)
186
+ kernel = np.reciprocal(np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta) + 1)
187
+ kernel = kernel / np.sum(kernel)
188
+ return kernel
189
+
190
+
191
+ def random_bivariate_Gaussian(kernel_size,
192
+ sigma_x_range,
193
+ sigma_y_range,
194
+ rotation_range,
195
+ noise_range=None,
196
+ isotropic=True):
197
+ """Randomly generate bivariate isotropic or anisotropic Gaussian kernels.
198
+
199
+ In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` is ignored.
200
+
201
+ Args:
202
+ kernel_size (int):
203
+ sigma_x_range (tuple): [0.6, 5]
204
+ sigma_y_range (tuple): [0.6, 5]
205
+ rotation range (tuple): [-math.pi, math.pi]
206
+ noise_range(tuple, optional): multiplicative kernel noise,
207
+ [0.75, 1.25]. Default: None
208
+
209
+ Returns:
210
+ kernel (ndarray):
211
+ """
212
+ assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
213
+ assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
214
+ sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
215
+ if isotropic is False:
216
+ assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
217
+ assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
218
+ sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
219
+ rotation = np.random.uniform(rotation_range[0], rotation_range[1])
220
+ else:
221
+ sigma_y = sigma_x
222
+ rotation = 0
223
+
224
+ kernel = bivariate_Gaussian(kernel_size, sigma_x, sigma_y, rotation, isotropic=isotropic)
225
+
226
+ # add multiplicative noise
227
+ if noise_range is not None:
228
+ assert noise_range[0] < noise_range[1], 'Wrong noise range.'
229
+ noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
230
+ kernel = kernel * noise
231
+ kernel = kernel / np.sum(kernel)
232
+ return kernel
233
+
234
+
235
+ def random_bivariate_generalized_Gaussian(kernel_size,
236
+ sigma_x_range,
237
+ sigma_y_range,
238
+ rotation_range,
239
+ beta_range,
240
+ noise_range=None,
241
+ isotropic=True):
242
+ """Randomly generate bivariate generalized Gaussian kernels.
243
+
244
+ In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` is ignored.
245
+
246
+ Args:
247
+ kernel_size (int):
248
+ sigma_x_range (tuple): [0.6, 5]
249
+ sigma_y_range (tuple): [0.6, 5]
250
+ rotation range (tuple): [-math.pi, math.pi]
251
+ beta_range (tuple): [0.5, 8]
252
+ noise_range(tuple, optional): multiplicative kernel noise,
253
+ [0.75, 1.25]. Default: None
254
+
255
+ Returns:
256
+ kernel (ndarray):
257
+ """
258
+ assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
259
+ assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
260
+ sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
261
+ if isotropic is False:
262
+ assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
263
+ assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
264
+ sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
265
+ rotation = np.random.uniform(rotation_range[0], rotation_range[1])
266
+ else:
267
+ sigma_y = sigma_x
268
+ rotation = 0
269
+
270
+ # assume beta_range[0] < 1 < beta_range[1]
271
+ if np.random.uniform() < 0.5:
272
+ beta = np.random.uniform(beta_range[0], 1)
273
+ else:
274
+ beta = np.random.uniform(1, beta_range[1])
275
+
276
+ kernel = bivariate_generalized_Gaussian(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)
277
+
278
+ # add multiplicative noise
279
+ if noise_range is not None:
280
+ assert noise_range[0] < noise_range[1], 'Wrong noise range.'
281
+ noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
282
+ kernel = kernel * noise
283
+ kernel = kernel / np.sum(kernel)
284
+ return kernel
285
+
286
+
287
+ def random_bivariate_plateau(kernel_size,
288
+ sigma_x_range,
289
+ sigma_y_range,
290
+ rotation_range,
291
+ beta_range,
292
+ noise_range=None,
293
+ isotropic=True):
294
+ """Randomly generate bivariate plateau kernels.
295
+
296
+ In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` is ignored.
297
+
298
+ Args:
299
+ kernel_size (int):
300
+ sigma_x_range (tuple): [0.6, 5]
301
+ sigma_y_range (tuple): [0.6, 5]
302
+ rotation range (tuple): [-math.pi/2, math.pi/2]
303
+ beta_range (tuple): [1, 4]
304
+ noise_range(tuple, optional): multiplicative kernel noise,
305
+ [0.75, 1.25]. Default: None
306
+
307
+ Returns:
308
+ kernel (ndarray):
309
+ """
310
+ assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
311
+ assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
312
+ sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
313
+ if isotropic is False:
314
+ assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
315
+ assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
316
+ sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
317
+ rotation = np.random.uniform(rotation_range[0], rotation_range[1])
318
+ else:
319
+ sigma_y = sigma_x
320
+ rotation = 0
321
+
322
+ # TODO: this may be not proper
323
+ if np.random.uniform() < 0.5:
324
+ beta = np.random.uniform(beta_range[0], 1)
325
+ else:
326
+ beta = np.random.uniform(1, beta_range[1])
327
+
328
+ kernel = bivariate_plateau(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)
329
+ # add multiplicative noise
330
+ if noise_range is not None:
331
+ assert noise_range[0] < noise_range[1], 'Wrong noise range.'
332
+ noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
333
+ kernel = kernel * noise
334
+ kernel = kernel / np.sum(kernel)
335
+
336
+ return kernel
337
+
338
+
339
+ def random_mixed_kernels(kernel_list,
340
+ kernel_prob,
341
+ kernel_size=21,
342
+ sigma_x_range=(0.6, 5),
343
+ sigma_y_range=(0.6, 5),
344
+ rotation_range=(-math.pi, math.pi),
345
+ betag_range=(0.5, 8),
346
+ betap_range=(0.5, 8),
347
+ noise_range=None):
348
+ """Randomly generate mixed kernels.
349
+
350
+ Args:
351
+ kernel_list (tuple): a list name of kernel types,
352
+ support ['iso', 'aniso', 'skew', 'generalized', 'plateau_iso',
353
+ 'plateau_aniso']
354
+ kernel_prob (tuple): corresponding kernel probability for each
355
+ kernel type
356
+ kernel_size (int):
357
+ sigma_x_range (tuple): [0.6, 5]
358
+ sigma_y_range (tuple): [0.6, 5]
359
+ rotation range (tuple): [-math.pi, math.pi]
360
+ beta_range (tuple): [0.5, 8]
361
+ noise_range(tuple, optional): multiplicative kernel noise,
362
+ [0.75, 1.25]. Default: None
363
+
364
+ Returns:
365
+ kernel (ndarray):
366
+ """
367
+ kernel_type = random.choices(kernel_list, kernel_prob)[0]
368
+ if kernel_type == 'iso':
369
+ kernel = random_bivariate_Gaussian(
370
+ kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=True)
371
+ elif kernel_type == 'aniso':
372
+ kernel = random_bivariate_Gaussian(
373
+ kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=False)
374
+ elif kernel_type == 'generalized_iso':
375
+ kernel = random_bivariate_generalized_Gaussian(
376
+ kernel_size,
377
+ sigma_x_range,
378
+ sigma_y_range,
379
+ rotation_range,
380
+ betag_range,
381
+ noise_range=noise_range,
382
+ isotropic=True)
383
+ elif kernel_type == 'generalized_aniso':
384
+ kernel = random_bivariate_generalized_Gaussian(
385
+ kernel_size,
386
+ sigma_x_range,
387
+ sigma_y_range,
388
+ rotation_range,
389
+ betag_range,
390
+ noise_range=noise_range,
391
+ isotropic=False)
392
+ elif kernel_type == 'plateau_iso':
393
+ kernel = random_bivariate_plateau(
394
+ kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=True)
395
+ elif kernel_type == 'plateau_aniso':
396
+ kernel = random_bivariate_plateau(
397
+ kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=False)
398
+ return kernel
399
+
400
+
401
+ np.seterr(divide='ignore', invalid='ignore')
402
+
403
+
404
+ def circular_lowpass_kernel(cutoff, kernel_size, pad_to=0):
405
+ """2D sinc filter
406
+
407
+ Reference: https://dsp.stackexchange.com/questions/58301/2-d-circularly-symmetric-low-pass-filter
408
+
409
+ Args:
410
+ cutoff (float): cutoff frequency in radians (pi is max)
411
+ kernel_size (int): horizontal and vertical size, must be odd.
412
+ pad_to (int): pad kernel size to desired size, must be odd or zero.
413
+ """
414
+ assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
415
+ kernel = np.fromfunction(
416
+ lambda x, y: cutoff * special.j1(cutoff * np.sqrt(
417
+ (x - (kernel_size - 1) / 2) ** 2 + (y - (kernel_size - 1) / 2) ** 2)) / (2 * np.pi * np.sqrt(
418
+ (x - (kernel_size - 1) / 2) ** 2 + (y - (kernel_size - 1) / 2) ** 2)), [kernel_size, kernel_size])
419
+ kernel[(kernel_size - 1) // 2, (kernel_size - 1) // 2] = cutoff ** 2 / (4 * np.pi)
420
+ kernel = kernel / np.sum(kernel)
421
+ if pad_to > kernel_size:
422
+ pad_size = (pad_to - kernel_size) // 2
423
+ kernel = np.pad(kernel, ((pad_size, pad_size), (pad_size, pad_size)))
424
+ return kernel
425
+
426
+
427
+ # ------------------------------------------------------------- #
428
+ # --------------------------- noise --------------------------- #
429
+ # ------------------------------------------------------------- #
430
+
431
+ # ----------------------- Gaussian Noise ----------------------- #
432
+
433
+ def instantiate_from_config(config: Mapping[str, Any]) -> Any:
434
+ if not "target" in config:
435
+ raise KeyError("Expected key `target` to instantiate.")
436
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
437
+
438
+
439
+ class BaseStorageBackend(metaclass=ABCMeta):
440
+ """Abstract class of storage backends.
441
+
442
+ All backends need to implement two apis: ``get()`` and ``get_text()``.
443
+ ``get()`` reads the file as a byte stream and ``get_text()`` reads the file
444
+ as texts.
445
+ """
446
+
447
+ @property
448
+ def name(self) -> str:
449
+ return self.__class__.__name__
450
+
451
+ @abstractmethod
452
+ def get(self, filepath: str) -> bytes:
453
+ pass
454
+
455
+
456
+ class PetrelBackend(BaseStorageBackend):
457
+ """Petrel storage backend (for internal use).
458
+
459
+ PetrelBackend supports reading and writing data to multiple clusters.
460
+ If the file path contains the cluster name, PetrelBackend will read data
461
+ from specified cluster or write data to it. Otherwise, PetrelBackend will
462
+ access the default cluster.
463
+
464
+ Args:
465
+ path_mapping (dict, optional): Path mapping dict from local path to
466
+ Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in
467
+ ``filepath`` will be replaced by ``dst``. Default: None.
468
+ enable_mc (bool, optional): Whether to enable memcached support.
469
+ Default: True.
470
+ conf_path (str, optional): Config path of Petrel client. Default: None.
471
+ `New in version 1.7.1`.
472
+
473
+ Examples:
474
+ >>> filepath1 = 's3://path/of/file'
475
+ >>> filepath2 = 'cluster-name:s3://path/of/file'
476
+ >>> client = PetrelBackend()
477
+ >>> client.get(filepath1) # get data from default cluster
478
+ >>> client.get(filepath2) # get data from 'cluster-name' cluster
479
+ """
480
+
481
+ def __init__(self,
482
+ path_mapping: Optional[dict] = None,
483
+ enable_mc: bool = False,
484
+ conf_path: str = None):
485
+ try:
486
+ from petrel_client import client
487
+ except ImportError:
488
+ raise ImportError('Please install petrel_client to enable '
489
+ 'PetrelBackend.')
490
+
491
+ self._client = client.Client(conf_path=conf_path, enable_mc=enable_mc)
492
+ assert isinstance(path_mapping, dict) or path_mapping is None
493
+ self.path_mapping = path_mapping
494
+
495
+ def _map_path(self, filepath: Union[str, Path]) -> str:
496
+ """Map ``filepath`` to a string path whose prefix will be replaced by
497
+ :attr:`self.path_mapping`.
498
+
499
+ Args:
500
+ filepath (str): Path to be mapped.
501
+ """
502
+ filepath = str(filepath)
503
+ if self.path_mapping is not None:
504
+ for k, v in self.path_mapping.items():
505
+ filepath = filepath.replace(k, v, 1)
506
+ return filepath
507
+
508
+ def _format_path(self, filepath: str) -> str:
509
+ """Convert a ``filepath`` to standard format of petrel oss.
510
+
511
+ If the ``filepath`` is concatenated by ``os.path.join``, in a Windows
512
+ environment, the ``filepath`` will be the format of
513
+ 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the
514
+ above ``filepath`` will be converted to 's3://bucket_name/image.jpg'.
515
+
516
+ Args:
517
+ filepath (str): Path to be formatted.
518
+ """
519
+ return re.sub(r'\\+', '/', filepath)
520
+
521
+ def get(self, filepath: Union[str, Path]) -> bytes:
522
+ """Read data from a given ``filepath`` with 'rb' mode.
523
+
524
+ Args:
525
+ filepath (str or Path): Path to read data.
526
+
527
+ Returns:
528
+ bytes: The loaded bytes.
529
+ """
530
+ filepath = self._map_path(filepath)
531
+ filepath = self._format_path(filepath)
532
+ value = self._client.Get(filepath)
533
+ return value
534
+
535
+
536
+ class HardDiskBackend(BaseStorageBackend):
537
+ """Raw hard disks storage backend."""
538
+
539
+ def get(self, filepath: Union[str, Path]) -> bytes:
540
+ """Read data from a given ``filepath`` with 'rb' mode.
541
+
542
+ Args:
543
+ filepath (str or Path): Path to read data.
544
+
545
+ Returns:
546
+ bytes: Expected bytes object.
547
+ """
548
+ with open(filepath, 'rb') as f:
549
+ value_buf = f.read()
550
+ return value_buf
551
+
552
+
553
+ def generate_gaussian_noise(img, sigma=10, gray_noise=False):
554
+ """Generate Gaussian noise.
555
+
556
+ Args:
557
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
558
+ sigma (float): Noise scale (measured in range 255). Default: 10.
559
+
560
+ Returns:
561
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
562
+ float32.
563
+ """
564
+ if gray_noise:
565
+ noise = np.float32(np.random.randn(*(img.shape[0:2]))) * sigma / 255.
566
+ noise = np.expand_dims(noise, axis=2).repeat(3, axis=2)
567
+ else:
568
+ noise = np.float32(np.random.randn(*(img.shape))) * sigma / 255.
569
+ return noise
570
+
571
+
572
+ def add_gaussian_noise(img, sigma=10, clip=True, rounds=False, gray_noise=False):
573
+ """Add Gaussian noise.
574
+
575
+ Args:
576
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
577
+ sigma (float): Noise scale (measured in range 255). Default: 10.
578
+
579
+ Returns:
580
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
581
+ float32.
582
+ """
583
+ noise = generate_gaussian_noise(img, sigma, gray_noise)
584
+ out = img + noise
585
+ if clip and rounds:
586
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
587
+ elif clip:
588
+ out = np.clip(out, 0, 1)
589
+ elif rounds:
590
+ out = (out * 255.0).round() / 255.
591
+ return out
592
+
593
+
594
+ def generate_gaussian_noise_pt(img, sigma=10, gray_noise=0):
595
+ """Add Gaussian noise (PyTorch version).
596
+
597
+ Args:
598
+ img (Tensor): Shape (b, c, h, w), range[0, 1], float32.
599
+ scale (float | Tensor): Noise scale. Default: 1.0.
600
+
601
+ Returns:
602
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
603
+ float32.
604
+ """
605
+ b, _, h, w = img.size()
606
+ if not isinstance(sigma, (float, int)):
607
+ sigma = sigma.view(img.size(0), 1, 1, 1)
608
+ if isinstance(gray_noise, (float, int)):
609
+ cal_gray_noise = gray_noise > 0
610
+ else:
611
+ gray_noise = gray_noise.view(b, 1, 1, 1)
612
+ cal_gray_noise = torch.sum(gray_noise) > 0
613
+
614
+ if cal_gray_noise:
615
+ noise_gray = torch.randn(*img.size()[2:4], dtype=img.dtype, device=img.device) * sigma / 255.
616
+ noise_gray = noise_gray.view(b, 1, h, w)
617
+
618
+ # always calculate color noise
619
+ noise = torch.randn(*img.size(), dtype=img.dtype, device=img.device) * sigma / 255.
620
+
621
+ if cal_gray_noise:
622
+ noise = noise * (1 - gray_noise) + noise_gray * gray_noise
623
+ return noise
624
+
625
+
626
+ def add_gaussian_noise_pt(img, sigma=10, gray_noise=0, clip=True, rounds=False):
627
+ """Add Gaussian noise (PyTorch version).
628
+
629
+ Args:
630
+ img (Tensor): Shape (b, c, h, w), range[0, 1], float32.
631
+ scale (float | Tensor): Noise scale. Default: 1.0.
632
+
633
+ Returns:
634
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
635
+ float32.
636
+ """
637
+ noise = generate_gaussian_noise_pt(img, sigma, gray_noise)
638
+ out = img + noise
639
+ if clip and rounds:
640
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
641
+ elif clip:
642
+ out = torch.clamp(out, 0, 1)
643
+ elif rounds:
644
+ out = (out * 255.0).round() / 255.
645
+ return out
646
+
647
+
648
+ # ----------------------- Random Gaussian Noise ----------------------- #
649
+ def random_generate_gaussian_noise(img, sigma_range=(0, 10), gray_prob=0):
650
+ sigma = np.random.uniform(sigma_range[0], sigma_range[1])
651
+ if np.random.uniform() < gray_prob:
652
+ gray_noise = True
653
+ else:
654
+ gray_noise = False
655
+ return generate_gaussian_noise(img, sigma, gray_noise)
656
+
657
+
658
+ def random_add_gaussian_noise(img, sigma_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
659
+ noise = random_generate_gaussian_noise(img, sigma_range, gray_prob)
660
+ out = img + noise
661
+ if clip and rounds:
662
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
663
+ elif clip:
664
+ out = np.clip(out, 0, 1)
665
+ elif rounds:
666
+ out = (out * 255.0).round() / 255.
667
+ return out
668
+
669
+
670
+ def random_generate_gaussian_noise_pt(img, sigma_range=(0, 10), gray_prob=0):
671
+ sigma = torch.rand(
672
+ img.size(0), dtype=img.dtype, device=img.device) * (sigma_range[1] - sigma_range[0]) + sigma_range[0]
673
+ gray_noise = torch.rand(img.size(0), dtype=img.dtype, device=img.device)
674
+ gray_noise = (gray_noise < gray_prob).float()
675
+ return generate_gaussian_noise_pt(img, sigma, gray_noise)
676
+
677
+
678
+ def random_add_gaussian_noise_pt(img, sigma_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
679
+ noise = random_generate_gaussian_noise_pt(img, sigma_range, gray_prob)
680
+ out = img + noise
681
+ if clip and rounds:
682
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
683
+ elif clip:
684
+ out = torch.clamp(out, 0, 1)
685
+ elif rounds:
686
+ out = (out * 255.0).round() / 255.
687
+ return out
688
+
689
+
690
+ # ----------------------- Poisson (Shot) Noise ----------------------- #
691
+
692
+
693
+ def generate_poisson_noise(img, scale=1.0, gray_noise=False):
694
+ """Generate poisson noise.
695
+
696
+ Reference: https://github.com/scikit-image/scikit-image/blob/main/skimage/util/noise.py#L37-L219
697
+
698
+ Args:
699
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
700
+ scale (float): Noise scale. Default: 1.0.
701
+ gray_noise (bool): Whether generate gray noise. Default: False.
702
+
703
+ Returns:
704
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
705
+ float32.
706
+ """
707
+ if gray_noise:
708
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
709
+ # round and clip image for counting vals correctly
710
+ img = np.clip((img * 255.0).round(), 0, 255) / 255.
711
+ vals = len(np.unique(img))
712
+ vals = 2 ** np.ceil(np.log2(vals))
713
+ out = np.float32(np.random.poisson(img * vals) / float(vals))
714
+ noise = out - img
715
+ if gray_noise:
716
+ noise = np.repeat(noise[:, :, np.newaxis], 3, axis=2)
717
+ return noise * scale
718
+
719
+
720
+ def add_poisson_noise(img, scale=1.0, clip=True, rounds=False, gray_noise=False):
721
+ """Add poisson noise.
722
+
723
+ Args:
724
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
725
+ scale (float): Noise scale. Default: 1.0.
726
+ gray_noise (bool): Whether generate gray noise. Default: False.
727
+
728
+ Returns:
729
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
730
+ float32.
731
+ """
732
+ noise = generate_poisson_noise(img, scale, gray_noise)
733
+ out = img + noise
734
+ if clip and rounds:
735
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
736
+ elif clip:
737
+ out = np.clip(out, 0, 1)
738
+ elif rounds:
739
+ out = (out * 255.0).round() / 255.
740
+ return out
741
+
742
+
743
+ def generate_poisson_noise_pt(img, scale=1.0, gray_noise=0):
744
+ """Generate a batch of poisson noise (PyTorch version)
745
+
746
+ Args:
747
+ img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
748
+ scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
749
+ Default: 1.0.
750
+ gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
751
+ 0 for False, 1 for True. Default: 0.
752
+
753
+ Returns:
754
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
755
+ float32.
756
+ """
757
+ b, _, h, w = img.size()
758
+ if isinstance(gray_noise, (float, int)):
759
+ cal_gray_noise = gray_noise > 0
760
+ else:
761
+ gray_noise = gray_noise.view(b, 1, 1, 1)
762
+ cal_gray_noise = torch.sum(gray_noise) > 0
763
+ if cal_gray_noise:
764
+ img_gray = rgb_to_grayscale(img, num_output_channels=1)
765
+ # round and clip image for counting vals correctly
766
+ img_gray = torch.clamp((img_gray * 255.0).round(), 0, 255) / 255.
767
+ # use for-loop to get the unique values for each sample
768
+ vals_list = [len(torch.unique(img_gray[i, :, :, :])) for i in range(b)]
769
+ vals_list = [2 ** np.ceil(np.log2(vals)) for vals in vals_list]
770
+ vals = img_gray.new_tensor(vals_list).view(b, 1, 1, 1)
771
+ out = torch.poisson(img_gray * vals) / vals
772
+ noise_gray = out - img_gray
773
+ noise_gray = noise_gray.expand(b, 3, h, w)
774
+
775
+ # always calculate color noise
776
+ # round and clip image for counting vals correctly
777
+ img = torch.clamp((img * 255.0).round(), 0, 255) / 255.
778
+ # use for-loop to get the unique values for each sample
779
+ vals_list = [len(torch.unique(img[i, :, :, :])) for i in range(b)]
780
+ vals_list = [2 ** np.ceil(np.log2(vals)) for vals in vals_list]
781
+ vals = img.new_tensor(vals_list).view(b, 1, 1, 1)
782
+ out = torch.poisson(img * vals) / vals
783
+ noise = out - img
784
+ if cal_gray_noise:
785
+ noise = noise * (1 - gray_noise) + noise_gray * gray_noise
786
+ if not isinstance(scale, (float, int)):
787
+ scale = scale.view(b, 1, 1, 1)
788
+ return noise * scale
789
+
790
+
791
+ def add_poisson_noise_pt(img, scale=1.0, clip=True, rounds=False, gray_noise=0):
792
+ """Add poisson noise to a batch of images (PyTorch version).
793
+
794
+ Args:
795
+ img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
796
+ scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
797
+ Default: 1.0.
798
+ gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
799
+ 0 for False, 1 for True. Default: 0.
800
+
801
+ Returns:
802
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
803
+ float32.
804
+ """
805
+ noise = generate_poisson_noise_pt(img, scale, gray_noise)
806
+ out = img + noise
807
+ if clip and rounds:
808
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
809
+ elif clip:
810
+ out = torch.clamp(out, 0, 1)
811
+ elif rounds:
812
+ out = (out * 255.0).round() / 255.
813
+ return out
814
+
815
+
816
+ # ----------------------- Random Poisson (Shot) Noise ----------------------- #
817
+
818
+
819
+ def random_generate_poisson_noise(img, scale_range=(0, 1.0), gray_prob=0):
820
+ scale = np.random.uniform(scale_range[0], scale_range[1])
821
+ if np.random.uniform() < gray_prob:
822
+ gray_noise = True
823
+ else:
824
+ gray_noise = False
825
+ return generate_poisson_noise(img, scale, gray_noise)
826
+
827
+
828
+ def random_add_poisson_noise(img, scale_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
829
+ noise = random_generate_poisson_noise(img, scale_range, gray_prob)
830
+ out = img + noise
831
+ if clip and rounds:
832
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
833
+ elif clip:
834
+ out = np.clip(out, 0, 1)
835
+ elif rounds:
836
+ out = (out * 255.0).round() / 255.
837
+ return out
838
+
839
+
840
+ def random_generate_poisson_noise_pt(img, scale_range=(0, 1.0), gray_prob=0):
841
+ scale = torch.rand(
842
+ img.size(0), dtype=img.dtype, device=img.device) * (scale_range[1] - scale_range[0]) + scale_range[0]
843
+ gray_noise = torch.rand(img.size(0), dtype=img.dtype, device=img.device)
844
+ gray_noise = (gray_noise < gray_prob).float()
845
+ return generate_poisson_noise_pt(img, scale, gray_noise)
846
+
847
+
848
+ def random_add_poisson_noise_pt(img, scale_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
849
+ noise = random_generate_poisson_noise_pt(img, scale_range, gray_prob)
850
+ out = img + noise
851
+ if clip and rounds:
852
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
853
+ elif clip:
854
+ out = torch.clamp(out, 0, 1)
855
+ elif rounds:
856
+ out = (out * 255.0).round() / 255.
857
+ return out
858
+
859
+
860
+ # ------------------------------------------------------------------------ #
861
+ # --------------------------- JPEG compression --------------------------- #
862
+ # ------------------------------------------------------------------------ #
863
+
864
+
865
+ def add_jpg_compression(img, quality=90):
866
+ """Add JPG compression artifacts.
867
+
868
+ Args:
869
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
870
+ quality (float): JPG compression quality. 0 for lowest quality, 100 for
871
+ best quality. Default: 90.
872
+
873
+ Returns:
874
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
875
+ float32.
876
+ """
877
+ img = np.clip(img, 0, 1)
878
+ encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
879
+ _, encimg = cv2.imencode('.jpg', img * 255., encode_param)
880
+ img = np.float32(cv2.imdecode(encimg, 1)) / 255.
881
+ return img
882
+
883
+
884
+ def random_add_jpg_compression(img, quality_range=(90, 100)):
885
+ """Randomly add JPG compression artifacts.
886
+
887
+ Args:
888
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
889
+ quality_range (tuple[float] | list[float]): JPG compression quality
890
+ range. 0 for lowest quality, 100 for best quality.
891
+ Default: (90, 100).
892
+
893
+ Returns:
894
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
895
+ float32.
896
+ """
897
+ quality = np.random.uniform(quality_range[0], quality_range[1])
898
+ return add_jpg_compression(img, int(quality))
899
+
900
+
901
+ def load_file_list(file_list_path: str) -> List[Dict[str, str]]:
902
+ files = []
903
+ with open(file_list_path, "r") as fin:
904
+ for line in fin:
905
+ path = line.strip()
906
+ if path:
907
+ files.append({"image_path": path, "prompt": ""})
908
+ return files
909
+
910
+
911
+ # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/image_datasets.py
912
+ def center_crop_arr(pil_image, image_size):
913
+ # We are not on a new enough PIL to support the `reducing_gap`
914
+ # argument, which uses BOX downsampling at powers of two first.
915
+ # Thus, we do it by hand to improve downsample quality.
916
+ while min(*pil_image.size) >= 2 * image_size:
917
+ pil_image = pil_image.resize(
918
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
919
+ )
920
+
921
+ scale = image_size / min(*pil_image.size)
922
+ pil_image = pil_image.resize(
923
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
924
+ )
925
+
926
+ arr = np.array(pil_image)
927
+ crop_y = (arr.shape[0] - image_size) // 2
928
+ crop_x = (arr.shape[1] - image_size) // 2
929
+ return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
930
+
931
+
932
+ # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/image_datasets.py
933
+ def random_crop_arr(pil_image, image_size, min_crop_frac=0.8, max_crop_frac=1.0):
934
+ min_smaller_dim_size = math.ceil(image_size / max_crop_frac)
935
+ max_smaller_dim_size = math.ceil(image_size / min_crop_frac)
936
+ smaller_dim_size = random.randrange(min_smaller_dim_size, max_smaller_dim_size + 1)
937
+
938
+ # We are not on a new enough PIL to support the `reducing_gap`
939
+ # argument, which uses BOX downsampling at powers of two first.
940
+ # Thus, we do it by hand to improve downsample quality.
941
+ while min(*pil_image.size) >= 2 * smaller_dim_size:
942
+ pil_image = pil_image.resize(
943
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
944
+ )
945
+
946
+ scale = smaller_dim_size / min(*pil_image.size)
947
+ pil_image = pil_image.resize(
948
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
949
+ )
950
+
951
+ arr = np.array(pil_image)
952
+ crop_y = random.randrange(arr.shape[0] - image_size + 1)
953
+ crop_x = random.randrange(arr.shape[1] - image_size + 1)
954
+ return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
utils/create_arch.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from arch.hourglass import image_transformer_v2 as itv2
2
+ from arch.hourglass.image_transformer_v2 import ImageTransformerDenoiserModelV2
3
+ from arch.swinir.swinir import SwinIR
4
+
5
+
6
+ def create_arch(arch, condition_channels=0):
7
+ # arch should be, e.g., swinir_XL, or hdit_XL
8
+ arch_name, arch_size = arch.split('_')
9
+ arch_config = arch_configs[arch_name][arch_size].copy()
10
+ arch_config['in_channels'] += condition_channels
11
+ return arch_name_to_object[arch_name](**arch_config)
12
+
13
+
14
+ arch_configs = {
15
+ 'hdit': {
16
+ "ImageNet256Sp4": {
17
+ 'in_channels': 3,
18
+ 'out_channels': 3,
19
+ 'widths': [256, 512, 1024],
20
+ 'depths': [2, 2, 8],
21
+ 'patch_size': [4, 4],
22
+ 'self_attns': [
23
+ {"type": "neighborhood", "d_head": 64, "kernel_size": 7},
24
+ {"type": "neighborhood", "d_head": 64, "kernel_size": 7},
25
+ {"type": "global", "d_head": 64}
26
+ ],
27
+ 'mapping_depth': 2,
28
+ 'mapping_width': 768,
29
+ 'dropout_rate': [0, 0, 0],
30
+ 'mapping_dropout_rate': 0.0
31
+ },
32
+ "XL2": {
33
+ 'in_channels': 3,
34
+ 'out_channels': 3,
35
+ 'widths': [384, 768],
36
+ 'depths': [2, 11],
37
+ 'patch_size': [4, 4],
38
+ 'self_attns': [
39
+ {"type": "neighborhood", "d_head": 64, "kernel_size": 7},
40
+ {"type": "global", "d_head": 64}
41
+ ],
42
+ 'mapping_depth': 2,
43
+ 'mapping_width': 768,
44
+ 'dropout_rate': [0, 0],
45
+ 'mapping_dropout_rate': 0.0
46
+ }
47
+
48
+ },
49
+ 'swinir': {
50
+ "M": {
51
+ 'in_channels': 3,
52
+ 'out_channels': 3,
53
+ 'embed_dim': 120,
54
+ 'depths': [6, 6, 6, 6, 6],
55
+ 'num_heads': [6, 6, 6, 6, 6],
56
+ 'resi_connection': '1conv',
57
+ 'sf': 8
58
+
59
+ },
60
+ "L": {
61
+ 'in_channels': 3,
62
+ 'out_channels': 3,
63
+ 'embed_dim': 180,
64
+ 'depths': [6, 6, 6, 6, 6, 6, 6, 6],
65
+ 'num_heads': [6, 6, 6, 6, 6, 6, 6, 6],
66
+ 'resi_connection': '1conv',
67
+ 'sf': 8
68
+ },
69
+ },
70
+ }
71
+
72
+
73
+ def create_swinir_model(in_channels, out_channels, embed_dim, depths, num_heads, resi_connection,
74
+ sf):
75
+ return SwinIR(
76
+ img_size=64,
77
+ patch_size=1,
78
+ in_chans=in_channels,
79
+ num_out_ch=out_channels,
80
+ embed_dim=embed_dim,
81
+ depths=depths,
82
+ num_heads=num_heads,
83
+ window_size=8,
84
+ mlp_ratio=2,
85
+ sf=sf,
86
+ img_range=1.0,
87
+ upsampler="nearest+conv",
88
+ resi_connection=resi_connection,
89
+ unshuffle=True,
90
+ unshuffle_scale=8
91
+ )
92
+
93
+
94
+ def create_hdit_model(widths,
95
+ depths,
96
+ self_attns,
97
+ dropout_rate,
98
+ mapping_depth,
99
+ mapping_width,
100
+ mapping_dropout_rate,
101
+ in_channels,
102
+ out_channels,
103
+ patch_size
104
+ ):
105
+ assert len(widths) == len(depths)
106
+ assert len(widths) == len(self_attns)
107
+ assert len(widths) == len(dropout_rate)
108
+ mapping_d_ff = mapping_width * 3
109
+ d_ffs = []
110
+ for width in widths:
111
+ d_ffs.append(width * 3)
112
+
113
+ levels = []
114
+ for depth, width, d_ff, self_attn, dropout in zip(depths, widths, d_ffs, self_attns, dropout_rate):
115
+ if self_attn['type'] == 'global':
116
+ self_attn = itv2.GlobalAttentionSpec(self_attn.get('d_head', 64))
117
+ elif self_attn['type'] == 'neighborhood':
118
+ self_attn = itv2.NeighborhoodAttentionSpec(self_attn.get('d_head', 64), self_attn.get('kernel_size', 7))
119
+ elif self_attn['type'] == 'shifted-window':
120
+ self_attn = itv2.ShiftedWindowAttentionSpec(self_attn.get('d_head', 64), self_attn['window_size'])
121
+ elif self_attn['type'] == 'none':
122
+ self_attn = itv2.NoAttentionSpec()
123
+ else:
124
+ raise ValueError(f'unsupported self attention type {self_attn["type"]}')
125
+ levels.append(itv2.LevelSpec(depth, width, d_ff, self_attn, dropout))
126
+ mapping = itv2.MappingSpec(mapping_depth, mapping_width, mapping_d_ff, mapping_dropout_rate)
127
+ model = ImageTransformerDenoiserModelV2(
128
+ levels=levels,
129
+ mapping=mapping,
130
+ in_channels=in_channels,
131
+ out_channels=out_channels,
132
+ patch_size=patch_size,
133
+ num_classes=0,
134
+ mapping_cond_dim=0,
135
+ )
136
+
137
+ return model
138
+
139
+
140
+ arch_name_to_object = {
141
+ 'hdit': create_hdit_model,
142
+ 'swinir': create_swinir_model,
143
+ }
utils/create_degradation.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from functools import partial
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from basicsr.data import degradations as degradations
8
+ from basicsr.data.transforms import augment
9
+ from basicsr.utils import img2tensor
10
+ from torch.nn.functional import interpolate
11
+ from torchvision.transforms import Compose
12
+ from utils.basicsr_custom import (
13
+ random_mixed_kernels,
14
+ random_add_gaussian_noise,
15
+ random_add_jpg_compression,
16
+ )
17
+
18
+
19
+ def create_degradation(degradation):
20
+ if degradation == 'sr_bicubic_x8_gaussian_noise_005':
21
+ return Compose([
22
+ partial(down_scale, scale_factor=1.0 / 8.0, mode='bicubic'),
23
+ partial(add_gaussian_noise, std=0.05),
24
+ partial(interpolate, scale_factor=8.0, mode='nearest-exact'),
25
+ partial(torch.clip, min=0, max=1),
26
+ partial(torch.squeeze, dim=0),
27
+ lambda x: (x, None)
28
+
29
+ ])
30
+ elif degradation == 'gaussian_noise_035':
31
+ return Compose([
32
+ partial(add_gaussian_noise, std=0.35),
33
+ partial(torch.clip, min=0, max=1),
34
+ partial(torch.squeeze, dim=0),
35
+ lambda x: (x, None)
36
+
37
+ ])
38
+ elif degradation == 'colorization_gaussian_noise_025':
39
+ return Compose([
40
+ lambda x: torch.mean(x, dim=0, keepdim=True),
41
+ partial(add_gaussian_noise, std=0.25),
42
+ partial(torch.clip, min=0, max=1),
43
+ lambda x: (x, None)
44
+ ])
45
+ elif degradation == 'random_inpainting_gaussian_noise_01':
46
+ def inpainting_dps(x):
47
+ total = x.shape[1] ** 2
48
+ # random pixel sampling
49
+ l, h = [0.9, 0.9]
50
+ prob = np.random.uniform(l, h)
51
+ mask_vec = torch.ones([1, x.shape[1] * x.shape[1]])
52
+ samples = np.random.choice(x.shape[1] * x.shape[1], int(total * prob), replace=False)
53
+ mask_vec[:, samples] = 0
54
+ mask_b = mask_vec.view(1, x.shape[1], x.shape[1])
55
+ mask_b = mask_b.repeat(3, 1, 1)
56
+ mask = torch.ones_like(x, device=x.device)
57
+ mask[:, ...] = mask_b
58
+ return add_gaussian_noise(x * mask, 0.1).clip(0, 1), None
59
+
60
+ return inpainting_dps
61
+ elif degradation == 'difface':
62
+ def deg(x):
63
+ blur_kernel_size = 41
64
+ kernel_list = ['iso', 'aniso']
65
+ kernel_prob = [0.5, 0.5]
66
+ blur_sigma = [0.1, 15]
67
+ downsample_range = [0.8, 32]
68
+ noise_range = [0, 20]
69
+ jpeg_range = [30, 100]
70
+ gt_gray = True
71
+ gray_prob = 0.01
72
+ x = x.permute(1, 2, 0).numpy()[..., ::-1].astype(np.float32)
73
+ # random horizontal flip
74
+ img_gt = augment(x.copy(), hflip=True, rotation=False)
75
+ h, w, _ = img_gt.shape
76
+
77
+ # ------------------------ generate lq image ------------------------ #
78
+ # blur
79
+ kernel = degradations.random_mixed_kernels(
80
+ kernel_list,
81
+ kernel_prob,
82
+ blur_kernel_size,
83
+ blur_sigma,
84
+ blur_sigma, [-math.pi, math.pi],
85
+ noise_range=None)
86
+ img_lq = cv2.filter2D(img_gt, -1, kernel)
87
+ # downsample
88
+ scale = np.random.uniform(downsample_range[0], downsample_range[1])
89
+ img_lq = cv2.resize(img_lq, (int(w // scale), int(h // scale)), interpolation=cv2.INTER_LINEAR)
90
+ # noise
91
+ if noise_range is not None:
92
+ img_lq = random_add_gaussian_noise(img_lq, noise_range)
93
+ # jpeg compression
94
+ if jpeg_range is not None:
95
+ img_lq = random_add_jpg_compression(img_lq, jpeg_range)
96
+
97
+ # resize to original size
98
+ img_lq = cv2.resize(img_lq, (w, h), interpolation=cv2.INTER_LINEAR)
99
+
100
+ # random color jitter (only for lq)
101
+ # if self.color_jitter_prob is not None and (np.random.uniform() < self.color_jitter_prob):
102
+ # img_lq = self.color_jitter(img_lq, self.color_jitter_shift)
103
+ # random to gray (only for lq)
104
+ if np.random.uniform() < gray_prob:
105
+ img_lq = cv2.cvtColor(img_lq, cv2.COLOR_BGR2GRAY)
106
+ img_lq = np.tile(img_lq[:, :, None], [1, 1, 3])
107
+ if gt_gray: # whether convert GT to gray images
108
+ img_gt = cv2.cvtColor(img_gt, cv2.COLOR_BGR2GRAY)
109
+ img_gt = np.tile(img_gt[:, :, None], [1, 1, 3]) # repeat the color channels
110
+
111
+ # BGR to RGB, HWC to CHW, numpy to tensor
112
+ img_gt, img_lq = img2tensor([img_gt, img_lq], bgr2rgb=True, float32=True)
113
+
114
+ # random color jitter (pytorch version) (only for lq)
115
+ # if self.color_jitter_pt_prob is not None and (np.random.uniform() < self.color_jitter_pt_prob):
116
+ # brightness = self.opt.get('brightness', (0.5, 1.5))
117
+ # contrast = self.opt.get('contrast', (0.5, 1.5))
118
+ # saturation = self.opt.get('saturation', (0, 1.5))
119
+ # hue = self.opt.get('hue', (-0.1, 0.1))
120
+ # img_lq = self.color_jitter_pt(img_lq, brightness, contrast, saturation, hue)
121
+
122
+ # round and clip
123
+ img_lq = torch.clamp((img_lq * 255.0).round(), 0, 255) / 255.
124
+
125
+ return img_lq, img_gt.clip(0, 1)
126
+
127
+ return deg
128
+ else:
129
+ raise NotImplementedError()
130
+
131
+
132
+ def down_scale(x, scale_factor, mode):
133
+ with torch.no_grad():
134
+ return interpolate(x.unsqueeze(0),
135
+ scale_factor=scale_factor,
136
+ mode=mode,
137
+ antialias=True,
138
+ align_corners=False).clip(0, 1)
139
+
140
+
141
+ def add_gaussian_noise(x, std):
142
+ with torch.no_grad():
143
+ x = x + torch.randn_like(x) * std
144
+ return x
utils/img_utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from torchvision.utils import make_grid
2
+
3
+
4
+ def create_grid(img, normalize=False, num_images=5):
5
+ return make_grid(img[:num_images], padding=0, normalize=normalize, nrow=16)