# For a fully connected layer, fan_in is the number of neurons in the previous layer
# and fan_out is the number of neurons in the current layer.
# gain is a scaling factor that adjusts the initialization range; it defaults to 1.
def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):
    a = gain * math.sqrt(6 / (fan_in + fan_out))  # w follows a uniform distribution on (-a, a)
    # rand(fan_in, fan_out) draws the parameter matrix between the previous and current layer
    # with values in [0, 1); multiplying by 2 and subtracting 1 maps them into [-1, 1],
    # and multiplying by a makes the final matrix uniform on (-a, a).
    return a * (2 * rand(fan_in, fan_out, **kwargs) - 1)

def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs):
    std = gain * math.sqrt(2 / (fan_in + fan_out))
    return std * randn(fan_in, fan_out, **kwargs)
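As a quick sanity check, the bound a = sqrt(6/(fan_in+fan_out)) is chosen so that a uniform distribution on (-a, a), whose variance is a^2/3, ends up with variance 2/(fan_in+fan_out). The following standalone NumPy sketch (not part of the framework code above) verifies this empirically:

import numpy as np

fan_in, fan_out, gain = 256, 128, 1.0
a = gain * np.sqrt(6 / (fan_in + fan_out))           # uniform bound
W = a * (2 * np.random.rand(fan_in, fan_out) - 1)    # same recipe as xavier_uniform

print(W.var())                            # empirical variance
print(gain**2 * 2 / (fan_in + fan_out))   # theoretical a^2 / 3 = 2 / (fan_in + fan_out)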
# With the ReLU activation, the recommended scaling factor is gain = sqrt(2).
def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
    assert nonlinearity == "relu", "Only relu supported currently"
    gain = math.sqrt(2)
    bound = gain * math.sqrt(3 / fan_in)
    return bound * (2 * rand(fan_in, fan_out, **kwargs) - 1)

def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs):
    assert nonlinearity == "relu", "Only relu supported currently"
    gain = math.sqrt(2)
    std = gain / math.sqrt(fan_in)
    return std * randn(fan_in, fan_out, **kwargs)
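Likewise, kaiming_normal uses std = sqrt(2/fan_in) so that the signal magnitude is roughly preserved through a linear layer followed by ReLU. A standalone NumPy sketch (independent of the framework code) illustrates the effect:

import numpy as np

fan_in, fan_out = 512, 256
std = np.sqrt(2) / np.sqrt(fan_in)            # same recipe as kaiming_normal with gain = sqrt(2)
W = std * np.random.randn(fan_in, fan_out)

x = np.random.randn(1000, fan_in)             # unit-variance inputs
h = np.maximum(x @ W, 0)                      # linear layer followed by ReLU
print(x.var())          # ~1.0
print((h ** 2).mean())  # also ~1.0: the gain of sqrt(2) compensates for ReLU zeroing half the units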
class Linear(Module):
    def __init__(self, in_features, out_features, bias=True, device=None, dtype="float32"):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        ### BEGIN YOUR SOLUTION
        self.weight = Parameter(init.kaiming_uniform(in_features, out_features, requires_grad=True))
        if bias:
            self.bias = Parameter(init.kaiming_uniform(out_features, 1, requires_grad=True).reshape((1, out_features)))
        else:
            self.bias = None
        ### END YOUR SOLUTION

    def forward(self, X: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        # y = XW + b; note that b must be broadcast to the output shape
        X_mul_weight = X @ self.weight
        if self.bias:
            return X_mul_weight + self.bias.broadcast_to(X_mul_weight.shape)
        else:
            return X_mul_weight
        ### END YOUR SOLUTION
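The forward pass is simply y = XW + b with the bias broadcast along the batch dimension. A minimal NumPy reference of that shape arithmetic (a sketch assuming a batch of shape (N, in_features), not the module above):

import numpy as np

N, in_features, out_features = 4, 3, 5
X = np.random.randn(N, in_features)
W = np.random.randn(in_features, out_features)   # matches self.weight's shape
b = np.random.randn(1, out_features)             # matches self.bias's (1, out_features) shape

y = X @ W + b          # b is broadcast from (1, out_features) to (N, out_features)
print(y.shape)         # (4, 5)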
class ReLU(Module):
    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        return ops.relu(x)
        ### END YOUR SOLUTION

# In ops.py, the ReLU operator is defined as follows:
class ReLU(TensorOp):
    def compute(self, a):
        ### BEGIN YOUR SOLUTION
        return array_api.maximum(a, 0)
        ### END YOUR SOLUTION

    def gradient(self, out_grad, node):
        ### BEGIN YOUR SOLUTION
        a = node.inputs[0].realize_cached_data()
        mask = Tensor(a > 0)
        return out_grad * mask
        ### END YOUR SOLUTION
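The gradient rule is easy to check by hand: ReLU passes the upstream gradient through wherever the input is positive and zeroes it elsewhere. A small standalone NumPy sketch of the same mask idea:

import numpy as np

a = np.array([-2.0, -0.5, 0.0, 1.5, 3.0])
out = np.maximum(a, 0)            # forward: max(a, 0)
mask = (a > 0).astype(a.dtype)    # gradient mask: 1 where a > 0, else 0

out_grad = np.ones_like(a)        # pretend the upstream gradient is all ones
print(out)                        # [0.  0.  0.  1.5 3. ]
print(out_grad * mask)            # [0. 0. 0. 1. 1.]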
# Sequential is a container module that chains several sub-modules in order to form a complete network.
class Sequential(Module):
    def __init__(self, *modules):
        super().__init__()
        self.modules = modules

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        for module in self.modules:
            x = module(x)
        return x
        ### END YOUR SOLUTION
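Conceptually, Sequential is just left-to-right function composition. A tiny pure-Python sketch using plain callables (not needle Modules) shows the same forward loop:

def sequential(*fns):
    def forward(x):
        for fn in fns:         # apply each sub-module in order
            x = fn(x)
        return x
    return forward

model = sequential(lambda x: x + 1, lambda x: x * 2, lambda x: x - 3)
print(model(5))   # ((5 + 1) * 2) - 3 = 9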
class LogSumExp(TensorOp):
    def __init__(self, axes: Optional[tuple] = None):
        self.axes = axes
        # axes specifies the axis or axes to reduce over:
        # an integer reduces over that single axis,
        # a tuple or list reduces over all of the listed axes.
        # LogSumExp first takes the maximum of Z along the given axes, subtracts it for numerical
        # stability, then exponentiates, sums over those axes, takes the log, and adds the max back.
        # Finally the result is reshaped according to the reduced axes.
        # For example, with Z of shape (3, 4), Z.LogSumExp(axes=(0,)) first takes the max over axis 0,
        # giving a (1, 4) tensor; after subtracting the max, exponentiating, summing and taking the log,
        # the result is still (1, 4), and it is finally reshaped to (4,).

    def compute(self, Z):
        ### BEGIN YOUR SOLUTION
        max_Z = array_api.max(Z, axis=self.axes, keepdims=True)
        ret = array_api.log(array_api.exp(Z - max_Z).sum(axis=self.axes, keepdims=True)) + max_Z
        if self.axes:
            # If axes were given, the output shape keeps only the non-reduced dimensions.
            # enumerate(Z.shape) yields (index, size) pairs; a dimension is kept
            # whenever its index is not in self.axes.
            out_shape = [size for i, size in enumerate(Z.shape) if i not in self.axes]
        else:
            out_shape = ()
        ret.resize(tuple(out_shape))
        return ret
        ### END YOUR SOLUTION

    def gradient(self, out_grad, node):
        ### BEGIN YOUR SOLUTION
        Z = node.inputs[0]
        if self.axes:
            # Re-insert the reduced axes as size-1 dimensions so that the result
            # broadcasts back to Z's shape.
            shape = [1] * len(Z.shape)
            s = set(self.axes)
            j = 0
            for i in range(len(shape)):
                if i not in s:
                    shape[i] = node.shape[j]
                    j += 1
            node_new = node.reshape(shape)
            grad_new = out_grad.reshape(shape)
        else:
            node_new = node
            grad_new = out_grad
        return grad_new * exp(Z - node_new)
        ### END YOUR SOLUTION
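The reason for subtracting the maximum is numerical stability: exponentiating a large logit overflows to inf, while exp(Z - max) never exceeds 1. A standalone NumPy demonstration (not the operator above):

import numpy as np

Z = np.array([[1000.0, 1001.0, 1002.0]])   # large logits

naive = np.log(np.exp(Z).sum(axis=1))                      # exp(1000) overflows (with a warning)
m = Z.max(axis=1, keepdims=True)
stable = np.log(np.exp(Z - m).sum(axis=1)) + m.squeeze(1)  # finite

print(naive)    # [inf]
print(stable)   # [1002.40760596]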
class SoftmaxLoss(Module):
    def forward(self, logits: Tensor, y: Tensor):
        ### BEGIN YOUR SOLUTION
        # Implement the softmax loss in terms of LogSumExp
        exp_sum = ops.logsumexp(logits, axes=(1,)).sum()
        z_y_sum = (logits * init.one_hot(logits.shape[1], y)).sum()
        return (exp_sum - z_y_sum) / logits.shape[0]
        ### END YOUR SOLUTION
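Written out, the loss is the batch mean of logsumexp(z_i) - z_{i, y_i}. A standalone NumPy reference (a sketch assuming integer class labels y, not the module itself) that can be used to sanity-check the result:

import numpy as np

def softmax_loss_ref(logits, y):
    # logits: (batch, num_classes), y: (batch,) integer labels
    m = logits.max(axis=1, keepdims=True)
    lse = np.log(np.exp(logits - m).sum(axis=1)) + m.squeeze(1)   # logsumexp per row
    z_y = logits[np.arange(logits.shape[0]), y]                   # logit of the true class
    return (lse - z_y).mean()

logits = np.array([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]])
y = np.array([0, 1])
print(softmax_loss_ref(logits, y))   # ≈ 0.3185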
class LayerNorm1d(Module):
    def __init__(self, dim, eps=1e-5, device=None, dtype="float32"):
        super().__init__()
        self.dim = dim
        self.eps = eps
        ### BEGIN YOUR SOLUTION
        self.weight = Parameter(init.ones(self.dim, requires_grad=True))
        self.bias = Parameter(init.zeros(self.dim, requires_grad=True))
        ### END YOUR SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        batch_size = x.shape[0]
        feature_size = x.shape[1]
        # x.sum(axes=(1,)) sums along the columns, i.e. adds up the elements of each row
        mean = x.sum(axes=(1,)).reshape((batch_size, 1)) / feature_size
        x_minus_mean = x - mean.broadcast_to(x.shape)
        x_std = ((x_minus_mean ** 2).sum(axes=(1,)).reshape((batch_size, 1)) / feature_size + self.eps) ** 0.5
        normed = x_minus_mean / x_std.broadcast_to(x.shape)
        return self.weight.broadcast_to(x.shape) * normed + self.bias.broadcast_to(x.shape)
        ### END YOUR SOLUTION
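A standalone NumPy reference of the same per-row normalization (note that, as in the code above, the variance is the biased 1/N estimate):

import numpy as np

def layernorm_ref(x, weight, bias, eps=1e-5):
    # normalize each row (each sample) over its feature dimension
    mean = x.mean(axis=1, keepdims=True)
    var = ((x - mean) ** 2).mean(axis=1, keepdims=True)   # biased variance, 1/N
    normed = (x - mean) / np.sqrt(var + eps)
    return weight * normed + bias

x = np.random.randn(4, 8)
out = layernorm_ref(x, np.ones(8), np.zeros(8))
print(out.mean(axis=1))   # each row's mean is ~0
print(out.std(axis=1))    # each row's std is ~1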
class Flatten(Module):
    def forward(self, X):
        ### BEGIN YOUR SOLUTION
        # If X has shape (2, 3, 4), X.reshape((2, -1)) gives shape (2, 12):
        # the 2nd and 3rd dimensions are flattened into a single row per sample.
        return X.reshape((X.shape[0], -1))
        ### END YOUR SOLUTION
class BatchNorm1d(Module):
    def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.momentum = momentum
        ### BEGIN YOUR SOLUTION
        self.weight = Parameter(init.ones(self.dim, requires_grad=True))
        self.bias = Parameter(init.zeros(self.dim, requires_grad=True))
        # At test time the global (running) mean and variance are used.
        self.running_mean = init.zeros(self.dim)
        self.running_var = init.ones(self.dim)
        ### END YOUR SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        batch_size = x.shape[0]
        mean = x.sum((0,)) / batch_size
        x_minus_mean = x - mean.broadcast_to(x.shape)
        var = (x_minus_mean ** 2).sum((0,)) / batch_size
        if self.training:
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var.data
            x_std = ((var + self.eps) ** 0.5).broadcast_to(x.shape)
            x_normed = x_minus_mean / x_std
            return x_normed * self.weight.broadcast_to(x.shape) + self.bias.broadcast_to(x.shape)
        else:
            # In eval mode, normalize with the global running_mean and running_var.
            x_normed = (x - self.running_mean) / (self.running_var + self.eps) ** 0.5
            return x_normed * self.weight.broadcast_to(x.shape) + self.bias.broadcast_to(x.shape)
        ### END YOUR SOLUTION
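A standalone NumPy sketch of the training-time branch, including the exponential-moving-average update of the running statistics that the eval branch later relies on:

import numpy as np

def batchnorm_train_step(x, weight, bias, running_mean, running_var,
                         momentum=0.1, eps=1e-5):
    # per-feature statistics over the batch dimension
    mean = x.mean(axis=0)
    var = ((x - mean) ** 2).mean(axis=0)      # biased variance, matching the code above
    # exponential moving average of the global statistics (used in eval mode)
    running_mean = (1 - momentum) * running_mean + momentum * mean
    running_var = (1 - momentum) * running_var + momentum * var
    normed = (x - mean) / np.sqrt(var + eps)
    return weight * normed + bias, running_mean, running_var

x = np.random.randn(32, 16) * 3 + 5
out, rm, rv = batchnorm_train_step(x, np.ones(16), np.zeros(16),
                                   np.zeros(16), np.ones(16))
print(out.mean(axis=0).round(6))   # each feature's mean is ~0 after normalization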
class Dropout(Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        # Dropout is only applied during training.
        if self.training:
            # randb returns a tensor of x's shape whose entries are 1 with probability 1 - self.p
            # and 0 with probability self.p
            mask = init.randb(*x.shape, p=1 - self.p)
            x_mask = x * mask
            return x_mask / (1 - self.p)
        else:
            return x
        ### END YOUR SOLUTION
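Dividing by 1 - p is the "inverted dropout" trick: rescaling at training time keeps the expected activation unchanged, so nothing needs to be rescaled at test time. A standalone NumPy sketch:

import numpy as np

p = 0.5
x = np.random.rand(100000) + 1.0          # values in [1, 2), mean ≈ 1.5
mask = (np.random.rand(*x.shape) >= p).astype(x.dtype)   # 1 with probability 1 - p
x_train = x * mask / (1 - p)              # inverted dropout: rescale the surviving units
print(x.mean(), x_train.mean())           # both ≈ 1.5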
class Residual(Module):
    def __init__(self, fn: Module):
        super().__init__()
        self.fn = fn

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        # A residual connection simply adds the input to the sub-module's output.
        return x + self.fn(x)
        ### END YOUR SOLUTION
class SGD(Optimizer):
    def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0):
        super().__init__(params)
        self.lr = lr
        self.momentum = momentum
        self.u = {}  # momentum buffers; u[i] is the momentum term of the i-th parameter
        self.weight_decay = weight_decay  # weight-decay coefficient

    def step(self):
        ### BEGIN YOUR SOLUTION
        # SGD with momentum, including the weight-decay (regularization) term
        for i, param in enumerate(self.params):
            if i not in self.u:
                self.u[i] = 0  # initialize the momentum buffer to 0
            if param.grad is None:
                # If the gradient is None, skip the update and move on to the next parameter.
                continue
            grad_data = ndl.Tensor(param.grad.numpy(), dtype='float32').data \
                + self.weight_decay * param.data  # adding weight_decay * w to the gradient implements L2 regularization
            self.u[i] = self.momentum * self.u[i] + (1 - self.momentum) * grad_data
            param.data = param.data - self.u[i] * self.lr
        ### END YOUR SOLUTION
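A standalone NumPy sketch of the same momentum update, run on the toy objective f(w) = 0.5 * ||w||^2 whose gradient is simply w (the learning rate and momentum values here are arbitrary demo choices):

import numpy as np

lr, momentum, weight_decay = 0.1, 0.9, 0.0
w = np.array([5.0, -3.0])
u = np.zeros_like(w)

for _ in range(200):
    grad = w + weight_decay * w                 # gradient of 0.5*||w||^2 plus the L2 term
    u = momentum * u + (1 - momentum) * grad    # same momentum rule as step() above
    w = w - lr * u

print(w)   # close to the minimum at [0, 0]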
class Adam(Optimizer):
    def __init__(
        self,
        params,
        lr=0.01,
        beta1=0.9,
        beta2=0.999,
        eps=1e-8,
        weight_decay=0.0,
    ):
        super().__init__(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.weight_decay = weight_decay
        self.t = 0
        self.m = {}
        self.v = {}

    def step(self):
        ### BEGIN YOUR SOLUTION
        self.t += 1
        for i, param in enumerate(self.params):
            if i not in self.m:
                self.m[i] = ndl.init.zeros(*param.shape)
                self.v[i] = ndl.init.zeros(*param.shape)
            if param.grad is None:
                continue
            # As with SGD momentum, follow each quantity's update formula directly.
            grad_data = ndl.Tensor(param.grad.numpy(), dtype='float32').data \
                + param.data * self.weight_decay
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad_data
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad_data ** 2
            # Bias correction
            u_hat = self.m[i] / (1 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1 - self.beta2 ** self.t)
            param.data = param.data - self.lr * u_hat / (v_hat ** 0.5 + self.eps)
        ### END YOUR SOLUTION
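The same toy problem with the bias-corrected Adam update, mirroring step() above (again a standalone sketch, not the framework code):

import numpy as np

lr, beta1, beta2, eps = 0.1, 0.9, 0.999, 1e-8
w = np.array([5.0, -3.0])
m = np.zeros_like(w)
v = np.zeros_like(w)

for t in range(1, 201):
    grad = w                                   # gradient of 0.5*||w||^2
    m = beta1 * m + (1 - beta1) * grad         # first-moment estimate
    v = beta2 * v + (1 - beta2) * grad ** 2    # second-moment estimate
    m_hat = m / (1 - beta1 ** t)               # bias correction
    v_hat = v / (1 - beta2 ** t)
    w = w - lr * m_hat / (np.sqrt(v_hat) + eps)

print(w)   # both entries end up near 0 (Adam hovers around the minimum rather than converging exactly)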
Many optimization algorithms exist today, but SGD with momentum and Adam are the two most important in practice, and they are the two that anyone learning deep learning needs to know.