The brand-new neural network KAN (Kolmogorov-Arnold Network) caused quite a stir as soon as the paper was released, and it has attracted a lot of attention!
Paper: https://arxiv.org/abs/2404.19756
PaddlePaddle code: GitHub - yrqUni/PaddleKAN
Reportedly this implementation does not yet reproduce the results in the paper, but it is still a good way to get a first feel for the idea.
import paddle
import paddle.nn.functional as F


class KANLinear(paddle.nn.Layer):
    def __init__(
        self,
        in_features,
        out_features,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        base_activation=paddle.nn.Silu,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KANLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.grid_size = grid_size
        self.spline_order = spline_order

        # Knot vector: grid_size intervals over grid_range, padded with
        # spline_order extra knots on each side, shared across input features.
        h = (grid_range[1] - grid_range[0]) / grid_size
        grid = (
            paddle.arange(-spline_order, grid_size + spline_order + 1, dtype=paddle.float32) * h
            + grid_range[0]
        ).expand([in_features, -1]).contiguous()
        self.register_buffer("grid", grid)

        self.base_weight = self.create_parameter(
            shape=[out_features, in_features],
            default_initializer=paddle.nn.initializer.Constant(value=scale_base))
        self.spline_weight = self.create_parameter(
            shape=[out_features, in_features, grid_size + spline_order],
            default_initializer=paddle.nn.initializer.Constant(value=scale_spline))

        self.scale_noise = scale_noise
        self.scale_base = scale_base
        self.scale_spline = scale_spline
        self.base_activation = base_activation()
        self.grid_eps = grid_eps

        self.reset_parameters()

    def reset_parameters(self):
        self.base_weight.set_value(paddle.full([self.out_features, self.in_features], self.scale_base))
        with paddle.no_grad():
            # Fit the initial spline weights to small random noise sampled on the grid.
            noise = (
                paddle.rand([self.grid_size + 1, self.in_features, self.out_features], dtype=paddle.float32)
                - 0.5
            ) * self.scale_noise / self.grid_size
            self.spline_weight.set_value(
                self.scale_spline
                * self.curve2coeff(
                    self.grid.T[self.spline_order:-self.spline_order],
                    noise,
                )
            )

    def b_splines(self, x: paddle.Tensor):
        """
        Compute the B-spline bases for the given input tensor.

        Args:
            x (paddle.Tensor): Input tensor of shape (batch_size, in_features).

        Returns:
            paddle.Tensor: B-spline bases tensor of shape (batch_size, in_features, grid_size + spline_order).
        """
        assert x.ndim == 2 and x.shape[1] == self.in_features

        grid: paddle.Tensor = self.grid  # (in_features, grid_size + 2 * spline_order + 1)
        x = x.unsqueeze(-1)
        # Order-0 bases: indicator of the grid interval each input falls into.
        bases = ((x >= grid[:, :-1]) & (x < grid[:, 1:])).cast(x.dtype)
        # Cox-de Boor recursion raises the spline order up to spline_order.
        for k in range(1, self.spline_order + 1):
            bases = (
                (x - grid[:, : -(k + 1)])
                / (grid[:, k:-1] - grid[:, : -(k + 1)])
                * bases[:, :, :-1]
            ) + (
                (grid[:, k + 1 :] - x)
                / (grid[:, k + 1 :] - grid[:, 1:(-k)])
                * bases[:, :, 1:]
            )

        assert tuple(bases.shape) == (
            x.shape[0],
            self.in_features,
            self.grid_size + self.spline_order,
        )
        return bases
    def curve2coeff(self, x: paddle.Tensor, y: paddle.Tensor):
        """
        Compute the coefficients of the curve that interpolates the given points.

        Args:
            x (paddle.Tensor): Input tensor of shape (batch_size, in_features).
            y (paddle.Tensor): Output tensor of shape (batch_size, in_features, out_features).

        Returns:
            paddle.Tensor: Coefficients tensor of shape (out_features, in_features, grid_size + spline_order).
        """
        assert x.ndim == 2 and x.shape[1] == self.in_features
        assert y.shape == [x.shape[0], self.in_features, self.out_features]

        A = self.b_splines(x).transpose([1, 0, 2])  # (in_features, batch_size, grid_size + spline_order)
        B = y.transpose([1, 0, 2])  # (in_features, batch_size, out_features)
        # Least-squares fit of the spline coefficients, solved per input feature.
        solution = paddle.linalg.lstsq(A, B)  # solution[0]: (in_features, grid_size + spline_order, out_features)

        result = solution[0].transpose([2, 0, 1])  # (out_features, in_features, grid_size + spline_order)

        assert result.shape == [
            self.out_features,
            self.in_features,
            self.grid_size + self.spline_order,
        ]
        return result

    def forward(self, x: paddle.Tensor):
        # Base branch: ordinary linear layer applied to the activated input.
        base_output = F.linear(self.base_activation(x), self.base_weight.transpose([1, 0]))
        # Spline branch: linear combination of the B-spline bases of the raw input.
        spline_output = F.linear(
            self.b_splines(x).reshape([x.shape[0], -1]),
            self.spline_weight.reshape([self.out_features, -1]).transpose([1, 0])
        )
        return base_output + spline_output

    @paddle.no_grad()
    def update_grid(self, x: paddle.Tensor, margin=0.01):
        assert x.ndim == 2 and x.shape[1] == self.in_features
        batch = x.shape[0]

        # Evaluate the current splines so they can be re-fitted on the new grid.
        splines = self.b_splines(x)  # (batch, in, coeff)
        splines = splines.transpose([1, 0, 2])  # (in, batch, coeff)
        orig_coeff = self.spline_weight  # (out, in, coeff)
        orig_coeff = orig_coeff.transpose([1, 2, 0])  # (in, coeff, out)
        unreduced_spline_output = paddle.bmm(splines, orig_coeff)  # (in, batch, out)
        unreduced_spline_output = unreduced_spline_output.transpose([1, 0, 2])  # (batch, in, out)

        # Sort each channel individually to collect the data distribution
        x_sorted = paddle.sort(x, axis=0)
        grid_adaptive = x_sorted[
            paddle.linspace(
                0, batch - 1, self.grid_size + 1, dtype='int64'
            ).astype('int32')
        ]

        uniform_step = (x_sorted[-1] - x_sorted[0] + 2 * margin) / self.grid_size
        grid_uniform = (
            paddle.arange(
                self.grid_size + 1, dtype='float32'
            ).unsqueeze(1)
            * uniform_step
            + x_sorted[0]
            - margin
        )

        # Blend the uniform and data-adaptive grids, then pad spline_order knots on each side.
        grid = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive
        grid = paddle.concat(
            [
                grid[:1]
                - uniform_step
                * paddle.arange(self.spline_order, 0, -1, dtype='float32').unsqueeze(1),
                grid,
                grid[-1:]
                + uniform_step
                * paddle.arange(1, self.spline_order + 1, dtype='float32').unsqueeze(1),
            ],
            axis=0
        )

        self.grid.set_value(grid.T)
        self.spline_weight.set_value(self.curve2coeff(x, unreduced_spline_output))

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        # L1 penalty on the spline weights plus an entropy penalty on their distribution.
        l1_norm = paddle.mean(paddle.abs(self.spline_weight), axis=-1)
        reg_loss_activation = paddle.sum(l1_norm)
        p = l1_norm / reg_loss_activation
        reg_loss_entropy = -paddle.sum(p * paddle.log(p + 1e-8))
        return (
            regularize_activation * reg_loss_activation +
            regularize_entropy * reg_loss_entropy
        )

class KAN(paddle.nn.Layer):
    def __init__(
        self,
        layers_hidden,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        base_activation=paddle.nn.Silu,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KAN, self).__init__()
        self.layers = paddle.nn.LayerList()
        for in_features, out_features in zip(layers_hidden[:-1], layers_hidden[1:]):
            self.layers.append(
                KANLinear(
                    in_features,
                    out_features,
                    grid_size=grid_size,
                    spline_order=spline_order,
                    scale_noise=scale_noise,
                    scale_base=scale_base,
                    scale_spline=scale_spline,
                    base_activation=base_activation,
                    grid_eps=grid_eps,
                    grid_range=grid_range,
                )
            )

    def forward(self, x: paddle.Tensor, update_grid=False):
        for layer in self.layers:
            if update_grid:
                layer.update_grid(x)
            x = layer(x)
        return x

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        return sum(
            layer.regularization_loss(regularize_activation, regularize_entropy)
            for layer in self.layers
        )
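To make the listing above concrete, here is a minimal usage sketch (not part of the repo) that fits the KAN defined above on random toy data. The layer widths, learning rate, toy target function, and the choice to refresh the spline grids every 20 steps are illustrative assumptions, not settings from the paper.

import paddle

model = KAN([2, 5, 1])  # hypothetical sizes: 2 inputs -> hidden width 5 -> 1 output
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=1e-3)

# Toy data inside the default grid_range [-1, 1]; the target function is arbitrary.
x = paddle.rand([256, 2]) * 2 - 1
y = paddle.sin(3 * x[:, :1]) * paddle.exp(x[:, 1:])

for step in range(200):
    # Occasionally let each layer re-fit its spline grid to the input distribution.
    pred = model(x, update_grid=(step % 20 == 0))
    loss = paddle.mean((pred - y) ** 2) + 1e-4 * model.regularization_loss()
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()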
On Saturday evening, 20:00-22:00, you can watch the discussion at 集智乐园.
The results are said to be very good; looking forward to it.
And hoping I can actually understand it!
Inspired by the Kolmogorov-Arnold representation theorem, we propose Kolmogorov-Arnold Networks (KANs) as promising alternatives to multi-layer perceptrons (MLPs). While MLPs have fixed activation functions on nodes ("neurons"), KANs have learnable activation functions on edges ("weights"). KANs have no linear weights at all: every weight parameter is replaced by a univariate function parametrized as a spline. We show that this seemingly simple change makes KANs outperform MLPs in terms of both accuracy and interpretability. For accuracy, much smaller KANs can achieve comparable or better results than much larger MLPs in data fitting and PDE solving. Theoretically and empirically, KANs possess faster neural scaling laws than MLPs. For interpretability, KANs can be intuitively visualized and can easily interact with human users. Through two examples in mathematics and physics, KANs are shown to be useful "collaborators" that help scientists (re)discover mathematical and physical laws. In summary, KANs are promising alternatives to MLPs, opening up opportunities for further improving today's deep learning models, which rely heavily on MLPs.
Multi-layer perceptrons (MLPs) [1, 2, 3], also known as fully connected feedforward neural networks, are foundational building blocks of today's deep learning models. Since their expressive power is guaranteed by the universal approximation theorem [3], MLPs are the default nonlinear function approximators in machine learning, and their importance can hardly be overstated. However, are MLPs the best nonlinear regressors we can build? Despite their prevalent use, MLPs have significant drawbacks. In Transformer models [4], for example, MLPs consume almost all non-embedding parameters and are typically less interpretable (relative to attention layers) without post-hoc analysis tools [5].
We propose a promising alternative to MLPs, called Kolmogorov-Arnold Networks (KANs). Whereas MLPs are inspired by the universal approximation theorem, KANs are inspired by the Kolmogorov-Arnold representation theorem [6, 7]. Like MLPs, KANs have fully connected structures. However, while MLPs place fixed activation functions on nodes ("neurons"), KANs place learnable activation functions on edges ("weights"), as illustrated in Figure 0.1. As a result, KANs have no linear weight matrices at all: instead, each weight parameter is replaced by a learnable 1D function parametrized as a spline. KAN nodes simply sum their incoming signals without applying any nonlinearity. One might worry that KANs are hopelessly expensive, since every weight parameter of an MLP becomes a spline function in a KAN. Fortunately, KANs usually allow much smaller computation graphs than MLPs. For example, for PDE solving, a 2-layer width-10 KAN is shown to be 100x more accurate than a 4-layer width-100 MLP (10^-7 vs 10^-5 MSE) and 100x more parameter efficient (10^2 vs 10^4 parameters).
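For reference, the Kolmogorov-Arnold representation theorem that KANs start from states that any continuous multivariate function on a bounded domain can be written as a finite composition of additions and continuous univariate functions:

f(x_1, \dots, x_n) = \sum_{q=1}^{2n+1} \Phi_q \Big( \sum_{p=1}^{n} \varphi_{q,p}(x_p) \Big)

where each inner function \varphi_{q,p} and each outer function \Phi_q depends on a single variable. This displayed form is a standard statement of the theorem added here for readability; the paper gives the precise version it builds on.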
This formula is so simple that it has sparked a great deal of debate. Even someone like me, who cannot really follow formulas, finds it simple.
Unpacked, it is just nested summations of univariate functions (as in the formula above), which is remarkably simple.
Still, considering that the paper's author graduated from the mathematics department of Peking University, I trust the rigor of the work; it may really be the case that such a concise and elegant key to artificial intelligence has been found.
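Concretely, each learnable edge activation in the implementation above combines a fixed SiLU base term with a B-spline term; this is what base_weight and spline_weight realize in the code. A sketch of the per-edge form, written in LaTeX:

\varphi_{j,i}(x_i) = w^{\mathrm{base}}_{j,i} \, \mathrm{silu}(x_i) + \sum_k c_{j,i,k} \, B_k(x_i), \qquad \mathrm{silu}(x) = \frac{x}{1 + e^{-x}}

Here w^base corresponds to base_weight, the coefficients c to spline_weight, and B_k to the bases computed by b_splines; the output of unit j sums \varphi_{j,i}(x_i) over the inputs i, which is exactly what forward computes as base_output + spline_output.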
KAN as a "language model" for AI + Science: large language models are so popular because they are useful to anyone who can speak natural language, and the same holds for KANs. KANs are composed of interpretable functions, so when a user works with a KAN it is like communicating with it in the language of functions. This paragraph aims to promote the AI-scientist collaboration paradigm rather than our specific tool, KAN. Just as people use different languages to communicate, we expect that in the future KANs will be only one of the languages of AI + Science, although they will be among the very first languages that enable AI and humans to communicate. Enabled by KANs, however, the AI-scientist collaboration paradigm has never been this easy and convenient, which leads us to rethink how we want to approach AI + Science: do we want AI scientists, or do we want AI that helps scientists? The intrinsic difficulty of (fully automated) AI scientists is that it is hard to quantify human preferences and encode them into AI objectives. In fact, scientists in different fields may have different feelings about which functions are simple or interpretable. As a result, scientists would rather have an AI that speaks the language of science (functions) and can conveniently interact with the inductive biases of individual scientists, adapting to a specific scientific domain.