
Something Awesome: PaddleKAN (Paddle Kolmogorov-Arnold Network)


The brand-new neural network KAN (Kolmogorov-Arnold Network) caused a sensation the moment the paper came out, and everyone is paying close attention to it!

Paper: https://arxiv.org/abs/2404.19756

PaddlePaddle code: GitHub - yrqUni/PaddleKAN

Reportedly this code does not yet reproduce the results of the paper, but it is a good way to get a feel for the underlying ideas.

PaddleKAN Code

import paddle
import paddle.nn.functional as F


class KANLinear(paddle.nn.Layer):
    def __init__(
        self,
        in_features,
        out_features,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        base_activation=paddle.nn.Silu,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KANLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.grid_size = grid_size
        self.spline_order = spline_order

        # Uniform knot grid over grid_range, extended by spline_order knots
        # on each side so that order-k B-splines are well defined.
        h = (grid_range[1] - grid_range[0]) / grid_size
        grid = (
            paddle.arange(-spline_order, grid_size + spline_order + 1, dtype=paddle.float32) * h
            + grid_range[0]
        ).expand([in_features, -1]).contiguous()
        self.register_buffer("grid", grid)

        self.base_weight = self.create_parameter(
            shape=[out_features, in_features],
            default_initializer=paddle.nn.initializer.Constant(value=scale_base),
        )
        self.spline_weight = self.create_parameter(
            shape=[out_features, in_features, grid_size + spline_order],
            default_initializer=paddle.nn.initializer.Constant(value=scale_spline),
        )

        self.scale_noise = scale_noise
        self.scale_base = scale_base
        self.scale_spline = scale_spline
        self.base_activation = base_activation()
        self.grid_eps = grid_eps

        self.reset_parameters()

    def reset_parameters(self):
        self.base_weight.set_value(
            paddle.full([self.out_features, self.in_features], self.scale_base)
        )
        with paddle.no_grad():
            # Initialize the splines to small random curves by fitting them
            # to noise sampled at the interior grid points.
            noise = (
                paddle.rand([self.grid_size + 1, self.in_features, self.out_features], dtype=paddle.float32)
                - 0.5
            ) * self.scale_noise / self.grid_size
            self.spline_weight.set_value(
                self.scale_spline
                * self.curve2coeff(
                    self.grid.T[self.spline_order : -self.spline_order],
                    noise,
                )
            )

    def b_splines(self, x: paddle.Tensor):
        """
        Compute the B-spline bases for the given input tensor.

        Args:
            x (paddle.Tensor): Input tensor of shape (batch_size, in_features).

        Returns:
            paddle.Tensor: B-spline bases tensor of shape
                (batch_size, in_features, grid_size + spline_order).
        """
        assert x.ndim == 2 and x.shape[1] == self.in_features
        grid: paddle.Tensor = self.grid  # (in_features, grid_size + 2 * spline_order + 1)
        x = x.unsqueeze(-1)
        # Order-0 (piecewise constant) bases, then raise the order with the
        # Cox-de Boor recursion.
        bases = ((x >= grid[:, :-1]) & (x < grid[:, 1:])).cast(x.dtype)
        for k in range(1, self.spline_order + 1):
            bases = (
                (x - grid[:, : -(k + 1)])
                / (grid[:, k:-1] - grid[:, : -(k + 1)])
                * bases[:, :, :-1]
            ) + (
                (grid[:, k + 1 :] - x)
                / (grid[:, k + 1 :] - grid[:, 1:(-k)])
                * bases[:, :, 1:]
            )
        assert tuple(bases.shape) == (
            x.shape[0],
            self.in_features,
            self.grid_size + self.spline_order,
        )
        return bases

    def curve2coeff(self, x: paddle.Tensor, y: paddle.Tensor):
        """
        Compute the coefficients of the curve that interpolates the given points.

        Args:
            x (paddle.Tensor): Input tensor of shape (batch_size, in_features).
            y (paddle.Tensor): Output tensor of shape (batch_size, in_features, out_features).

        Returns:
            paddle.Tensor: Coefficients tensor of shape
                (out_features, in_features, grid_size + spline_order).
        """
        assert x.ndim == 2 and x.shape[1] == self.in_features
        assert y.shape == [x.shape[0], self.in_features, self.out_features]
        A = self.b_splines(x).transpose([1, 0, 2])  # (in_features, batch_size, grid_size + spline_order)
        B = y.transpose([1, 0, 2])  # (in_features, batch_size, out_features)
        # Batched least squares: find coefficients such that A @ coeff ≈ B.
        solution = paddle.linalg.lstsq(A, B)  # solution[0]: (in_features, grid_size + spline_order, out_features)
        result = solution[0].transpose([2, 0, 1])  # (out_features, in_features, grid_size + spline_order)
        assert result.shape == [
            self.out_features,
            self.in_features,
            self.grid_size + self.spline_order,
        ]
        return result

    def forward(self, x: paddle.Tensor):
        # Each output is the sum of a base path (fixed activation + linear map)
        # and a spline path (learnable univariate functions on every edge).
        base_output = F.linear(self.base_activation(x), self.base_weight.transpose([1, 0]))
        spline_output = F.linear(
            self.b_splines(x).reshape([x.shape[0], -1]),
            self.spline_weight.reshape([self.out_features, -1]).transpose([1, 0]),
        )
        return base_output + spline_output

    @paddle.no_grad()
    def update_grid(self, x: paddle.Tensor, margin=0.01):
        assert x.ndim == 2 and x.shape[1] == self.in_features
        batch = x.shape[0]

        # Evaluate the current splines so the new coefficients can be refit
        # against the same input/output pairs after the grid moves.
        splines = self.b_splines(x)  # (batch, in, coeff)
        splines = splines.transpose([1, 0, 2])  # (in, batch, coeff)
        orig_coeff = self.spline_weight  # (out, in, coeff)
        orig_coeff = orig_coeff.transpose([1, 2, 0])  # (in, coeff, out)
        unreduced_spline_output = paddle.bmm(splines, orig_coeff)  # (in, batch, out)
        unreduced_spline_output = unreduced_spline_output.transpose([1, 0, 2])  # (batch, in, out)

        # Sort each channel individually to collect data distribution
        x_sorted = paddle.sort(x, axis=0)
        grid_adaptive = x_sorted[
            paddle.linspace(0, batch - 1, self.grid_size + 1, dtype='int64').astype('int32')
        ]

        uniform_step = (x_sorted[-1] - x_sorted[0] + 2 * margin) / self.grid_size
        grid_uniform = (
            paddle.arange(self.grid_size + 1, dtype='float32').unsqueeze(1)
            * uniform_step
            + x_sorted[0]
            - margin
        )

        # Blend uniform and data-adaptive grids, then pad spline_order extra
        # knots on both ends.
        grid = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive
        grid = paddle.concat(
            [
                grid[:1]
                - uniform_step
                * paddle.arange(self.spline_order, 0, -1, dtype='float32').unsqueeze(1),
                grid,
                grid[-1:]
                + uniform_step
                * paddle.arange(1, self.spline_order + 1, dtype='float32').unsqueeze(1),
            ],
            axis=0,
        )

        self.grid.set_value(grid.T)
        self.spline_weight.set_value(self.curve2coeff(x, unreduced_spline_output))

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        # L1 penalty on the spline coefficients plus an entropy term that
        # encourages sparsity across the learned activation functions.
        l1_norm = paddle.mean(paddle.abs(self.spline_weight), axis=-1)
        reg_loss_activation = paddle.sum(l1_norm)
        p = l1_norm / reg_loss_activation
        reg_loss_entropy = -paddle.sum(p * paddle.log(p + 1e-8))
        return (
            regularize_activation * reg_loss_activation
            + regularize_entropy * reg_loss_entropy
        )


class KAN(paddle.nn.Layer):
    def __init__(
        self,
        layers_hidden,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        base_activation=paddle.nn.Silu,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KAN, self).__init__()
        self.layers = paddle.nn.LayerList()
        # Stack one KANLinear per consecutive pair of widths,
        # e.g. layers_hidden=[2, 5, 1] builds a 2-layer KAN.
        for in_features, out_features in zip(layers_hidden[:-1], layers_hidden[1:]):
            self.layers.append(
                KANLinear(
                    in_features,
                    out_features,
                    grid_size=grid_size,
                    spline_order=spline_order,
                    scale_noise=scale_noise,
                    scale_base=scale_base,
                    scale_spline=scale_spline,
                    base_activation=base_activation,
                    grid_eps=grid_eps,
                    grid_range=grid_range,
                )
            )

    def forward(self, x: paddle.Tensor, update_grid=False):
        for layer in self.layers:
            if update_grid:
                layer.update_grid(x)
            x = layer(x)
        return x

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        return sum(
            layer.regularization_loss(regularize_activation, regularize_entropy)
            for layer in self.layers
        )
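To make the classes above concrete, here is a minimal usage sketch. This is my own illustration rather than code from the PaddleKAN repository, and the network widths, learning rate, and regularization weight are illustrative assumptions:

import paddle
import paddle.nn.functional as F

# Hypothetical toy regression: fit y = sin(x1) * cos(x2) with a [2, 5, 1] KAN.
model = KAN([2, 5, 1], grid_size=5, spline_order=3)
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=1e-3)

x = paddle.rand([256, 2]) * 2 - 1                # inputs inside the grid range [-1, 1]
y = paddle.sin(x[:, :1]) * paddle.cos(x[:, 1:])  # toy regression target

for step in range(1000):
    pred = model(x, update_grid=(step % 100 == 0))  # occasionally refit grids to the data
    loss = F.mse_loss(pred, y) + 1e-4 * model.regularization_loss()
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()

Passing update_grid=True from time to time lets each layer re-center its spline grid on the actual activation distribution, which matters because the B-spline bases are only nonzero inside the grid range.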

There is a live-streamed class on May 11:

Saturday evening, 20:00-22:00; you can watch it at 集智乐园.

It is said to be very good. Looking forward to it, and hoping I can actually follow along!

Paper Abstract:

Inspired by the Kolmogorov-Arnold representation theorem, we propose Kolmogorov-Arnold Networks (KANs) as promising alternatives to multi-layer perceptrons (MLPs). While MLPs have fixed activation functions on nodes ("neurons"), KANs have learnable activation functions on edges ("weights"). KANs have no linear weights at all: every weight parameter is replaced by a univariate function parametrized as a spline. We show that this seemingly simple change makes KANs outperform MLPs in terms of both accuracy and interpretability. For accuracy, much smaller KANs can achieve accuracy comparable to or better than much larger MLPs in data fitting and PDE solving. Theoretically and empirically, KANs possess faster neural scaling laws than MLPs. For interpretability, KANs can be intuitively visualized and can easily interact with human users. Through two examples in mathematics and physics, KANs are shown to be useful "collaborators" that help scientists (re)discover mathematical and physical laws. In summary, KANs are promising alternatives to MLPs, opening opportunities for further improving today's deep learning models, which rely heavily on MLPs.

Introduction

Multi-layer perceptrons (MLPs) [1, 2, 3], also known as fully connected feedforward neural networks, are foundational building blocks of today's deep learning models. Since the expressive power of MLPs is guaranteed by the universal approximation theorem [3], they are the default nonlinear function approximators in machine learning, and their importance is beyond question. But are MLPs the best nonlinear regressors we can build? Despite their ubiquity, MLPs have significant drawbacks. In Transformers [4], for example, MLPs consume almost all of the non-embedding parameters and are typically less interpretable than attention layers without post-hoc analysis tools [5].

We propose a promising alternative to MLPs, called Kolmogorov-Arnold Networks (KANs). Whereas MLPs are inspired by the universal approximation theorem, KANs are inspired by the Kolmogorov-Arnold representation theorem [6, 7]. Like MLPs, KANs have fully connected structures. However, while MLPs place fixed activation functions on nodes ("neurons"), KANs place learnable activation functions on edges ("weights"), as shown in Figure 0.1. As a result, KANs have no linear weight matrices at all: every weight parameter is replaced by a learnable 1D function parametrized as a spline. The nodes of a KAN simply sum the incoming signals without applying any nonlinearity. One might worry that KANs are hopelessly expensive, since every weight parameter of an MLP becomes a spline function in a KAN. Fortunately, KANs usually allow much smaller computation graphs than MLPs. For example, we show that for PDE solving, a 2-layer width-10 KAN is 100× more accurate than a 4-layer width-100 MLP (10^{-7} vs 10^{-5} MSE) and also 100× more parameter efficient (10^2 vs 10^4 parameters).
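For reference, the Kolmogorov-Arnold representation theorem cited above states that any multivariate continuous function on a bounded domain can be written using only univariate continuous functions and addition:

f(x_1, \ldots, x_n) = \sum_{q=1}^{2n+1} \Phi_q\left(\sum_{p=1}^{n} \phi_{q,p}(x_p)\right)

where the \phi_{q,p} and \Phi_q are univariate continuous functions. A KAN generalizes this two-layer composition (\phi_{q,p} on the edges, summation at the nodes) to arbitrary widths and depths.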

Formula

f(x_1, \ldots, x_N) = \exp\left(\frac{1}{N} \sum_{i=1}^{N} \sin^2(x_i)\right)

This formula is so simple that it sparked a heated debate. Even I, someone who can barely read formulas, find it simple.

Unpacked, the exponent is just the average of the accumulated sum below, which could hardly be simpler:

\sin^2(x_1) + \sin^2(x_2) + \cdots + \sin^2(x_N)

That said, considering that the paper's author graduated from the mathematics department of Peking University, and knowing his rigorous working style, he may really have found such a concise and beautiful key to artificial intelligence.
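As a quick sanity check, here is a short Python sketch (my own illustration, not code from the paper) that evaluates this function and produces the kind of toy dataset the KAN above could be trained on:

import paddle

def target_fn(x):
    # f(x_1, ..., x_N) = exp(mean_i sin^2(x_i)); x has shape (batch, N)
    return paddle.exp(paddle.mean(paddle.sin(x) ** 2, axis=-1, keepdim=True))

x = paddle.rand([1024, 4]) * 2 - 1  # sample N = 4 inputs in [-1, 1]
y = target_fn(x)                    # regression targets for a KAN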

Discussion

KANs as the "language model" of AI + Science: large language models are transformative because they are useful to anyone who speaks natural language. KANs are similar: they are composed of interpretable functions, so when a human user works with a KAN, it is like communicating with it in the language of functions. This paragraph is meant to promote the AI-scientist collaboration paradigm rather than our specific tool, KAN. Just as people use different languages to communicate, we expect that in the future KANs will be only one of the languages of AI + Science, although KANs will be among the very first languages that enable AI and humans to communicate. Enabled by KANs, however, the AI-scientist collaboration paradigm has never been this easy and convenient, which leads us to rethink how we want to advance AI + Science: do we want AI scientists, or do we want AI that helps scientists? The intrinsic difficulty of (fully automated) AI scientists is that human preferences are hard to quantify and hard to encode into AI objectives. In fact, scientists in different fields may feel differently about which functions are simple or interpretable. Consequently, scientists would rather have an AI that speaks the scientific language (functions) and can conveniently interact with the inductive biases of individual scientists, adapting to a specific scientific domain.
