The MLlib KMeans class is constructed with the following parameters:

```scala
class KMeans private (
    private var k: Int,
    private var maxIterations: Int,
    private var initializationMode: String,
    private var initializationSteps: Int,
    private var epsilon: Double,
    private var seed: Long) extends Serializable
```
| Parameter | Meaning |
| --- | --- |
| k | number of clusters |
| maxIterations | maximum number of iterations |
| initializationMode | initialization mode: either random or k-means\|\| |
| initializationSteps | number of steps used by the k-means\|\| initialization |
| epsilon | convergence threshold: the minimum distance a center must move to be treated as changed |
| seed | random seed |
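These parameters are not passed to the private constructor directly; they are set through the builder-style setters of the public API. A minimal usage sketch, assuming an existing SparkContext `sc` and a toy data set:

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// `sc` is an existing SparkContext (assumed for this sketch).
val data = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
  Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

// Each setter corresponds to one of the constructor parameters above.
val kmeans = new KMeans()
  .setK(2)
  .setMaxIterations(20)
  .setInitializationMode(KMeans.K_MEANS_PARALLEL) // or KMeans.RANDOM
  .setInitializationSteps(2)
  .setEpsilon(1e-4)
  .setSeed(42L)

val model = kmeans.run(data)
```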
Center initialization picks one of the two supported modes:

```scala
if (initializationMode == KMeans.RANDOM) {
  initRandom(data)
} else {
  initKMeansParallel(data)
}
```
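initRandom samples k points from the data set as the starting centers, while initKMeansParallel runs the k-means|| seeding algorithm (sampling candidate centers over initializationSteps passes and then reclustering them). A minimal sketch of the random variant, not the actual MLlib implementation:

```scala
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// "random" strategy in a nutshell: draw k points from the data and use them as seeds.
def randomInit(data: RDD[Vector], k: Int, seed: Long): Array[Vector] =
  data.takeSample(withReplacement = false, num = k, seed = seed)
```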
Before the iterations start, the L2 norm of every point is computed and cached:

```scala
// Compute squared norms and cache them.
val norms = data.map(Vectors.norm(_, 2.0))
norms.persist()
```
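In the Spark source the cached norms are then zipped back with the original vectors, so each point carries its precomputed L2 norm into the later distance computations; roughly:

```scala
// Pair each vector with its precomputed norm (this is what the later
// VectorWithNorm-based distance code relies on).
val zippedData = data.zip(norms).map { case (v, norm) =>
  new VectorWithNorm(v, norm)
}
```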
The norm itself comes from Vectors.norm; the p == 2 branch computes the Euclidean norm:

```scala
else if (p == 2) {
  var sum = 0.0
  var i = 0
  while (i < size) {
    sum += values(i) * values(i)
    i += 1
  }
  math.sqrt(sum)
}
```
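For example, for the vector (3, 4) the p = 2 branch gives 5.0:

```scala
import org.apache.spark.mllib.linalg.Vectors

val n = Vectors.norm(Vectors.dense(3.0, 4.0), 2.0)  // math.sqrt(3*3 + 4*4) = 5.0
```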
Distances between a point and a center are computed by fastSquaredDistance, which reuses the precomputed norms and guards the shortcut with a precision bound:

```scala
val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
val normDiff = norm1 - norm2
val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
if (precisionBound1 < precision) {
  sqDist = sumSquaredNorm - 2.0 * dot(v1, v2)
}
```
When the precision bound is satisfied (precision: Double = 1e-6), the squared Euclidean distance is computed as sqDist = sumSquaredNorm - 2.0 * dot(v1, v2), where sumSquaredNorm is ‖v1‖² + ‖v2‖² and 2.0 * dot(v1, v2) is 2·v1ᵀv2, so sqDist equals ‖v1 - v2‖². The findClosest method shown below additionally uses lowerBoundOfSqDist = (norm1 - norm2) * (norm1 - norm2) as a cheap lower bound to skip full distance computations that cannot beat the current best.
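The two facts being used are the expansion of the squared Euclidean distance and the triangle-inequality lower bound:

```latex
\|v_1 - v_2\|^2 = \|v_1\|^2 + \|v_2\|^2 - 2\,v_1^{\top} v_2

\|v_1 - v_2\| \ge \bigl|\|v_1\| - \|v_2\|\bigr|
\;\Longrightarrow\;
\|v_1 - v_2\|^2 \ge \left(\|v_1\| - \|v_2\|\right)^2
```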
```scala
private[mllib] def findClosest(
    centers: TraversableOnce[VectorWithNorm],
    point: VectorWithNorm): (Int, Double) = {
  var bestDistance = Double.PositiveInfinity
  var bestIndex = 0
  var i = 0
  centers.foreach { center =>
    // Since `\|a - b\| \geq |\|a\| - \|b\||`, we can use this lower bound to avoid unnecessary
    // distance computation.
    var lowerBoundOfSqDist = center.norm - point.norm
    lowerBoundOfSqDist = lowerBoundOfSqDist * lowerBoundOfSqDist
    if (lowerBoundOfSqDist < bestDistance) {
      val distance: Double = fastSquaredDistance(center, point)
      if (distance < bestDistance) {
        bestDistance = distance
        bestIndex = i
      }
    }
    i += 1
  }
  (bestIndex, bestDistance)
}
```
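The same pruning idea, stripped of the Spark-specific VectorWithNorm type; a self-contained sketch over plain (vector, norm) pairs:

```scala
// Self-contained sketch of the lower-bound pruning used in findClosest.
def findClosestSketch(
    centers: Array[(Array[Double], Double)],   // (vector, precomputed L2 norm)
    point: (Array[Double], Double)): (Int, Double) = {
  def sqDist(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum

  var bestDistance = Double.PositiveInfinity
  var bestIndex = 0
  centers.zipWithIndex.foreach { case ((cVec, cNorm), i) =>
    // (||c|| - ||p||)^2 is a cheap lower bound on ||c - p||^2.
    val lowerBound = (cNorm - point._2) * (cNorm - point._2)
    if (lowerBound < bestDistance) {           // only then pay for the full distance
      val d = sqDist(cVec, point._1)
      if (d < bestDistance) { bestDistance = d; bestIndex = i }
    }
  }
  (bestIndex, bestDistance)
}
```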
After each iteration, the per-cluster sums and counts are turned into new centers and the convergence flag is updated:

```scala
totalContribs.foreach { case (j, (sum, count)) =>
  scal(1.0 / count, sum)
  val newCenter = new VectorWithNorm(sum)
  if (converged && KMeans.fastSquaredDistance(newCenter, centers(j)) > epsilon * epsilon) {
    converged = false
  }
  centers(j) = newCenter
}
```
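In other words, each new center is the mean of the points assigned to it, and the run is converged once no center moves by more than epsilon. A plain-Scala sketch of this update step (hypothetical helper, not the MLlib code):

```scala
// Convergence test: a center has "moved" only if it shifted by more than epsilon.
def updateCenters(
    assigned: Map[Int, Seq[Array[Double]]],    // cluster id -> points assigned to it
    centers: Array[Array[Double]],
    epsilon: Double): Boolean = {
  var converged = true
  assigned.foreach { case (j, points) =>
    val dim = centers(j).length
    // New center = component-wise mean of the assigned points.
    val mean = Array.tabulate(dim)(d => points.map(_(d)).sum / points.size)
    val movedSq = centers(j).zip(mean).map { case (a, b) => (a - b) * (a - b) }.sum
    if (movedSq > epsilon * epsilon) converged = false
    centers(j) = mean
  }
  converged
}
```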
- @Since("0.8.0")
- class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector])
- extends Saveable with Serializable with PMMLExportable {
-
- /**
- * A Java-friendly constructor that takes an Iterable of Vectors.
- */
- @Since("1.4.0")
- def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray)
-
- /**
- * Total number of clusters.
- */
- @Since("0.8.0")
- def k: Int = clusterCenters.length
-
- /**
- * Returns the cluster index that a given point belongs to.
- */
- @Since("0.8.0")
- def predict(point: Vector): Int = {
- KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1
- }
-
- /**
- * Maps given points to their cluster indices.
- */
- @Since("1.0.0")
- def predict(points: RDD[Vector]): RDD[Int] = {
- val centersWithNorm = clusterCentersWithNorm
- val bcCentersWithNorm = points.context.broadcast(centersWithNorm)
- points.map(p => KMeans.findClosest(bcCentersWithNorm.value, new VectorWithNorm(p))._1)
- }
-
- /**
- * Maps given points to their cluster indices.
- */
- @Since("1.0.0")
- def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
- predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]
-
- /**
- * Return the K-means cost (sum of squared distances of points to their nearest center) for this
- * model on the given data.
- */
- @Since("0.8.0")
- def computeCost(data: RDD[Vector]): Double = {
- val centersWithNorm = clusterCentersWithNorm
- val bcCentersWithNorm = data.context.broadcast(centersWithNorm)
- data.map(p => KMeans.pointCost(bcCentersWithNorm.value, new VectorWithNorm(p))).sum()
- }
-
- private def clusterCentersWithNorm: Iterable[VectorWithNorm] =
- clusterCenters.map(new VectorWithNorm(_))
-
- @Since("1.4.0")
- override def save(sc: SparkContext, path: String): Unit = {
- KMeansModel.SaveLoadV1_0.save(sc, this, path)
- }
-
- override protected def formatVersion: String = "1.0"
- }
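A quick usage sketch of the model API (continuing the earlier example; `model` comes from `kmeans.run(data)`, `sc` is the SparkContext, and the save path is just an illustration):

```scala
// Cluster index for a single point.
val cluster = model.predict(Vectors.dense(9.0, 9.2))

// Cluster indices for a whole RDD of points.
val assignments = model.predict(data)

// Within-cluster sum of squared distances (WSSSE) on the training data.
val wssse = model.computeCost(data)

// Persist and reload the model.
model.save(sc, "/tmp/kmeans-model")
val reloaded = org.apache.spark.mllib.clustering.KMeansModel.load(sc, "/tmp/kmeans-model")
```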
- @Since("0.8.0")
- def predict(point: Vector): Int = {
- KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1
- }
看了熟悉的函数findClosest,那些中心点是在聚类结束创建中心点
new KMeansModel(centers.map(_.vector))
- @Since("0.8.0")
- def computeCost(data: RDD[Vector]): Double = {
- val centersWithNorm = clusterCentersWithNorm
- val bcCentersWithNorm = data.context.broadcast(centersWithNorm)
- data.map(p => KMeans.pointCost(bcCentersWithNorm.value, new VectorWithNorm(p))).sum()
- }
The algorithm behind this function is the within-cluster sum of squared errors (WSSSE): each point contributes the squared distance to its nearest center (pointCost), and these contributions are summed over the whole data set.
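Written as a formula, this is the standard k-means objective:

```latex
\mathrm{cost} = \sum_{i=1}^{n} \min_{1 \le j \le k} \lVert x_i - c_j \rVert^2
```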