- package mllib
- import org.apache.spark.{SparkConf, SparkContext}
- import org.apache.spark.mllib.linalg.DenseVector
- import org.apache.spark.mllib.regression.LabeledPoint
- import org.apache.spark.mllib.util.MLUtils
- import org.apache.spark.rdd.RDD
- import scala.util.Random
/**
 * Generates a random labeled data set and saves it with
 * `MLUtils.saveAsLibSVMFile`, so that it can later be loaded as Naive Bayes input.
 *
 * Every feature value is a non-negative integer stored as a `Double`.
 *
 * created by LMR on 2019/6/10
 */
object NaiveBayesDataProduce {

  /** Number of labeled points to generate. */
  private val NumInstances = 1000

  /** Number of features in each point. */
  private val NumFeatures = 3

  /** Number of distinct labels; labels are drawn from `0 until NumClasses`. */
  private val NumClasses = 3

  /** Exclusive upper bound for feature values; features are drawn from `0 until FeatureUpperBound`. */
  private val FeatureUpperBound = 5

  /** Output location used when no path is passed on the command line. */
  private val DefaultOutputPath = "E://output"

  /**
   * Builds the random data set in driver memory and writes it out through Spark.
   *
   * @param args optional; `args(0)` overrides the output path (defaults to `E://output`)
   */
  def main(args: Array[String]): Unit = {
    val outputPath = args.headOption.getOrElse(DefaultOutputPath)
    val random = new Random()
    val data: Array[LabeledPoint] = Array.fill(NumInstances)(randomPoint(random))

    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("naiveBayes")
    val sc = new SparkContext(conf)
    try {
      val dataRDD: RDD[LabeledPoint] = sc.parallelize(data)
      MLUtils.saveAsLibSVMFile(dataRDD, outputPath)
    } finally {
      // Always release the context, even if writing the output fails.
      sc.stop()
    }
  }

  /** Draws one point: `NumFeatures` random feature values first, then a random label. */
  private def randomPoint(random: Random): LabeledPoint = {
    val features = Array.fill(NumFeatures)(random.nextInt(FeatureUpperBound).toDouble)
    val label = random.nextInt(NumClasses).toDouble
    LabeledPoint(label, new DenseVector(features))
  }
}

- package mllib
- import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
- import org.apache.spark.mllib.regression.LabeledPoint
- import org.apache.spark.mllib.util.{MLUtils, SVMDataGenerator}
- import org.apache.spark.rdd.RDD
- import org.apache.spark.{SparkConf, SparkContext}
- import org.apache.log4j.{Level, Logger}
/**
 * Trains a multinomial Naive Bayes model on data loaded with
 * `MLUtils.loadLibSVMFile` and prints sample predictions and the test accuracy.
 *
 * created by LMR on 2019/6/10
 */
object Naive_Bayes { // feature values are required to be non-negative
  // Runs once, when the object is initialised: only log errors.
  Logger.getRootLogger.setLevel(Level.ERROR)

  /** Input location used when no path is passed on the command line. */
  private val DefaultInputPath = "E://output"

  /**
   * Loads the data, splits it 60/40 into training and test sets, trains the model,
   * then prints up to 20 `prediction<TAB>label` lines followed by the accuracy.
   *
   * @param args optional; `args(0)` overrides the input path (defaults to `E://output`)
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("naiveBayes")
    val sc = new SparkContext(conf)
    try {
      val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, inputPath)

      // Split into training and test sets; the fixed seed keeps the split reproducible.
      val Array(train, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)

      // Build the Naive Bayes model.
      val model: NaiveBayesModel = NaiveBayes.train(train, lambda = 1.0, modelType = "multinomial")

      // Predict on the test set. Cached because several actions below reuse it,
      // which would otherwise re-run the predictions each time.
      val predictionAndLabel: RDD[(Double, Double)] =
        test.map(p => (model.predict(p.features), p.label)).cache()

      predictionAndLabel.take(20).foreach { case (predicted, actual) =>
        println(s"$predicted\t$actual")
      }

      // Accuracy = correct predictions / all test points.
      // An empty test set yields 0.0 / 0, i.e. NaN, rather than an exception.
      val total: Long = predictionAndLabel.count()
      val correct: Long = predictionAndLabel.filter { case (predicted, actual) => predicted == actual }.count()
      val accuracy: Double = correct.toDouble / total
      println(accuracy)
    } finally {
      // Always release the context, even if loading or training fails.
      sc.stop()
    }
  }
}

