当前位置:   article > 正文

spark随机森林算法的应用

spark df randomforest

应用场景:预测反欺诈用户

1、构建用户画像,用户画像由多种业务指标组成(例如用户申请后回访次数、ip城市与gps城市是否一致等等)

2、提取用户画像的业务指标--->将指标数值变成向量和矩阵

3、先验数据集:总数据条数:75568 正常用户条数:72723  确定欺诈用户条数:2845

     训练数据集条数:60162     预测数据集:15406

155722_LI9b_3455048.png

4、使用RandomForestClassifier算法

代码如下

  1. package mllib
  2. import org.apache.spark.ml.Pipeline
  3. import org.apache.spark.ml.classification.RandomForestClassifier
  4. import org.apache.spark.ml.feature._
  5. import org.apache.spark.sql.{DataFrame, SparkSession}
  6. /**
  7. * Created by dongdong on 17/6/16.
  8. */
  /**
    * One parsed row of the user-profile feature table.
    *
    * Instances are built by `UserProfile_Forest.main` from a delimited text row.
    * Field order matters: the constructor is called positionally elsewhere in
    * this file, so new fields must not be inserted in the middle.
    *
    * @param cid   identifier taken from the first field of the input row
    * @param label class label, kept as the raw string read from the second field
    * @param text  two text fields of the input row joined with `|`
    *
    * The remaining `f2` .. `f32`, `f38` and `f39` parameters are numeric
    * indicators read from the input row.
    */
  case class Feature(
    cid: String,
    label: String,
    f2: Double,
    f3: Double,
    f4: Double,
    f5: Double,
    f6: Double,
    f7: Double,
    f8: Double,
    f9: Double,
    f10: Double,
    f11: Double,
    f12: Double,
    f13: Double,
    f14: Double,
    f15: Double,
    f16: Double,
    f17: Double,
    f18: Double,
    f19: Double,
    f20: Double,
    f21: Double,
    f22: Double,
    f23: Double,
    f24: Double,
    f25: Double,
    f26: Double,
    f27: Double,
    f28: Double,
    f29: Double,
    f30: Double,
    f31: Double,
    f32: Double,
    text: String,
    f38: Double,
    f39: Double
  )
  /**
    * Trains a RandomForestClassifier on the user-profile feature table and
    * reports how many good and bad users in the held-out split were predicted
    * correctly.
    *
    * Pipeline stages, in order: label indexing, tokenising the `text` column on
    * `|`, Word2Vec embedding of the tokens, assembling all numeric columns plus
    * the embedding into one vector, the classifier, and mapping the predicted
    * index back to the original label string.
    */
  object UserProfile_Forest {

    /**
      * Field separator of the input rows: the control character U+0001.
      * Written as a Unicode escape because the octal form "\001" is deprecated
      * since Scala 2.11 and rejected by Scala 2.13.
      */
    private val FieldDelimiter = "\u0001"

    /** Marker that stands for a missing value in the input rows (backslash followed by N). */
    private val HiveNull = "\\N"

    /** Highest column index read by `parseLine` is 48, so a row needs at least 49 fields. */
    private val MinFieldCount = 49

    /**
      * Converts one delimited input row into a [[Feature]].
      *
      * Numeric fields equal to the null marker are read as 0. Column positions
      * are not contiguous: f30..f32 come from columns 35..37, `text` from
      * columns 40 and 41, and f38/f39 from columns 47/48. Columns 44..46 were
      * already disabled (commented out) in the original code and stay unread.
      *
      * @param line one raw row of the input file
      * @return the parsed feature record
      * @throws IllegalArgumentException if the row has fewer than 49 fields
      * @throws NumberFormatException    if a numeric field is neither a number nor the null marker
      */
    private def parseLine(line: String): Feature = {
      // limit = -1 keeps trailing empty fields, so the length check below sees the real column count.
      val fields = line.split(FieldDelimiter, -1)
      require(
        fields.length >= MinFieldCount,
        s"Expected at least $MinFieldCount fields per row but found ${fields.length}"
      )

      def num(index: Int): Double = fields(index).replace(HiveNull, "0").toDouble

      Feature(
        cid = fields(0),
        label = fields(1),
        f2 = num(2),
        f3 = num(3),
        f4 = num(4),
        f5 = num(5),
        f6 = num(6),
        f7 = num(7),
        f8 = num(8),
        f9 = num(9),
        f10 = num(10),
        f11 = num(11),
        f12 = num(12),
        f13 = num(13),
        f14 = num(14),
        f15 = num(15),
        f16 = num(16),
        f17 = num(17),
        f18 = num(18),
        f19 = num(19),
        f20 = num(20),
        f21 = num(21),
        f22 = num(22),
        f23 = num(23),
        f24 = num(24),
        f25 = num(25),
        f26 = num(26),
        f27 = num(27),
        f28 = num(28),
        f29 = num(29),
        f30 = num(35),
        f31 = num(36),
        f32 = num(37),
        // The null marker is NOT replaced here; it becomes a token of its own downstream.
        text = fields(40) + "|" + fields(41),
        f38 = num(47),
        f39 = num(48)
      )
    }

    /**
      * Program entry point: reads the feature table, trains on a random 80%
      * split, predicts on the remaining 20% and prints the number of correctly
      * predicted bad (label 1) and good (label 0) users.
      *
      * @param args command-line arguments (unused)
      */
    def main(args: Array[String]): Unit = {
      val inpath = "/user/hive/warehouse/user_profile_tmp_db.db/t_cid_feature/*"

      val spark = SparkSession
        .builder()
        .master("local[3]")
        .appName("UserProfile_Forest")
        .getOrCreate()

      // try/finally guarantees the session is stopped even when a stage fails.
      try {
        import spark.implicits._

        // Build the dataset from the raw text rows.
        val originalData = spark.sparkContext
          .textFile(inpath)
          .map(line => parseLine(line))
          .toDS

        // Label indexer is fitted on the full dataset so every label value is known to it.
        val labelIndexer = new StringIndexer()
          .setInputCol("label")
          .setOutputCol("indexedLabel")
          .fit(originalData)

        val tokenizer = new RegexTokenizer()
          .setInputCol("text")
          .setOutputCol("words")
          .setPattern("\\|")

        val word2Vec = new Word2Vec()
          .setInputCol("words")
          .setOutputCol("feature_one")
          .setVectorSize(100)
          .setMaxIter(20)

        // f2..f32, then the Word2Vec embedding, then f38 and f39 (same order as before).
        val featureColumns =
          (2 to 32).map(i => s"f$i").toArray ++ Array("feature_one", "f38", "f39")

        val vectorAssembler = new VectorAssembler()
          .setInputCols(featureColumns)
          .setOutputCol("featureVector")

        // NOTE(review): numTrees is deliberately kept at 1 as in the original;
        // with a single tree this is effectively one decision tree - confirm this is intended.
        val rfClassifier = new RandomForestClassifier()
          .setLabelCol("indexedLabel")
          .setFeaturesCol("featureVector")
          .setNumTrees(1)

        // Map the predicted index back to the original label string.
        val labelConverter = new IndexToString()
          .setInputCol("prediction")
          .setOutputCol("predictedLabel")
          .setLabels(labelIndexer.labels)

        val Array(trainingData, testData) = originalData.randomSplit(Array(0.8, 0.2))

        val pipeline = new Pipeline().setStages(
          Array(labelIndexer, tokenizer, word2Vec, vectorAssembler, rfClassifier, labelConverter)
        )

        val model = pipeline.fit(trainingData)
        val predictionResultDF = model.transform(testData)

        // Rows whose predicted label equals the true label.
        val correctPredictions = predictionResultDF
          .select("cid", "label", "predictedLabel")
          .filter($"label" === $"predictedLabel")

        // `label` is a string column compared with an integer literal, exactly as in the original.
        val correctBadUserCount = correctPredictions.filter($"label" === 1).count()
        val correctGoodUserCount = correctPredictions.filter($"label" === 0).count()

        // The original computed these counts but never reported them.
        println(s"Correctly predicted bad users (label 1): $correctBadUserCount")
        println(s"Correctly predicted good users (label 0): $correctGoodUserCount")
      } finally {
        spark.stop()
      }
    }
  }

debug时的一些向量特征如下

  1. +--------------------+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+----+---+---+---+-----+---+---+------------+--------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+-----------------------------------------+----------+--------------+
  2. |cid |label|f2 |f3 |f4 |f5 |f6 |f7 |f8 |f9 |f10|f11|f12|f13|f14|f15|f16|f17|f18|f19|f20|f21|f22|f23|f24 |f25|f26 |f27|f28|f29 |f30|f31|f32|text |f38|f39|indexedLabel|words |feature_one |featureVector |rawPrediction |probability |prediction|predictedLabel|
  3. +--------------------+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+----+---+---+---+-----+---+---+------------+--------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+-----------------------------------------+----------+--------------+
  4. |2**60327000*0017**12|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  5. |20**050300000031**55|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  6. |20**051800000043**09|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  7. |20**051900000044**35|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  8. |20**052100000047**47|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  9. |20**052600000051**75|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  10. |20**053100000057**95|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  11. |20**060100000057**90|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  12. |20**060300000060**02|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  13. |20**061500000072**13|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  14. |20**061700000073**10|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  15. |20**061700000074**37|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  16. |20**061900000077**27|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  17. |20**062100000080**02|1 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|1.0|0.0|0.0|0.0|1.0|1.0|0.0|0.0|21.0|1.0|20.0|0.0|0.0|18.0|1.0|1.0|1.0|\N|\N|0.5|1.0|1.0 |[\n, \n]|[-0.6971070766448975]|(34,[14,18,19,22,23,24,27,28,29,30,31,32,33],[1.0,1.0,1.0,21.0,1.0,20.0,18.0,1.0,1.0,1.0,-0.6971070766448975,0.5,1.0]) |[0.8320209973753281,0.1679790026246719] |[0.8320209973753281,0.1679790026246719] |0.0 |0 |
  18. |20**062400000083**16|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  19. |20**062400000084**81|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  20. |20**070500000098**50|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  21. |20**070600000099**12|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  22. |20**070900000102**72|0 |1.0|0.0|0.0|0.0|0.0|0.0|1.0|0.0|0.0|1.0|0.0|0.0|0.0|1.0|1.0|1.0|0.0|0.0|1.0|1.0|0.0|0.0|44.0|1.0|43.0|0.0|1.0|14.0|1.0|1.0|1.0|\N|高 |0.0|0.0|0.0 |[\n, 高] |[-0.8806669116020203]|(34,[0,6,9,13,14,15,18,19,22,23,24,26,27,28,29,30,31],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,44.0,1.0,43.0,1.0,14.0,1.0,1.0,1.0,-0.8806669116020203])|[0.9938829787234043,0.006117021276595745]|[0.9938829787234043,0.006117021276595745]|0.0 |0 |
  23. |20**071700000112**30|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 |
  24. +--------------------+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+----+---+---+---+-----+---+---+------------+--------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+-----------------------------------------+----------+--------------+

结果:

程序预测为欺诈用户的有73人,其中确实为欺诈用户的有57人,另有16人预测不正确(本身是正常用户却被预测为欺诈用户)

程序预测为正常用户的有14995人,其中确实为正常用户的有14498人,另有407人预测不正确(本身是欺诈用户却被预测为正常用户)(注:14498 + 407 = 14905,与14995不符,原文数字疑有笔误)

优化方向:1、gooduser 数据量和baduser数据量分布不均匀,可以提取更多baduser的先验数据

               2、用户画像中的指标数据不全,导致有些用户的指标为null

               3、根据业务设置更多有用指标

总结:也使用过LR(逻辑回归)算法和k-means算法,但是效果不怎么好

 

转载于:https://my.oschina.net/u/3455048/blog/1031391

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/693273
推荐阅读
相关标签
  

闽ICP备14008679号