赞
踩
今天在拆分以下数据集的时候
area,perimeter,compactness,lengthOfKernel,widthOfKernel,asymmetryCoefficient,lengthOfKernelGroove
15.26,14.84,0.871,5.763,3.312,2.221,5.22
14.88,14.57,0.8811,5.554,3.333,1.018,4.956
14.29,14.09,0.905,5.291,3.337,2.699,4.825
13.84,13.94,0.8955,5.324,3.379,2.259,4.805
16.14,14.99,0.9034,5.658,3.562,1.355,5.175
14.38,14.21,0.8951,5.386,3.312,2.462,4.956
14.69,14.49,0.8799,5.563,3.259,3.586,5.219
14.11,14.1,0.8911,5.42,3.302,2.7,5
16.63,15.46,0.8747,6.053,3.465,2.04,5.877
16.44,15.25,0.888,5.884,3.505,1.969,5.533
15.26,14.85,0.8696,5.714,3.242,4.543,5.314
14.03,14.16,0.8796,5.438,3.201,1.717,5.001
13.89,14.02,0.888,5.439,3.199,3.986,4.738
13.78,14.06,0.8759,5.479,3.156,3.136,4.872
13.74,14.05,0.8744,5.482,3.114,2.932,4.825
14.59,14.28,0.8993,5.351,3.333,4.185,4.781
13.99,13.83,0.9183,5.119,3.383,5.234,4.781
15.69,14.75,0.9058,5.527,3.514,1.599,5.046
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
我得到了这么一个错误:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1950)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at homework.Seed$.main(Seed.scala:21)
at homework.Seed.main(Seed.scala)
Caused by: java.io.NotSerializableException: java.lang.Object
Serialization stack:
- object not serializable (class: java.lang.Object, value: java.lang.Object@118102ee)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 1)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class homework.Seed$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic homework/Seed$.$anonfun$main$2:(Ljava/lang/Object;Ljava/lang/String;)Lscala/collection/Iterable;, instantiatedMethodType=(Ljava/lang/String;)Lscala/collection/Iterable;, numCaptured=1])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class homework.Seed$$$Lambda$3/2139501486, homework.Seed$$$Lambda$3/2139501486@446c3920)
- field (class: org.apache.spark.rdd.RDD$$anonfun$flatMap$1$$anonfun$apply$6, name: cleanF$2, type: interface scala.Function1)
- object (class org.apache.spark.rdd.RDD$$anonfun$flatMap$1$$anonfun$apply$6, <function3>)
- field (class: org.apache.spark.rdd.MapPartitionsRDD, name: f, type: interface scala.Function3)
- object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[3] at flatMap at Seed.scala:21)
- field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1, name: $outer, type: class org.apache.spark.rdd.RDD)
- object (class org.apache.spark.rdd.RDD$$anonfun$collect$1, <function0>)
- field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$collect$1)
- object (class org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 12 more
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
我的代码是这样的
/**
 * One record of the seeds data set.
 *
 * The fields mirror, in order, the columns named in the CSV header row:
 * `area,perimeter,compactness,lengthOfKernel,widthOfKernel,asymmetryCoefficient,lengthOfKernelGroove`.
 *
 * @param area                 value of the `area` column
 * @param perimeter            value of the `perimeter` column
 * @param compactness          value of the `compactness` column
 * @param lengthOfKernel       value of the `lengthOfKernel` column
 * @param widthOfKernel        value of the `widthOfKernel` column
 * @param asymmetryCoefficient value of the `asymmetryCoefficient` column
 * @param lengthOfKernelGroove value of the `lengthOfKernelGroove` column
 */
case class SEED(
  area: Double,
  perimeter: Double,
  compactness: Double,
  lengthOfKernel: Double,
  widthOfKernel: Double,
  asymmetryCoefficient: Double,
  lengthOfKernelGroove: Double
)
/**
 * Reads `seeds.csv` from the classpath, keeps the data rows whose `area`
 * (first column) is greater than 15, and prints each kept row as a [[SEED]].
 */
object Seed {
  import scala.util.Try

  /** Matches lines that begin with a digit; the CSV header row does not. */
  private val dataRow = """^\d""".r

  /** Number of columns consumed to build one [[SEED]]. */
  private val FieldCount = 7

  /** A row is kept only when its `area` is strictly greater than this value. */
  private val MinArea = 15.0

  /**
   * Parses one CSV line into a [[SEED]].
   *
   * @param line a raw line of the input file
   * @return `Some(seed)` when the line starts with a digit, has at least
   *         [[FieldCount]] comma-separated fields that all parse as `Double`,
   *         and its `area` exceeds [[MinArea]]; `None` otherwise, so header
   *         and malformed rows are skipped instead of failing the whole job
   */
  private def parseSeed(line: String): Option[SEED] =
    dataRow.findFirstIn(line).flatMap { _ =>
      val fields = line.split(",")
      if (fields.length < FieldCount) None
      else
        // Each field is parsed exactly once; a non-numeric field yields None.
        Try(fields.take(FieldCount).map(_.toDouble)).toOption.collect {
          case Array(area, perimeter, compactness, kernelLength, kernelWidth, asymmetry, grooveLength)
              if area > MinArea =>
            SEED(area, perimeter, compactness, kernelLength, kernelWidth, asymmetry, grooveLength)
        }
    }

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   * @throws java.io.FileNotFoundException if `seeds.csv` is not on the classpath
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("filter")
    val sc = new SparkContext(conf)
    try {
      // Look the file up directly: concatenating onto getResource("/") gives
      // "nullseeds.csv" when the root lookup fails and only searches the first
      // classpath root.
      val input = Option(Seed.getClass.getResource("/seeds.csv"))
        .map(_.toString)
        .getOrElse(throw new java.io.FileNotFoundException("seeds.csv not found on the classpath"))

      val result = sc.textFile(input).flatMap(line => parseSeed(line).toList)
      result.collect().foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
仔细检查代码,并没有发现什么错误。后来经过排查,发现是我使用的Scala版本和Spark的版本不兼容导致的。我使用的Spark依赖是:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.1</version>
</dependency>
而我使用的Scala SDK版本是2.12.2。
Spark 2.1.1是使用Scala 2.11.8编译的,将Scala SDK换成2.11.8后问题得到解决。
下面给出spark2.1.1编译的相关的所有版本信息:
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.7</java.version>
<maven.version>3.3.9</maven.version>
<sbt.project.name>spark</sbt.project.name>
<slf4j.version>1.7.16</slf4j.version>
<log4j.version>1.2.17</log4j.version>
<hadoop.version>2.2.0</hadoop.version>
<protobuf.version>2.5.0</protobuf.version>
<yarn.version>${hadoop.version}</yarn.version>
<flume.version>1.6.0</flume.version>
<zookeeper.version>3.4.5</zookeeper.version>
<curator.version>2.4.0</curator.version>
<hive.group>org.spark-project.hive</hive.group>
<!-- Version used in Maven Hive dependency -->
<hive.version>1.2.1.spark2</hive.version>
<!-- Version used for internal directory structure -->
<hive.version.short>1.2.1</hive.version.short>
<derby.version>10.12.1.1</derby.version>
<parquet.version>1.8.1</parquet.version>
<hive.parquet.version>1.6.0</hive.parquet.version>
<jetty.version>9.2.16.v20160414</jetty.version>
<javaxservlet.version>3.1.0</javaxservlet.version>
<chill.version>0.8.0</chill.version>
<ivy.version>2.4.0</ivy.version>
<oro.version>2.0.8</oro.version>
<codahale.metrics.version>3.1.2</codahale.metrics.version>
<avro.version>1.7.7</avro.version>
<avro.mapred.classifier>hadoop2</avro.mapred.classifier>
<jets3t.version>0.7.1</jets3t.version>
<aws.kinesis.client.version>1.6.1</aws.kinesis.client.version>
<!-- the producer is used in tests -->
<aws.kinesis.producer.version>0.10.2</aws.kinesis.producer.version>
<!-- org.apache.httpcomponents/httpclient-->
<commons.httpclient.version>4.5.2</commons.httpclient.version>
<commons.httpcore.version>4.4.4</commons.httpcore.version>
<!-- commons-httpclient/commons-httpclient-->
<httpclient.classic.version>3.1</httpclient.classic.version>
<commons.math3.version>3.4.1</commons.math3.version>
<!-- managed up from 3.2.1 for SPARK-11652 -->
<commons.collections.version>3.2.2</commons.collections.version>
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<codehaus.jackson.version>1.9.13</codehaus.jackson.version>
<fasterxml.jackson.version>2.6.5</fasterxml.jackson.version>
<snappy.version>1.1.2.6</snappy.version>
<netlib.java.version>1.1.2</netlib.java.version>
<calcite.version>1.2.0-incubating</calcite.version>
<commons-codec.version>1.10</commons-codec.version>
<commons-io.version>2.4</commons-io.version>
<!-- org.apache.commons/commons-lang/-->
<commons-lang2.version>2.6</commons-lang2.version>
<!-- org.apache.commons/commons-lang3/-->
<commons-lang3.version>3.5</commons-lang3.version>
<datanucleus-core.version>3.2.10</datanucleus-core.version>
<janino.version>3.0.0</janino.version>
<jersey.version>2.22.2</jersey.version>
<joda.version>2.9.3</joda.version>
<jodd.version>3.5.2</jodd.version>
<jsr305.version>1.3.9</jsr305.version>
<libthrift.version>0.9.3</libthrift.version>
<antlr4.version>4.5.3</antlr4.version>
<jpam.version>1.1</jpam.version>
<selenium.version>2.52.0</selenium.version>
<paranamer.version>2.8</paranamer.version>
<maven-antrun.version>1.8</maven-antrun.version>
<commons-crypto.version>1.0.0</commons-crypto.version>
<test.java.home>${java.home}</test.java.home>
<test.exclude.tags></test.exclude.tags>
<!-- When using different JDKs for the build, we can't use Zinc for the jdk8 part. -->
<useZincForJdk8>true</useZincForJdk8>
<!-- Package to use when relocating shaded classes. -->
<spark.shade.packageName>org.spark_project</spark.shade.packageName>
<!-- Modules that copy jars to the build directory should do so under this location. -->
<jars.target.dir>${project.build.directory}/scala-${scala.binary.version}/jars</jars.target.dir>
<!-- Allow modules to enable / disable certain build plugins easily. -->
<build.testJarPhase>prepare-package</build.testJarPhase>
<build.copyDependenciesPhase>none</build.copyDependenciesPhase>
<!--
Dependency scopes that can be overridden by enabling certain profiles. These profiles are
declared in the projects that build assemblies.
For other projects the scope should remain as "compile", otherwise they are not available
during compilation if the dependency is transitive (e.g. "graphx/" depending on "core/" and
needing Hadoop classes in the classpath to compile).
-->
<flume.deps.scope>compile</flume.deps.scope>
<hadoop.deps.scope>compile</hadoop.deps.scope>
<hive.deps.scope>compile</hive.deps.scope>
<parquet.deps.scope>compile</parquet.deps.scope>
<parquet.test.deps.scope>test</parquet.test.deps.scope>
<!--
Overridable test home. So that you can call individual pom files directly without
things breaking.
-->
<spark.test.home>${session.executionRootDirectory}</spark.test.home>
<PermGen>64m</PermGen>
<MaxPermGen>512m</MaxPermGen>
<CodeCacheSize>512m</CodeCacheSize>
</properties>
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
版本兼容问题真的特别特别重要!!!!!!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。