Installing Spark on Windows 10: https://blog.csdn.net/songhaifengshuaige/article/details/79480491
See also: configuring a Python + Spark development environment on Windows.

Note: when installing, make sure the Python, Spark, and JDK versions are compatible with one another.
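Before running the examples, it can help to confirm which Python and PySpark versions the interpreter actually sees; a minimal sanity check (assuming PySpark is already installed for this interpreter) is:

    import sys
    import pyspark  # raises ImportError here if PySpark is missing

    print(sys.version)          # Python version used by the driver
    print(pyspark.__version__)  # installed PySpark version

The examples that follow read a file student.txt, where each row is a name followed by three subject scores: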
    yang 85 90 30
    wang 20 60 50
    zhang 90 90 100
    zhang 90 90 100
    li 100 54 0
    li 100 54 0
    yanf 0 0 0
A quick warm-up with reduce() and type(), both of which the example below relies on:

    #r = reduce(lambda x, y: x+y, [4,4,5,5])  # using an anonymous lambda
    from functools import reduce

    def add(x, y):  # add two numbers
        return x + y

    print(reduce(add, [1,2,3,4,5]))

    print(reduce(lambda x, y: x+y, [1,2,3,4,5]))

    print(type(1))

    print(type([2]))

    class y(object):
        z = 5

    x = type('y', (object,), dict(z=5))
    print(x)
    #studentExample: student-score exercise
    def map_func(x):
        s = x.split()
        # Return (key, value): key is the name s[0], value is a
        # three-element list of the subject scores.
        return (s[0], [int(s[1]), int(s[2]), int(s[3])])
        #return (s[0], [int(s[1], s[2], s[3])])  # note: this form is invalid

    def has100(x):
        # x is the score list; True if any subject scored 100
        for y in x:
            if y == 100:
                return True
        return False

    def allis0(x):
        # True if x is a list and the total score is 0
        if type(x) == list and sum(x) == 0:
            return True
        return False

    def subMax(x, y):
        # Element-wise maximum of two score lists
        m = [x[1][i] if x[1][i] > y[1][i] else y[1][i] for i in range(3)]
        return ('Maximum subject score', m)

    def sumSub(x, y):
        # Element-wise sum of two score lists
        n = [x[1][i] + y[1][i] for i in range(3)]
        # or: n = [x[1][0]+y[1][0], x[1][1]+y[1][1], x[1][2]+y[1][2]]
        return ('Total subject score', n)

    def sumPer(x):
        # Per-student total score
        return (x[0], sum(x[1]))
    # Stop any previous SparkContext, otherwise re-running or creating a new
    # one fails; a bare sc.stop() also works, but errors on the first run.
    try:
        sc.stop()
    except:
        pass

    from pyspark import SparkContext          # import the module
    sc = SparkContext(appName='Student')      # name the application
    # Load the data and keep it in memory; cache() keeps the RDD in memory.
    lines = sc.textFile("student.txt").map(lambda x: map_func(x)).cache()
    count = lines.count()  # number of records in the RDD (one record per line)
    # RDD 'transformation' operations (filtering with filter)
    # Note: these operate on the value list, i.e. x[1].
    whohas100 = lines.filter(lambda x: has100(x[1])).collect()
    whois0 = lines.filter(lambda x: allis0(x[1])).collect()
    sumScore = lines.map(lambda x: (x[0], sum(x[1]))).collect()

    # 'Action' operations
    maxScore = max(sumScore, key=lambda x: x[1])  # highest total score
    minScore = min(sumScore, key=lambda x: x[1])  # lowest total score
    sumSubScore = lines.reduce(lambda x, y: sumSub(x, y))
    avgScore = [x / count for x in sumSubScore[1]]  # per-subject average

    # RDD key-value 'transformation' operations
    subM = lines.reduce(lambda x, y: subMax(x, y))
    # Merge the values of equal keys: x[0]+y[0], x[1]+y[1], x[2]+y[2]
    redByK = lines.reduceByKey(lambda x, y: [x[i] + y[i] for i in range(3)]).collect()

    # RDD 'transformation' operations
    sumPerSore = lines.map(lambda x: sumPer(x)).collect()  # each student's total
    sortedRDD = lines.sortBy(lambda x: sum(x[1]))  # sort by total, low to high
    sortedWithRank = sortedRDD.zipWithIndex().collect()  # attach a rank index
    first3 = sortedRDD.takeOrdered(3, key=lambda x: -sum(x[1]))  # top three totals

    # Write the top three to a file, space-separated; note that
    # saveAsTextFile() returns None, so first3RDD below is None.
    first3RDD = sc.parallelize(first3)\
        .map(lambda x: str(x[0])+' '+str(x[1][0])+' '+str(x[1][1])+' '+str(x[1][2]))\
        .saveAsTextFile("result")
    #print(lines.collect())
    print("Number of records (lines):", count)
    print("Scored 100 in a subject:", whohas100)
    print("Scored 0 in every subject:", whois0)
    print("Highest score per subject:", subM)
    print("Total per subject:", sumSubScore)
    print("Scores merged by name:", redByK)
    print("Total per student:", sumPerSore)
    print("Highest total:", maxScore)
    print("Lowest total:", minScore)
    print("Average per subject:", avgScore)
    print("Ranked by total, ascending:", sortedWithRank)
    print("Top three totals:", first3)
    print(first3RDD)
    sc.stop()
Output:

    Number of records (lines): 7
    Scored 100 in a subject: [('li', [100, 54, 0]), ('li', [100, 54, 0])]
    Scored 0 in every subject: [('yanf', [0, 0, 0])]
    Highest score per subject: ('Maximum subject score', [100, 90, 100])
    Total per subject: ('Total subject score', [485, 438, 280])
    Scores merged by name: [('zhang', [180, 180, 200]), ('li', [200, 108, 0]), ('yang', [85, 90, 30]), ('wang', [20, 60, 50]), ('yanf', [0, 0, 0])]
    Total per student: [('yang', 205), ('wang', 130), ('zhang', 280), ('zhang', 280), ('li', 154), ('li', 154), ('yanf', 0)]
    Highest total: ('zhang', 280)
    Lowest total: ('yanf', 0)
    Average per subject: [69.28571428571429, 62.57142857142857, 40.0]
    Ranked by total, ascending: [(('yanf', [0, 0, 0]), 0), (('wang', [20, 60, 50]), 1), (('li', [100, 54, 0]), 2), (('li', [100, 54, 0]), 3), (('yang', [85, 90, 30]), 4), (('zhang', [90, 90, 100]), 5), (('zhang', [90, 90, 100]), 6)]
    Top three totals: [('zhang', [90, 90, 100]), ('zhang', [90, 90, 100]), ('yang', [85, 90, 30])]
    None
Properties of RDDs

Basic RDD "transformation" operations are lazy: they do not execute immediately and display nothing. collect() is an "action" operation, which triggers execution at once and returns the results.
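A minimal sketch of this behavior (assuming a live SparkContext named sc, as in the example above):

    # map() is a transformation: it only records the lineage, nothing runs yet
    rdd = sc.parallelize([1, 2, 3]).map(lambda v: v * 10)
    # collect() is an action: it triggers the actual job
    print(rdd.collect())  # [10, 20, 30]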
The reduce() function cumulatively applies a function to the elements of a sequence.

Syntax:

    reduce(function, iterable[, initializer])

Example (note: Python 3 removed reduce() from the built-ins; it now lives in the functools module):
    In [24]:

    #r = reduce(lambda x, y: x+y, [4,4,5,5])  # using an anonymous lambda
    from functools import reduce
    def add(x, y):  # add two numbers
        return x + y
    reduce(add, [1,2,3,4,5])

    Out[24]:
    15

    In [25]:

    reduce(lambda x, y: x+y, [1,2,3,4,5])  # using an anonymous lambda

    Out[25]:
    15
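The optional initializer seeds the accumulation before the first element is consumed; continuing the session above:

    In [26]:

    reduce(add, [1,2,3,4,5], 100)  # the running sum starts at 100

    Out[26]:
    115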
class type(name, bases, dict)

With one argument, type() returns the type of an object; with three arguments, it returns a new type (class) object.
    # One-argument examples

    In [1]:

    type(1)
    Out[1]:
    int

    In [2]:

    type([2])
    Out[2]:
    list

    In [3]:

    type({3:'three'})
    Out[3]:
    dict

    In [5]:

    x = 5
    type(x) == list  # check whether x is a list
    Out[5]:
    False

    # Three-argument example

    class y(object):
        z = 5

    x = type('y', (object,), dict(z=5))

    print(x)

    <class '__main__.y'>  # a new type object is created
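Since x is an ordinary class object, it can be instantiated like any other class; a quick check, continuing the session above:

    print(x().z)            # 5 -- instances see the class attribute z
    print(type(x) is type)  # True -- classes are themselves instances of type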
If a job fails with an error like the following:

    An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
    : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 2.0 failed 1 times, most recent failure: Lost task 1.0 in stage 2.0 (TID 5, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):

check that:

1. the spelling is correct;
2. the indentation is consistent;
3. every ( has a matching ).

The next exercises operate on user_small, a tab-separated web-access log. Sample rows:
    1441900799.728000 1441900802.452000 8618245698655 0134730038729312 2 1 1 IPHONE_5 17999 20693 10.67.23.157 111.13.34.100 6 58986 80 GET mmsns.qpic.cn /mmsns/PdibpV1sFDHdaOTqNXb8VGSNicyYpOVa9R7icxSr4BkwbsSyzJbBTmE5Zz5aZichejbkKuia7twzraqk/150?tp=webp&length=1136&width=640 weixin.qq.com/?version=369229843&uin=2925174340&nettype=0&scene=moment WeChat/6.2.0.19 CFNetwork/711.3.18 Darwin/14.0.0 200 59 image/webp 7504 706 8212 7 1827
    1441900750.023000 1441900754.063000 8613836044032 0136210021269713 2 1 1 IPHONE_5 17752 25632 10.67.21.71 117.144.242.26 6 52941 80 POST short.weixin.qq.com http://short.weixin.qq.com/cgi-bin/micromsg-bin/tenpay - MicroMessenger Client - - - - 715 0 7 1827
    1441900755.480472 1441900756.762000 8618246899077 0131830068670612 2 1 1 IPHONE_4S 17875 61433 10.67.43.51 120.192.84.86 6 58684 31271 GET i.gtimg.cn http://i.gtimg.cn/qqshow/admindata/comdata/vip_emoji_aio_ios_new_config/xydata.json - QQ/5.7.0.469 CFNetwork/672.0.8 Darwin/14.0.0 304 83 x-json - 0 0 18 1041
    1441900754.860000 1441900755.480472 8618246899077 0131830068670612 2 1 1 IPHONE_4S 17875 61433 10.67.43.51 120.192.84.86 6 58684 31271 GET i.gtimg.cn http://i.gtimg.cn/club/item/avatar/zip/0/i0/all.zip - QQ/5.7.0.469 CFNetwork/672.0.8 Darwin/14.0.0 404 210 text/html 85 487 411 18 1041
    1441900753.786000 1441900755.726000 8618246195634 9900026543899411 2 1 1 IPHONE_4S 17783 19302 10.67.29.55 111.40.194.207 6 49412 80 GET sb.symcd.com /MFYwVKADAgEAME0wSzBJMAkGBSsOAwIaBQAEFDmvGLQcAh85EJZW%2FcbTWO90hYuZBBROQ8gddu83U3pP8lhvlPM44tW93wIQd9jUM82by0%2FVy957MNapGQ%3D%3D - securityd (unknown version) CFNetwork/672.0.2 Darwin/14.0.0 - - - - 522 0 18 1041
    1441900761.308739 1441900761.408000 8615045213668 0127590050857822 2 1 1 IPHONE_4 17772 50621 10.67.63.219 183.232.95.61 6 49337 80 POST szminorshort.weixin.qq.com http://szminorshort.weixin.qq.com/cgi-bin/micromsg-bin/rtkvreport - MicroMessenger Client - - - - 500 16 7 1827
    1441900696.427624 1441900761.308739 8615045213668 0127590050857822 2 1 1 IPHONE_4 17772 50621 10.67.63.219 183.232.95.61 6 49337 80 POST szminorshort.weixin.qq.com http://szminorshort.weixin.qq.com/cgi-bin/micromsg-bin/rtkvreport - MicroMessenger Client - - - - 500 16 7 1827
    1441900693.219000 1441900696.427624 8615045213668 0127590050857822 2 1 1 IPHONE_4 17772 50621 10.67.63.219 183.232.95.61 6 49337 80 POST szminorshort.weixin.qq.com http://szminorshort.weixin.qq.com/cgi-bin/micromsg-bin/rtkvreport - MicroMessenger Client - - - - 502 16 7 1827
    1441900750.845345 1441900753.537000 8618246195634 9900026543899411 2 1 1 IPHONE_4S 17783 19302 10.67.29.55 117.135.169.124 6 49411 80 GET b227.photo.store.qq.com /psb?/V12jlwSP30SPej/VE1V5LlXFMzHeg5gTzpyuCueaEVEGV*0X6BbSyJZRhs!/b/dCWGUIc.HQAA&ek=1&kp=1&pt=0&bo=yAD6AAAAAAABBxI!&t=5 v1_iph_sq_5.6.0_1_app_a-4-2 QQ/5.6.0.438 CFNetwork/672.0.2 Darwin/14.0.0 - - - - 792 0 18 1041
    1441900748.094000 1441900750.845345 8618246195634 9900026543899411 2 1 1 IPHONE_4S 17783 19302 10.67.29.55 117.135.169.124 6 49411 80 GET b227.photo.store.qq.com /psb?/V12jlwSP30SPej/VE1V5LlXFMzHeg5gTzpyuCueaEVEGV*0X6BbSyJZRhs!/b/dCWGUIc.HQAA&ek=1&kp=1&pt=0&bo=yAD6AAAAAAABBxI!&t=5 v1_iph_sq_5.6.0_1_app_a-4-2 QQ/5.6.0.438 CFNetwork/672.0.2 Darwin/14.0.0 - - - - 792 0 18 1041
    #test 1_1: count web-access records per user
    from pyspark import SparkContext
    sc = SparkContext(appName='test1')
    rdd = sc.textFile('user_small')\
        .map(lambda x: x.split('\t'))\
        .map(lambda x: (x[3], 1))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda x: str(x[0]) + ' ' + str(x[1])).collect()
        #.saveAsTextFile('text1_1')  # or: write space-separated output to a file
    print(rdd)
    sc.stop()
Result:

    ['0127590050857822 3', '0134730038729312 1', '0136210021269713 1', '0131830068670612 2', '9900026543899411 3']
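As a sanity check, the per-user counts can be reproduced without Spark using collections.Counter (assuming user_small contains exactly the ten sample rows shown above):

    from collections import Counter

    # Field 3 of each tab-separated row is the user ID, as in the map() above.
    with open('user_small') as f:
        counts = Counter(line.split('\t')[3] for line in f if line.strip())
    print(counts)
    # 0127590050857822 and 9900026543899411 appear 3 times each,
    # 0131830068670612 twice, and the remaining two IDs once each.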
    #test 1_2: per-user uplink and downlink traffic
    def map_func(x):
        s = x.split('\t')
        # Return (key, value): key is the phone number s[2], value is a
        # two-element list [uplink bytes, downlink bytes].
        return (s[2], [int(s[24]), int(s[25])])

    try:
        sc.stop()  # stop any previous SparkContext, or creating a new one fails
    except:
        pass

    from pyspark import SparkContext
    sc = SparkContext(appName='test')
    lines = sc.textFile("user_small").map(lambda x: map_func(x)).cache()
    redByK = lines.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))
    sum_flow = redByK.map(lambda x: str(x[0])+' '+str(x[1][0])+' '+str(x[1][1]))\
        .saveAsTextFile('text1_2')
    sc.stop()
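Note that saveAsTextFile('text1_2') returns None and writes a directory of part files rather than a single file. To inspect the result, the directory can be read back with textFile (a sketch, assuming a live SparkContext):

    print(sc.textFile('text1_2').collect())  # textFile() accepts the output directory path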
    #test 1_2: per-user total traffic
    try:
        sc.stop()  # stop any previous SparkContext, or creating a new one fails
    except:
        pass

    from pyspark import SparkContext
    sc = SparkContext(appName='test1')
    rdd = sc.textFile('user_small')\
        .map(lambda x: x.split('\t'))\
        .map(lambda x: (x[2], int(x[24]) + int(x[25])))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda x: str(x[0]) + ' ' + str(x[1])).collect()
    print(rdd)
    sc.stop()
Result:

    ['8618246899077 898', '8615045213668 1550', '8618245698655 8918', '8613836044032 715', '8618246195634 2106']
    # test 1_3: downlink traffic per user agent, WeChat clients only
    from pyspark import SparkContext

    sc = SparkContext(appName='test1')
    rdd = sc.textFile('user_small') \
        .map(lambda x: x.split('\t')) \
        .map(lambda x: (x[19], int(x[25]))) \
        .filter(lambda x: 'WeChat' in x[0] or 'MicroMessenger' in x[0]) \
        .reduceByKey(lambda x, y: x + y) \
        .map(lambda x: str(x[0]) + ' ' + str(x[1])).collect()

    print(rdd)
    sc.stop()
Result (only the WeChat/MicroMessenger user agents pass the filter):

    ['WeChat/6.2.0.19 CFNetwork/711.3.18 Darwin/14.0.0 8212', 'MicroMessenger Client 48']