赞
踩
前言:由于很多人问我清洗后job.csv里无数据,
经排查,感觉是这几个Python包的版本原因,
在此提供我当时用的这几个包的版本(主要是pandas库)。
一.数据采集(python爬虫)
爬虫这块不多说啦(主打数据分析),采集到的信息先转入excel表格,然后进行清洗、可视化。
#!user/bin/env python3 # -*- coding: utf-8 -*- import re import requests import random,time from lxml import html key = '大数据' headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Encoding": "gzip, deflate", "Accept-Language": "en-US,en;q=0.5", "Connection": "keep-alive", "Host": "jobs.51job.com", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"} def get_urls(page): url0 = 'http://search.51job.com/list/000000,000000,0000,00,9,99,' + key + ',2,' + str(page) + '.html' req0 = requests.get(url0, headers, timeout=10) req0.decoding = 'gbk' result = re.compile(r'class="t1 ">.*? <a target="_blank" title=".*?" href="(.*?)".*? <span class="t2">',re.S)#无re.S只在每一行内匹配 url = re.findall(result, req0.text) return url def str1(x): y = '' for x0 in x: y = y + x0 + ';' return y def get_content(url): req1 = requests.get(url, headers, timeout=10) req1.encoding = 'gbk' t1 = html.fromstring(req1.text) try: job_name = t1.xpath('//div[@class="tHeader tHjob"]//h1/text()')[0].strip() company_name = t1.xpath('//p[@class="cname"]//a/text()')[0].strip() job_place = t1.xpath('//p[@class="msg ltype"]/text()')[0].strip().split('-')[0]#由于有的职位信息只有城市,此处只保留城市 workExperience = t1.xpath('//p[@class="msg ltype"]/text()')[1].strip() educationBackground = t1.xpath('//p[@class="msg ltype"]/text()')[2].strip() require_num = t1.xpath('//p[@class="msg ltype"]/text()')[3].strip() date = t1.xpath('//p[@class="msg ltype"]/text()')[4].strip() Salary = t1.xpath('//div[@class="tHeader tHjob"]//strong/text()')[0].strip() company_type = t1.xpath('//div[@class="com_tag"]/p/text()')[0].strip() company_num = t1.xpath('//div[@class="com_tag"]/p/text()')[1].strip() company_business = str1(t1.xpath('//div[@class="com_tag"]/p//a/text()'))#自定义函数str() job_treatment = str1(re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), req1.text)) job_describe = str1(t1.xpath('//div[@class="bmsg job_msg 
inbox"]/p/text()')) content = str(company_name) + ',' + str(job_name) + ',' + str(job_place) + ',' + str(workExperience) + ',' +\ str(educationBackground) + ',' + str(require_num) + ',' + str(date) + ',' + str(Salary) + ',' + str( company_type) + ',' + str(company_num) + ',' + str(company_business) + ',' + str(job_treatment)+ ',' + str(job_describe) + '\n' file.write(content) return content except Exception as e: print(e) if __name__ == '__main__': file = open('51job.csv', 'w', encoding='gbk') content0 = 'company_name,job_name,job_place,workExperience,educationBackground,require_num,date,Salary,company_type,' \ 'company_num,company_business,job_treatment,job_describe'+'\n' file.write(content0) for page in range(1,200): print('正在爬取第{}页信息'.format(page)) #time.sleep(random.random() + random.randint(1, 5)) urls = get_urls(page) try: for url in urls: try: data = get_content(url) print(data) #time.sleep(random.random() + random.randint(0,1)) except : print('None') except Exception as e: print(e) file.close()
二.数据清洗及处理(用到pandas,numpy等库,需自行安装)
(生成文件job.csv)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Clean the scraped 51job data (data0.xlsx) and produce job.csv."""
import pandas as pd
import numpy as np
import re

# Matches an integer or decimal amount, e.g. '1', '1.5', '20'.
_NUM = re.compile(r'(\d*\.?\d+)')


def get_salary(salary):
    """Normalize a 51job salary string to (low, high) in thousand-RMB/month.

    Handles '1-2万/月', '10-20万/年', '5-8千/月', '20万以上/年', '100元/天'.
    `high` is NaN when the string carries no upper bound. Both values are
    returned as floats (the original left '千/月' amounts as strings, which
    broke numeric use downstream).
    """
    if '-' in salary:  # both bounds present, e.g. 1-2万/月 or 10-20万/年
        # BUG FIX: the original used a second, different pattern (\d?\.?\d+)
        # for the upper bound, which split numbers like '20.5' into '20'+'.5'
        # and silently dropped the fraction.
        low_salary, high_salary = (float(v) for v in _NUM.findall(salary)[:2])
        if u'万' in salary and u'年' in salary:   # unify to thousand/month
            low_salary = round(low_salary / 12 * 10, 2)
            high_salary = round(high_salary / 12 * 10, 2)
        elif u'万' in salary and u'月' in salary:
            low_salary = round(low_salary * 10, 2)
            high_salary = round(high_salary * 10, 2)
    else:  # only a lower bound, e.g. 20万以上/年 or 100元/天
        low_salary = float(_NUM.findall(salary)[0])
        high_salary = np.nan  # np.NAN alias was removed in NumPy 2.0
        if u'万' in salary and u'年' in salary:
            low_salary = round(low_salary / 12 * 10, 2)
        elif u'万' in salary and u'月' in salary:
            low_salary = round(low_salary * 10, 2)
        elif u'元' in salary and u'天' in salary:
            low_salary = round(low_salary / 1000 * 21, 2)  # 21 workdays/month
    return low_salary, high_salary


def get_experience(exp):
    """Return the minimum years of experience mentioned in `exp` as an int
    (0 for '无...' / strings with no leading digits; the original returned a
    string here, leaving the column with mixed types)."""
    if u'无' in exp:
        return 0
    digits = re.findall(r'(\d*)', exp)[0]
    return int(digits) if digits else 0


def data_clear(df):
    """Filter the raw frame down to usable '数据' jobs and write job0.csv.

    The original wrote and re-read job0.csv between every step purely to
    reset the positional index; boolean masks plus reset_index(drop=True)
    make those disk round-trips unnecessary.
    """
    df.drop(columns='job_describe', inplace=True)
    df.dropna(axis=0, how='any', inplace=True)  # drop rows with missing values
    print('清洗有缺失值的数据后的数据shape:', df.shape)
    df = df[df.job_name.str.contains(r'.*?数据.*?')]  # keep '...数据...' titles
    print('挑选职位信息中含有"数据"后的数据shape:', df.shape)
    df = df.reset_index(drop=True)
    # Rows where the education cell actually holds a head-count ('招...人')
    # or an experience requirement come from mis-parsed postings.
    df = df[~df['educationBackground'].str.contains(u'招')].reset_index(drop=True)
    print('清洗学历信息含有招人数后的数据shape:', df.shape)
    df = df[~df['educationBackground'].str.contains(u'经验')].reset_index(drop=True)
    print('清洗学历信息含有工作经验后的数据shape:', df.shape)
    # job_place containing '省' means only a province was scraped, not a city.
    df = df[~df['job_place'].str.contains(u'省')].reset_index(drop=True)
    df.to_csv('job0.csv')
    print('清洗job_place含有省份后的数据shape:', df.shape)


def data_deal():
    """Turn Salary/workExperience into numeric columns and write job.csv."""
    data = pd.read_csv('job0.csv')
    data = data.iloc[:, 1:]  # drop the index column to_csv added
    low_list, high_list, exp_list = [], [], []
    for i in range(data.shape[0]):
        low, high = get_salary(data['Salary'][i])
        low_list.append(low)
        high_list.append(high)
        exp_list.append(get_experience(data['workExperience'][i]))
    # Insert low_salary just before Salary, then rename Salary -> high_salary,
    # so the pair sits together in the output.
    col_name = data.columns.tolist()
    col_name.insert(col_name.index('Salary'), 'low_salary')
    data = data.reindex(columns=col_name)
    data.rename(columns={'Salary': 'high_salary'}, inplace=True)
    data['low_salary'], data['high_salary'] = low_list, high_list
    data['workExperience'] = exp_list  # minimum years of experience
    # Postings with no upper salary bound (high == NaN) cannot be averaged.
    data.dropna(axis=0, subset=['high_salary'], inplace=True)
    data.index = np.arange(data.shape[0])
    data.to_csv('job.csv', encoding='utf-8')
    print('预处理后数据shape:', data.shape)


if __name__ == '__main__':
    # BUG FIX: pd.read_excel() no longer accepts an `encoding` argument (the
    # parameter was removed in pandas 1.2), so the original line raises
    # TypeError on modern pandas -- the likely cause of the "job.csv is
    # empty" reports mentioned in the post's preamble. The read is also
    # moved off module level so importing this file performs no I/O.
    data0 = pd.read_excel('data0.xlsx')
    data_clear(data0)
    data_deal()
三.数据可视化(matplotlib,pyecharts库)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Visualize the cleaned 51job data (job.csv): bar and line charts via
matplotlib, plus a geo map and a word cloud via the pyecharts 0.x API."""
from pyecharts import Geo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyecharts import WordCloud
from mpl_toolkits.mplot3d import Axes3D

plt.rcParams['font.sans-serif'] = 'simhei'   # CJK-capable font for labels
plt.rcParams['axes.unicode_minus'] = False   # render '-' correctly with it


def edu_com(df):
    """Bar charts: company count by education requirement and company type."""
    edu = df['educationBackground'].value_counts()
    fig = plt.figure(figsize=(12, 6))
    fig.add_subplot(1, 2, 1)
    # .tolist() replaces the deprecated positional Series[i] int indexing.
    plt.barh(edu.index.tolist(), edu.tolist(), height=0.2, color='green')
    plt.xlabel('公司数量')
    plt.ylabel('学历要求')
    plt.title('大数据职位对学历要求条形图')
    fig.add_subplot(1, 2, 2)
    com = df['company_type'].value_counts()
    plt.barh(com.index.tolist(), com.tolist(), height=0.2, color='purple')
    plt.xlabel('公司数量')
    plt.ylabel('公司类型')
    plt.title('大数据职位公司类型条形图')
    plt.savefig('大数据职位学历及公司类型条形图.png')
    plt.show()


def exp_edu_sal(df):
    """Line charts: mean low/high salary vs. experience and vs. education."""
    by_exp = df[['workExperience', 'low_salary', 'high_salary']].groupby('workExperience').mean()
    by_edu = df[['educationBackground', 'low_salary', 'high_salary']].groupby('educationBackground').mean()
    fig = plt.figure(figsize=(12, 6))
    fig.add_subplot(1, 2, 1)
    plt.plot(by_exp.index, by_exp['low_salary'], c='r')
    plt.plot(by_exp.index, by_exp['high_salary'], c='b')
    plt.xlabel('工作经验(年)')
    plt.ylabel('薪资(k/月)')
    plt.legend(['最低薪资', '最高薪资'])
    plt.title('大数据职位工作经验对薪资影响折线图')
    fig.add_subplot(1, 2, 2)
    plt.plot(by_edu.index, by_edu['low_salary'], c='r')
    plt.plot(by_edu.index, by_edu['high_salary'], c='b')
    plt.xlabel('学历')
    plt.ylabel('薪资(k/月)')
    plt.legend(['最低薪资', '最高薪资'])
    plt.title('大数据职位学历对薪资影响折线图')
    plt.savefig('大数据职位工作经验及学历对薪资影响折线图.png')
    plt.show()


def job_city_geo(df):
    """Geo map of the 40 cities with the most postings (pyecharts 0.x API)."""
    top = df['job_place'].value_counts()[0:40]
    cities = top.index.tolist()
    counts = top.tolist()
    geo = Geo("大数据职位在全国分布(前40城市排名)地理坐标系图", title_color="#fff",
              title_pos="center", width=1200, height=600,
              background_color='#404a59')
    geo.add("", attr=cities, value=counts, visual_range=[0, 1400],
            visual_text_color="#fff", symbol_size=12,
            is_visualmap=True, is_roam=False)
    geo.render('大数据职位在全国分布(前40城市排名)地理坐标系图.html')


def job_name_wc(df):
    """Word cloud of job titles weighted by frequency."""
    job_name = df['job_name'].value_counts()
    wordcloud = WordCloud('大数据工作名称词云图', width=1300, height=620)
    wordcloud.add("", attr=job_name.index.tolist(), value=job_name.tolist(),
                  word_size_range=np.arange(100))
    wordcloud.render('大数据工作名称词云图.html')


if __name__ == '__main__':
    # BUG FIX: the cleaning step writes job.csv with encoding='utf-8', but
    # this script read it back as gbk -> decode error / mojibake. The read is
    # also moved off module level so importing this file performs no I/O.
    data = pd.read_csv('job.csv', encoding='utf-8')
    edu_com(data)
    exp_edu_sal(data)
    job_city_geo(data)
    job_name_wc(data)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。