赞
踩
1,使用的模块:
- import codecs
- import csv
- import requests
- import re
- import json
- import pprint
2,主要爬取内容:
- '职位名称',
- '基本信息',
- '公司名字',
- '工作地点',
- '公司类型',
- '公司规模',
- '公司性质',
- '福利',
- '工资',
- '信息发布时间',
- '职位详情页',
3,不固定url资源路径,通过if和elif对城市进行判断然后选择url,可以选择不同的城市。
4,最终实现代码
import codecs
import csv
import json
import pprint
import re

import requests
# Scrape one page of 51job search results for a user-chosen city and append
# the job postings to a CSV file.

# Output file; opened in append mode so repeated runs accumulate rows.
CSV_PATH = '前程无忧.csv'

# CSV column names, in output order. Every row dict uses exactly these keys.
FIELDNAMES = [
    '职位名称',
    '基本信息',
    '公司名字',
    '工作地点',
    '公司类型',
    '公司规模',
    '公司性质',
    '福利',
    '工资',
    '信息发布时间',
    '职位详情页',
]

# City name typed by the user -> area code used in the search URL path.
CITY_CODES = {
    '成都': '090200',
    '北京': '010000',
    '上海': '020000',
    '广州': '030200',
    '深圳': '040000',
    '武汉': '180200',
    '西安': '200200',
    '杭州': '080200',
    '南京': '070200',
    '重庆': '060000',
    '东莞': '030800',
    '大连': '230300',
    '沈阳': '230200',
    '苏州': '070300',
}

# Search URL; only the city code varies between cities.
# NOTE: the query parameter is spelled "&degreefrom". The listing this script
# came from showed "°reefrom" instead (the "&deg" prefix had been turned into
# a degree sign), which corrupts the query string.
SEARCH_URL_TEMPLATE = (
    'https://search.51job.com/list/{city_code},000000,0000,00,9,99,'
    '%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25B7%25A5%25E7%25A8%258B'
    ',2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# Browser-like User-Agent sent with the request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# The result page embeds its data as a JSON object assigned to
# window.__SEARCH_RESULT__ inside a <script> tag.
SEARCH_RESULT_RE = re.compile(
    r'window\.__SEARCH_RESULT__\s*=\s*(.*?)</script>',
    re.DOTALL,
)


def build_search_url(city):
    """Return the search URL for *city*.

    Args:
        city: One of the city names listed in ``CITY_CODES``.

    Raises:
        ValueError: If *city* is not a supported city.
    """
    try:
        city_code = CITY_CODES[city]
    except KeyError:
        supported = ', '.join(CITY_CODES)
        raise ValueError(
            f'Unsupported city {city!r}; supported cities: {supported}'
        ) from None
    return SEARCH_URL_TEMPLATE.format(city_code=city_code)


def extract_jobs(html):
    """Return the list of job dicts embedded in a search result page.

    Args:
        html: Text of the search result page.

    Raises:
        ValueError: If the page contains no ``window.__SEARCH_RESULT__`` block.
        KeyError: If the embedded JSON has no ``engine_jds`` entry.
    """
    match = SEARCH_RESULT_RE.search(html)
    if match is None:
        raise ValueError('window.__SEARCH_RESULT__ not found in the page')
    return json.loads(match.group(1))['engine_jds']


def job_to_row(job):
    """Map one job dict from the page onto the CSV columns.

    Keys missing from *job* produce an empty cell rather than aborting the
    whole run.
    """
    return {
        '职位名称': job.get('job_name', ''),
        # attribute_text is a list of short strings; flatten it to one cell.
        '基本信息': ' '.join(job.get('attribute_text') or []),
        '公司名字': job.get('company_name', ''),
        '工作地点': job.get('workarea_text', ''),
        '公司类型': job.get('companyind_text', ''),
        '公司规模': job.get('companysize_text', ''),
        '公司性质': job.get('companytype_text', ''),
        '福利': job.get('jobwelf', ''),
        '工资': job.get('providesalary_text', ''),
        '信息发布时间': job.get('updatedate', ''),
        '职位详情页': job.get('job_href', ''),
    }


def main():
    """Prompt for a city, fetch its search results and append them to the CSV."""
    print("输入你的城市:")
    city = input().strip()
    try:
        url = build_search_url(city)
    except ValueError as err:
        # Exit with a readable message instead of a NameError on `url`.
        raise SystemExit(str(err))

    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    print(response.text)

    jobs = extract_jobs(response.text)
    pprint.pprint(jobs)

    # newline='' lets the csv module control line endings; utf-8-sig adds a
    # BOM to a new file.
    with open(CSV_PATH, mode='a', encoding='utf-8-sig', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
        # In append mode the position starts at the end of the file, so 0
        # means the file is empty and still needs its header row.
        if csv_file.tell() == 0:
            writer.writeheader()
        writer.writerows(job_to_row(job) for job in jobs)


if __name__ == '__main__':
    main()
5,结果:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。