赞
踩
爬虫简单流程 1,分析网页找出目标url 2,python用urllib2下载url内容 3,BeautifulSoup bs4解析网页 4,从解析出来的数据中选择爬取目标 5,将目标写入本地。
注意事项 1,添加header 主要为cookie和网站代理 防止封IP 2,python编码问题 下载解析过程中不改变编码方式 等最后写入本地时更改编码方式。
# -*- coding: utf-8 -*-
# Scraper 1: collect notice titles from www.snnu.edu.cn (Python 2).
import urllib2                  # downloads the pages
from bs4 import BeautifulSoup   # parses the HTML
import json                     # serializes the scraped data
import sys

# Python 2's default codec is ascii; switch it to utf-8 so the Chinese
# titles survive the final json-dump / file-write step.
reload(sys)
sys.setdefaultencoding('utf-8')
resultData = []//用来存放每次抓取的信息
//http://www.snnu.edu.cn/tzgg/217.htm
//urllib2 抓取网页
url = ‘http://www.snnu.edu.cn/tzgg/’
count =1
for i in range(1, 219):
num = 219 - i
url_com = url + str(num) + “.htm”//循环组成所有目标url
response = urllib2.urlopen(url_com)
print response
//charset = chardet.detect(url)
//print charset //获取目标网页的编码方式
//print response.getcode() #检测是否抓取成功输出200
//cont = response.read().decode(‘utf-8’) 读出获得的网页
// BeautifulSoup 解析网页
soup = BeautifulSoup(response, ‘html.parser’)
link = soup.find_all(class_=“con_newslist”)[0] #寻找目标特征
links = link.find_all(“li”)
for li in links:
data = {
“title” : li.find(“a”).text
}
print count
count = count + 1
resultData.append(data)//将每次循环获取的数据写入数组
with open(‘result.json’, ‘wb’) as f://将数组写入相同目录resylt.json中 wb表示可读取 可写入
f.write(json.dumps(resultData).decode(“unicode-escape”))、、写入的这行代码跟开头sys配套更改最后写入编码
# encoding: utf-8  — second script: douban Top-250 scraper
# https://movie.douban.com/top250?start=25&filter=
# Scraper 2: the douban.com Top-250 movie list (Python 2).
import urllib2   # downloads the pages
import sys
import json      # serializes the scraped data
from bs4 import BeautifulSoup  # parses the HTML

# Force utf-8 as the interpreter default encoding (Python 2 only) so the
# Chinese text can be dumped and written without codec errors.
reload(sys)
sys.setdefaultencoding('utf-8')

url = "https://movie.douban.com/top250?start="
heads = {
}                # placeholder for request headers, unused below
num = 0
resultData = []  # one dict per movie
count = 1
for i in range(0, 10):
//分析所有的url
num = i * 25
url_com = url + str(num) + “&filter=”
//添加header的表达方式
// request = urllib2.Request(url_comg, headers=header)
// response = urllib2.urlopen(request)
// 下载url
response = urllib2.urlopen(url_com)
// 分析url
soup = BeautifulSoup(response, ‘html.parser’)
// 查找目标元素
link = soup.find_all(class_=“grid_view”)[0]
links = link.find_all(“li”)
for li in links:
print count
try:
data = {
“movieName”: li.find(“img”).attrs[“alt”],// arrtrs表示属性 alt=XXXX
“movieImg” : li.find(“img”).attrs[“src”],
“info” : li.find(“div”,class_=“bd”).find(‘p’).text.encode(‘utf8’).strip(),
```//encode转化格式字符串在Python内部的表示是unicode编码,因此,在做编码转换时,通常需要以unicode作为中间编码,即先将其他编码的字符串解码(decode)成unicode,再从unicode编码(encode)成另一种编码,strip()函数的作用是info中有空白格,删除其中的空白格 ` #输出文本转化编码格式否则存储到元祖中为空
“score” : li.find(class_=“star”).find(class_=“rating_num”).text,
“comment” : li.find(class_=“inq”).text
}
json.dumps(data, encoding=“UTF-8”, ensure_ascii=False) //将data数据转化为json
resultData.append(data)//添加到数组
count = count + 1
except AttributeError:
# try except 是因为有的comment为空,会因为其中一项数据没有定义所以会报错,所以对为空的数据赋值为空格
data[“comment”] = " "
//json.dumps(data, encoding=“UTF-8”, ensure_ascii=False)
// 将每次找到的data元祖数据放入数组中
resultData.append(data)
count =count + 1
with open(‘douban.json’, ‘wb’) as f:
//json.dump 将python数据转化为字符串 f.write(resultData)resultData为元祖不能写入 只有将其转化为字符串才能写入,40行提前虽然数据转化成功但是仍为元祖无法写入
f.write(json.dumps(resultData, indent=4).decode(“unicode-escape”))
indent如如元祖每一项前面有4个空格
print “that’s all”//结束语
## 查找网页编码的方法
import chardet
import urllib2
url = "http://index.baidu.com/v2/main/index.html#/trend/%E8%A5%BF%E5%AE%89?words=%E8%A5%BF%E5%AE%89"
response = urllib2.urlopen(url)
html = response.read()
print chardet.detect(html)
获取2017-2018西安关键词的百度指数。逐天查询每一天的指数来完成;遇到每个月不到31天的时候,因为没有返回信息,系统会自动跳过,不会报错。用python调用函数来完成。
# -*- coding: utf-8 -*-
# Scraper 3: daily Baidu-Index values for a keyword (Python 2).
# Example API call:
# http://index.baidu.com/api/SearchApi/index?area=0&word=西安&startDate=2019-05-01&endDate=2019-05-01
import sys
import urllib2
import json

data_f = []  # accumulates one record dict per (keyword, day)

# Force utf-8 as the default encoding (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')

# Cookie and User-Agent copied from the browser dev-tools console; the API
# rejects requests that carry neither.
header = {
    "cookie": "BAIDUID=D990C407249FE8E70BB0E1BCC69067A3:FG=1; BIDUPSID=D990C407249FE8E70BB0E1BCC69067A3; PSTM=1547021385; bdindexid=j946ivqfo9rq50k3mfsa5hcsb5; BDSFRCVID=508sJeCCxG3JggR9c11XMVqhqAFOeQZRddMu3J; H_BDCLCKID_SF=tR30WJbHMTrDHJTg5DTjhPrMhh-jbMT-027OKKOF5b3CbJOxXfrY24_d-H7lW-QIyHrb0p6athF0hDvYh4Oq2KCV-frb-C62aKDsob7I-hcqEpO9QTbkbP-wefbjqPTDWC5f_CnIBb5GoxogbMchDUThDHR02t3-MPoa3RTeb6rjDnCrjj0WKUI8LPbO05JZ5KvNVR8htxnYS4jdKfQPXPLuWR5bKUrtt2LE3-oJqCD5bD-63J; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1557584073,1557625526,1557626135,1557636493; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1465_21119_18560_28519_28774_28722_28964_28831_28585; delPer=0; PSINO=2; BDUSS=VsbWYyUHpDeTZFTUxMbjNiZ0pTM2VqWmhqR3c0d0xQd3JhZTEyd1FzRVFhfjljRVFBQUFBJCQAAAAAAAAAAAEAAABgjFgvemp6MTQyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABDe11wQ3tdcWk; CHKFORREG=25a112f22b13124156fa13139499188d; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1557651091",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
def getdata(key_1, url_comg):
    """Fetch one day's Baidu-Index JSON and append the parsed record to data_f.

    key_1    -- the keyword this URL queries
    url_comg -- fully built API URL for a single day
    """
    request = urllib2.Request(url_comg, headers=header)  # cookie + UA required
    response = urllib2.urlopen(request)
    data = json.load(response)
    # Non-existent dates (e.g. Feb 30) answer "bad request": just skip them,
    # which is why the caller can blindly iterate days 1..32.
    if str(data["message"]) == "bad request":
        return
    data = {
        "key": key_1,
        "startData": str(data["data"]["userIndexes"][0]["pc"]["startDate"]),
        "endData": str(data["data"]["userIndexes"][0]["pc"]["endDate"]),
        "mavg": str(data["data"]["generalRatio"][0]["all"]["avg"]),
        "avg": str(data["data"]["generalRatio"][0]["wise"]["avg"])
    }
    data_f.append(data)
    savedata(data_f)  # rewrite the output file after every record
def savedata(data_fina):
    """Dump the accumulated records to baidu_1.json, pretty-printed.

    The unicode-escape decode keeps Chinese text human-readable in the file
    (relies on the utf-8 default encoding set at the top of the script).
    """
    with open("baidu_1.json", "wb") as f:
        f.write(json.dumps(data_fina, indent=4).decode("unicode-escape"))
def getkeyword():
    """Return the list of keywords whose Baidu Index should be crawled."""
    keylist = ["西安"]
    return keylist
def spider(url1, sy, ey):
    """Build one API URL per keyword/day and hand each to getdata.

    url1 -- API prefix ending in 'word='
    sy   -- first year (inclusive)
    ey   -- last year (exclusive, per range() semantics)
    """
    key = getkeyword()
    for k in key:
        for i in range(sy, ey):
            for j in range(12, 13):      # months; only December here
                for f in range(1, 33):   # days 1..32; bad dates skipped by getdata
                    url_com = (url1 + k
                               + "&startDate=" + str(i) + "-" + str(j) + "-" + str(f)
                               + "&endDate=" + str(i) + "-" + str(j) + "-" + str(f))
                    # BUG FIX: pass the single keyword k, not the whole
                    # keyword list `key` as the original did.
                    getdata(k, url_com)
if name ==“main”: //主文件
url = “http://index.baidu.com/api/SearchApi/index?area=0&word=”
startyear = 2017
endyear = 2018
spider(url, startyear, endyear)
print json.dumps(data_f).decode(“unicode-escape”)
代码没有完善 只能搜索出标题专门的图片,如果使用户的图片的话没有办法爬出来,会出现越界和返回数据类型错误的问题 用调用函数完成
# -*- coding: utf-8 -*-
# Scraper 4: follow the weibo hot-search top list and download each topic's
# header image (Python 2).  Known limitation: only the official topic image
# is found; user-posted images are not reachable from this card.
import urllib
import urllib2
from bs4 import BeautifulSoup

# Cookie and User-Agent from browser dev-tools; s.weibo.com requires a
# logged-in cookie to serve the pages.
header = {
    "Cookie": "SINAGLOBAL=9411374167025.424.1552041554396; UOR=,v.ifeng.com; login_sid_t=6bc26fe217fbe09d05225f5654146e49; cross_origin_proto=SSL; _s_tentry=-; Apache=2129606814883.5312.1557713131893; ULV=1557713131897:3:2:1:2129606814883.5312.1557713131893:1557536542243; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWahfmB37nahga6ao9IhFid5JpX5o275NHD95QcSK24ShM4e0-0Ws4DqcjJi–Ri-zXi-iWCs-LxK-LB–LBoqLxKqL1-eL1h.LxK.L1K2LB-zt; SSOLoginState=1557713230; SUB=_2A25x3KECDeRhGeBL6lsV9yfPzj-IHXVSq5XKrDV8PUNbmtAKLWHakW9NRyhOGFlGQ_u-eepwRtim_EAJSV9wzzbY; SUHB=0Ela5I-B2gp8kP; ALF=1589249232; wvr=6; webim_unReadCount=%7B%22time%22%3A1557713248360%2C%22dm_pub_total%22%3A0%2C%22chat_group_pc%22%3A0%2C%22allcountNum%22%3A2%2C%22msgbox%22%3A0%7D",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
# NOTE(review): the cookie string was copied from a scraped page and may
# contain mangled characters (en-dashes) — verify against a fresh browser copy.

datas = []  # collected image URLs
pics = []   # alternative accumulator (unused by the current code path)
def loadpic(pic): //下载图片
for i in range(0,len(pic)):
pic1 = pic[i]
home = “C:\Users\snnucs\Desktop\pic\” + str(i) + “.jpg”
try:
urllib.urlretrieve(pic[i], home)
except Exception:
print “meiyoufuhegeshi”
def get_data(urls):
print len(urls)
for i in range(0,len(urls))?/用添加header的方式下载解析url
request = urllib2.Request(“https://s.weibo.com/”+ urls[i], headers=header)
response = urllib2.urlopen(request)
soup = BeautifulSoup(response, ‘html.parser’)
try:
pic = soup.find_all(class_=“wbs-feed”)[0].find_all(class_=“m-main”)[0].find(class_=“card-topic-a”).find(“img”)[“src”] //这里会出现问题 越界或者返回数据类型错误
except AttributeError:
print “meizhaodao”
except IndexError:
print “yuejie”
datas.append(pic)
return datas//返图偏
# data = {
# “titile” : soup.find_all(class_=“wbs-feed”)[0].find_all(class_=“m-main”)[0].find(class_=“card-topic-a”).find(class_=“info”).find(class_=“title”).find(“a”).text,
# “pic” : soup.find_all(class_=“wbs-feed”)[0].find_all(class_=“m-main”)[0].find(class_=“card-topic-a”).find(“img”)[“src”]
#
# }
# datas.append(data)
# return datas
# pics.append(pic)
# return pics //注释的代码 爬取标题和照片但是有格式问题 没有完成待修改
def get_url(url_g)?/两次下载解析的目的是热搜51条里 要先下载一次获取 所有51条链接 之后再下载对应的
request = urllib2.Request(url_g,headers=header)//51链接的标题和图片
response = urllib2.urlopen(request)
soup = BeautifulSoup(response,‘html.parser’)
urls = []
url_f = soup.find_all(class_=“wbs-hotrank”)[0].find_all(class_=“m-main”)[0].find_all(class_=“m-wrap”)[0].find_all(class_=“data”)[0].find_all(“a”)
print len(url_f)
for i in range(0,len(url_f)):
urls.append(url_f[i][‘href’])
return urls
if name ==“main”:
url = “https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6”
loadpic(get_data(get_url(url)))
print “ok”
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。