赞
踩
又是因为作业……参考视频课程:《Python网络爬虫与信息提取》
1.爬取网页代码框架(requests库)
网站可以在其根目录下放置 robots.txt 文件来声明爬取规则,例如:https://www.jd.com/robots.txt
import requests
def getHTMLText(url):
    """Fetch *url* over HTTP and return the response body as text.

    The request times out after 30 seconds. The text is decoded with the
    encoding that requests detects from the body (``apparent_encoding``)
    rather than the one announced in the headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the literal string "产生异常" when the
        request fails (connection error, timeout, invalid URL, or a
        4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # raise_for_status() raises HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # Catch only request failures; a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide programming errors.
    except requests.RequestException:
        return "产生异常"
if __name__ == "__main__":
    # Demo run: download one page and dump its text to stdout.
    url = "http://www.baidu.com"
    html = getHTMLText(url)
    print(html)
    # To present the request as coming from a browser, send a
    # User-Agent header along with it:
    #   kv = {'user-agent': 'Mozilla/5.0'}
    #   r = requests.get(url, headers=kv)

2.关键字查询
import requests
# Submit a keyword query to Baidu and report the final request URL and the
# length of the returned page.
keyword = "Python"
try:
    # requests URL-encodes `params` into the query string (?wd=<keyword>).
    kv = {'wd': keyword}
    # A timeout keeps the script from blocking forever on a stalled
    # connection.
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=30)
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
# Only request failures mean "crawl failed"; a bare ``except`` would also
# swallow KeyboardInterrupt and programming errors.
except requests.RequestException:
    print("爬取失败")
3.爬取图片代码框架(存在D盘pics目录中)
import requests
import os
# Download one image and store it under `root`, keeping the file name it
# has in the URL. Nothing is downloaded if the file already exists.
url = "https://ss0.baidu.com/-Po3dSag_xI4khGko9WTAnF6hhy/zhidao/pic/item/80cb39dbb6fd5266db76c6d7a918972bd5073697.jpg"
root = "D://pics//"
# The last path segment of the URL is used as the local file name.
path = root + url.split('/')[-1]
try:
    # Create the target folder if it is missing; exist_ok avoids the
    # check-then-create race of exists() + mkdir().
    os.makedirs(root, exist_ok=True)
    # Do not overwrite a file with the same name.
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        # Fail on 4xx/5xx instead of saving an error page as the image.
        r.raise_for_status()
        # The with-block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
# Network failures and file-system failures are the expected error cases;
# anything else (e.g. KeyboardInterrupt) is allowed to propagate.
except (requests.RequestException, OSError):
    print("爬取失败")

4.bs4
from bs4 import BeautifulSoup
# Parse the HTML held in `demo` with the built-in "html.parser" backend and
# print the re-indented markup.
# NOTE(review): `demo` is not defined in this snippet; presumably it is page
# text fetched earlier (e.g. r.text) -- confirm before running.
soup = BeautifulSoup(markup=demo, features="html.parser")
print(soup.prettify())
5.大学排名实例
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download *url* and return its HTML as text.

    The request times out after 30 seconds. The text is decoded with the
    encoding that requests detects from the body (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # Catch only request failures; a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide programming errors.
    except requests.RequestException:
        return ""
def fillUnivList(ulist, html):
    """Extract table rows from *html* and append them to *ulist*.

    For every ``<tr>`` tag directly under the first ``<tbody>``, the
    ``.string`` values of its first three ``<td>`` cells are appended to
    *ulist* as a three-item list.

    Args:
        ulist: List that receives one ``[cell0, cell1, cell2]`` entry per
            row; it is modified in place.
        html: Page source to parse. An empty string (what ``getHTMLText``
            returns on failure) leaves *ulist* untouched.

    Returns:
        None.
    """
    # Nothing to parse when the download failed.
    if not html:
        return
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    # find() returns None when the page has no <tbody>.
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields the text nodes between rows; only real
        # tags can hold cells.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for tag.find_all(...).
        tds = tr('td')
        # Skip rows that do not have the three expected cells.
        if len(tds) < 3:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row is printed as three centred, tab-separated columns taken from
    the row's first three items.

    Args:
        ulist: Sequence of rows; each row provides at least three
            printable items.
        num: Maximum number of rows to print. Fewer are printed when
            *ulist* is shorter; zero or a negative value prints only the
            header.
    """
    print("{:^10}\t{:^6}\t{:^10}".format("排名","学校名称","总分"))
    # Slicing instead of indexing range(num) avoids an IndexError when
    # fewer than `num` rows were collected; max() keeps a negative `num`
    # from being read as "all but the last rows".
    for u in ulist[:max(num, 0)]:
        print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))
def main():
    """Download the 2016 ranking page and print its first 20 table rows."""
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)


# Run only when executed as a script, so that importing this code does not
# trigger a network request.
if __name__ == "__main__":
    main()

实例优化(用中文全角空格 chr(12288) 代替英文空格作为填充字符,使中文列对齐)
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download *url* and return its HTML as text.

    The request times out after 30 seconds. The text is decoded with the
    encoding that requests detects from the body (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # Catch only request failures; a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide programming errors.
    except requests.RequestException:
        return ""
def fillUnivList(ulist, html):
    """Extract table rows from *html* and append them to *ulist*.

    For every ``<tr>`` tag directly under the first ``<tbody>``, the
    ``.string`` values of its first three ``<td>`` cells are appended to
    *ulist* as a three-item list.

    Args:
        ulist: List that receives one ``[cell0, cell1, cell2]`` entry per
            row; it is modified in place.
        html: Page source to parse. An empty string (what ``getHTMLText``
            returns on failure) leaves *ulist* untouched.

    Returns:
        None.
    """
    # Nothing to parse when the download failed.
    if not html:
        return
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    # find() returns None when the page has no <tbody>.
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields the text nodes between rows; only real
        # tags can hold cells.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for tag.find_all(...).
        tds = tr('td')
        # Skip rows that do not have the three expected cells.
        if len(tds) < 3:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row is printed as three centred, tab-separated columns taken from
    the row's first three items. The middle column is padded with the
    full-width space ``chr(12288)`` instead of the ASCII space.

    Args:
        ulist: Sequence of rows; each row provides at least three
            printable items.
        num: Maximum number of rows to print. Fewer are printed when
            *ulist* is shorter; zero or a negative value prints only the
            header.
    """
    # Field 3 of the template supplies the fill character for column 1.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # chr(12288) is U+3000, the full-width (CJK) space.
    pad = chr(12288)
    print(tplt.format("排名","学校名称","总分", pad))
    # Slicing instead of indexing range(num) avoids an IndexError when
    # fewer than `num` rows were collected; max() keeps a negative `num`
    # from being read as "all but the last rows".
    for u in ulist[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], pad))
def main():
    """Download the 2016 ranking page and print its first 20 table rows."""
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)


# Run only when executed as a script, so that importing this code does not
# trigger a network request.
if __name__ == "__main__":
    main()

6.re 正则表达式
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。