赞
踩
urllib模块应用
如何通过python获取网页数据
做转码
准备web页面素材
启动httpd
通过apache的访问日志 发现请求是由python(urllib默认的User-Agent)发起的访问
解决方法:为urllib的请求添加头部信息(User-Agent)
import urllib.request as u

# Wrap the page address in a Request object so that headers can be attached.
request = u.Request("http://192.168.86.11")
# Send a browser User-Agent instead of urllib's default "Python-urllib/x.y",
# so the server's access log no longer identifies the client as Python.
request.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) "
    "Gecko/20100101 Firefox/73.0",
)
# urlopen accepts the Request (address + headers); the context manager
# closes the connection once the body has been read.
with u.urlopen(request) as response:
    html = response.read()
print(html)  # show the fetched page source
验证linux apache的日志
vim /var/log/httpd/access_log 查看信息记录是否还有python信息
下载图片的程序
import urllib.request as u

# Address of the image to download.
request = u.Request(
    "http://192.168.86.11/style/"
    "u24020836931378817798fm170s6BA8218A7B2128178FA0A49F010080E2w.jpg"
)
request.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) "
    "Gecko/20100101 Firefox/73.0",
)
# The context manager closes the connection once the body has been read.
with u.urlopen(request) as response:
    image_data = response.read()  # raw binary content of the image
# print(image_data)  # debugging aid: dump the raw bytes
# Write in binary mode so the image bytes are stored unchanged.
with open("c:\\users\\allen\\desktop\\爬虫.jpg", "wb") as f:
    f.write(image_data)
需要将网页信息获取程序转换为函数模式
- import urllib.request as u
-
- url = "http://192.168.86.11"
-
def get_html(urladdr):
    """Fetch ``urladdr`` and return the raw response body.

    A browser User-Agent header is sent with the request instead of
    urllib's default one.

    Args:
        urladdr: Address to request.

    Returns:
        The complete response body, exactly as read from the response.
    """
    request = u.Request(urladdr)
    request.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) "
        "Gecko/20100101 Firefox/73.0",
    )
    # Close the response as soon as the body has been read.
    with u.urlopen(request) as response:
        return response.read()
-
def get_imglist():
    """Collect the addresses of all images into one list.

    Placeholder: not implemented yet, so it currently returns ``None``.
    """
    pass
-
def get_imgs():
    """Download every image in the image list and save it to disk.

    Placeholder: not implemented yet, so it currently returns ``None``.
    """
    pass
-
# Fetch the page at `url` and print its raw source.
html = get_html(url)
print(html)
单个字符匹配
"." 匹配单个任意字符
- >>> import re
- >>> re.findall(".ood","I say Good not food")
- ['Good', 'food']
- >>> re.findall(".ood","I say Good not food @ood")
- ['Good', 'food', '@ood']
- >>> re.findall(".ood","I say Good not food ood")
- ['Good', 'food', ' ood']
- >>> re.findall(".ood","I say Good not food \nood")
- ['Good', 'food']
- >>>
[] 单个字符逐一匹配
- >>> re.findall("[fn]ood","I say Good not food nood") #ood以f或者n链接 的字符串
- ['food', 'nood']
- >>> re.findall("[^fn]ood","I say Good not food nood")#ood不是以f或者n链接的字符串 取反
- ['Good']
- >>> re.findall("^[Gfn]ood","Good not food nood") #以G f n 开头的和ood链接的字符串匹配
- ['Good']
- >>> re.findall("^[Gfn]ood","I say Good not food nood")
- []
- >>>
\d 匹配单个0-9
- >>> re.findall("\d","How old are you? I am 36")
- ['3', '6']
- >>> re.findall("\d\d","How old are you? I am 36")
- ['36']
- >>>
\w 匹配0-9a-zA-Z_ 该范围内的单个字符(Python 3中对str模式默认还会匹配中文等Unicode文字字符)
- >>> re.findall("\w","How old are you? I am 36")
- ['H', 'o', 'w', 'o', 'l', 'd', 'a', 'r', 'e', 'y', 'o', 'u', 'I', 'a', 'm', '3', '6']
- >>> re.findall("\w\w\w","How old are you? I am 36")
- ['How', 'old', 'are', 'you']
- >>> re.findall("\w\w","How old are you? I_am 36")
- ['Ho', 'ol', 'ar', 'yo', 'I_', 'am', '36']
- >>>
\s 匹配空白字符以及空格
- >>> re.findall("\s","\tHow old are you?\r\n")
- ['\t', ' ', ' ', ' ', '\r', '\n']
- >>>
一组字符匹配
逐字匹配
- >>> re.findall("allen","I am allen")
- ['allen']
- >>> re.findall("allen","I am allenallen")
- ['allen', 'allen']
- >>>
逐字匹配 | 分割不同的字符串
- >>> re.findall("food|nood","I say Good not food nood")
- ['food', 'nood']
- >>> re.findall("not|nood","I say Good not food nood")
- ['not', 'nood']
- >>>
*表示左邻第一个字符 出现0次到无穷次
- >>> re.findall("go*gle","I like google not ggle goooogle and gogle")
- ['google', 'ggle', 'goooogle', 'gogle']
- >>>
+表示左邻第一个字符 出现1次到无穷次
- >>> re.findall("go+gle","I like google not ggle goooogle and gogle")
- ['google', 'goooogle', 'gogle']
- >>>
?表示左邻第一个字符 出现0次或1次
- >>> re.findall("go?gle","I like google not ggle goooogle and gogle")
- ['ggle', 'gogle']
{}指定左邻字符出现的次数
- >>> re.findall("go{2}gle","I like google not ggle goooogle and gogle")
- ['google']
- >>> re.findall("go{1}gle","I like google not ggle goooogle and gogle")
- ['gogle']
- >>> re.findall("go{1,4}gle","I like google not ggle goooogle and gogle")
- ['google', 'goooogle', 'gogle']
- >>>
- import urllib.request as u
- import re
-
- url = "http://192.168.86.11/" #结尾添加左斜杠
-
def get_html(urladdr):
    """Fetch ``urladdr`` and return the raw response body.

    A browser User-Agent header is sent with the request instead of
    urllib's default one. The same function is used for pages and images.

    Args:
        urladdr: Address to request.

    Returns:
        The complete response body, exactly as read from the response.
    """
    request = u.Request(urladdr)
    request.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) "
        "Gecko/20100101 Firefox/73.0",
    )
    # Close the response as soon as the body has been read.
    with u.urlopen(request) as response:
        return response.read()
-
def get_imglist(url, html):
    """Build the list of full image addresses found in the page source.

    Args:
        url: Base address each relative image path is appended to. It is
            concatenated as-is, so it has to end with "/".
        html: Page source as bytes.

    Returns:
        A list of str addresses in the order they occur in ``html``;
        repeated matches are kept.
    """
    # Raw bytes literal so "\w" and "\." reach the regex engine untouched.
    relative_paths = re.findall(rb"style/\w{60}\.jpg", html)
    # Matches are relative and of type bytes: decode each one, then prepend
    # the base address.
    return [url + str(path, encoding="utf8") for path in relative_paths]
-
-
def get_imgs(imglist):
    """Download every image in ``imglist`` and save it in the working directory.

    Files are named after their position in the list: 1.jpg, 2.jpg, ...

    Args:
        imglist: Iterable of image addresses to download.
    """
    # enumerate(start=1) supplies the running number used as the file name.
    for num, imgurl in enumerate(imglist, start=1):
        data = get_html(imgurl)
        # Binary mode: the image bytes are written unchanged.
        with open("%s.jpg" % num, "wb") as f:
            f.write(data)
-
# Fetch the page, extract the image addresses, then download them all.
html = get_html(url)
# print(html)  # debugging aid: dump the raw page source
imglist = get_imglist(url, html)
# print(len(imglist))  # debugging aid: number of images found
get_imgs(imglist)
- import urllib.request as u
- import re
-
- #url = "http://www.buka.cn/view/223172/65537.html"
- #url = "http://www.buka.cn/view/223578/65537.html"
- #url = "http://www.buka.cn/view/221784/65540.html"
- url = "http://www.buka.cn/view/219792/65742.html"
-
def get_html(urladdr):
    """Fetch ``urladdr`` and return the raw response body.

    A browser User-Agent header is sent with the request instead of
    urllib's default one. The same function is used for pages and images.

    Args:
        urladdr: Address to request.

    Returns:
        The complete response body, exactly as read from the response.
    """
    request = u.Request(urladdr)
    request.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) "
        "Gecko/20100101 Firefox/73.0",
    )
    # Close the response as soon as the body has been read.
    with u.urlopen(request) as response:
        return response.read()
-
def get_imglist(url, html):
    """Build the list of image addresses found in the page source.

    Args:
        url: Not used here, because the matched addresses are already
            absolute; the parameter is kept so existing calls keep working.
        html: Page source as bytes.

    Returns:
        A list of str addresses in the order they occur in ``html``;
        repeated matches are kept.
    """
    # Raw bytes literal, with every literal dot escaped: an unescaped "."
    # matches any character, so "..._jpg" would wrongly count as an image.
    found = re.findall(
        rb"http://i-cdn\.ibuka\.cn/pics/\d+/\d+/\w+\.jpg", html
    )
    # The matches are bytes; convert each one to str.
    return [str(address, encoding="utf8") for address in found]
-
-
def get_imgs(imglist):
    """Download every image in ``imglist`` and save it in the working directory.

    Files are named after their position in the list: 1.jpg, 2.jpg, ...

    Args:
        imglist: Iterable of image addresses to download.
    """
    # enumerate(start=1) supplies the running number used as the file name.
    for num, imgurl in enumerate(imglist, start=1):
        data = get_html(imgurl)
        # Binary mode: the image bytes are written unchanged.
        with open("%s.jpg" % num, "wb") as f:
            f.write(data)
-
# Fetch the page, extract the image addresses, then download them all.
html = get_html(url)
# print(html)  # debugging aid: dump the raw page source
imglist = get_imglist(url, html)
# print(imglist)  # debugging aid: show the extracted addresses
get_imgs(imglist)
正则匹配中特殊符号应用
^表示以什么开头 $表示以什么结尾
- >>> re.findall('^I say',"I say Good not food")
- ['I say']
- >>> re.findall('not food$',"I say Good not food")
- ['not food']
- >>> re.findall('not Good$',"I say Good not food")
- []
- >>>
\b 指定单词边界 _属于单词字符(不算特殊符号),因此allen_123中的allen后面不构成边界;另外普通字符串中的"\b"会被当作退格符,需要写成"\\b"或使用原始字符串r"\b"
- >>> re.findall("allen","allen.com allen_123 allen.com")
- ['allen', 'allen', 'allen']
- >>> re.findall("\ballen\b","allen.com allen_123 allen.com")
- []
- >>> re.findall("\\ballen\\b","allen.com allen_123 allen.com")
- ['allen', 'allen']
- >>>
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。