赞
踩
明确目的,例如,爬取直播网站“某猫”英雄联盟版块主播名字和人气。找到数据对应的网页,分析网页的结构找到数据所在标签位置。
模拟HTTP请求,向服务器发送请求,获取到服务器返回给我们的HTML,用正则表达式提取需要的数据。
完整代码如下,50行代码搞定python原生爬虫。
import re  # regular expressions, used to pull fields out of the HTML
from urllib import request  # used to download the HTML page


class Spider():
    """Scrape streamer names and popularity from a live-stream category page.

    The page is downloaded with ``urllib.request`` and parsed with regular
    expressions. ``go`` is the only public entry point; it fetches, parses,
    cleans, ranks and prints the streamers.
    """

    url = 'https://www.panda.tv/cate/lol'
    # The capture group keeps only the content between the anchoring tags.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'
    # First integer or decimal number inside a popularity string ("1.5万").
    count_pattern = r'\d+(?:\.\d+)?'

    def __fetch_content(self):
        """Download the category page and return it as a UTF-8 decoded str."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()  # bytes
        htmls = raw.decode('utf-8')
        print(htmls)
        return htmls

    def __analysis(self, htmls):
        """Extract the raw name/number matches of every streamer card.

        Returns a list of dicts ``{'name': [...], 'number': [...]}`` whose
        values are the unprocessed ``re.findall`` results for one card.
        """
        anchors = []
        for html in re.findall(Spider.root_pattern, htmls):
            anchors.append({
                'name': re.findall(Spider.name_pattern, html),
                'number': re.findall(Spider.number_pattern, html),
            })
        print(anchors)
        return anchors

    def __refine(self, anchors):
        """Reduce the raw matches to one clean name and number per streamer.

        The first match of each field is kept and the name is stripped of
        surrounding whitespace. Cards in which either field was not found
        are skipped instead of raising ``IndexError``.
        """
        targets = []
        for raw in anchors:
            if not raw['name'] or not raw['number']:
                continue
            targets.append({
                'name': raw['name'][0].strip(),
                'number': raw['number'][0],
            })
        print(targets)
        return targets

    def __sort(self, anchors):
        """Return the streamers ordered by popularity, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Return the numeric popularity of ``anchor`` for use as a sort key.

        The first integer or decimal number in ``anchor['number']`` is read
        and multiplied by 10000 when the text contains the unit "万".
        Returns 0.0 when the text contains no number at all.
        """
        found = re.search(Spider.count_pattern, anchor['number'])
        if found is None:
            return 0.0
        number = float(found.group())
        if '万' in anchor['number']:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N:name number`` line per streamer."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank {}:{} {}'.format(rank, anchor['name'], anchor['number']))

    def go(self):
        """Run the whole pipeline: fetch, parse, clean, rank and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)


if __name__ == '__main__':
    spider = Spider()
    spider.go()
以下是通过 print() 输出到控制台的数据:
data-pdt-ele="1"> 英雄联盟 </a> </div> </div> </li> <li class="video-list-item video-no-tag video-no-cate " data-pdt-block="sd1-109" data-id="26657"> <a target="_blank" href="26657" class="video-list-item-wrap" data-pdt-ele="0" data-id="26657" > <div class="video-cover "> <img class="video-img video-img-lazy" data-original="https://i.h2.pdim.gs/90/e9c7aaa62412bb248c6829b04c56a3c7/w338/h190.jpg" alt="【吸血鬼教学各种细节】"> <div class="video-overlay"></div> <div class="video-play"></div> <div class="lottery-icon-list"> </div> </div> <div class="video-info"> <span class="video-title" title="【吸血鬼教学各种细节】">【吸血鬼教学各种细节】</span> <span class="video-nickname" title="有毒i吸血鬼"> <i class="icon-hostlevel icon-hostlevel-11" data-level="11"></i> 有毒i吸血鬼 </span> <span class="video-number">851</span> <span class="video-station-info"> <i class="video-station-num">18人</i> </span> </div> </a> <div class="video-label"> <div class="video-label-content"> <a class="video-label-item label-color-0" href="/cate/lol"data-pdt-ele="1"> 英雄联盟 </a> </div> </div> </li> <li class="video-list-item video-no-tag video-no-cate " data-pdt-block="sd1-110" data-id="2276191"> <a target="_blank" href="2276191" class="video-list-item-wrap" data-pdt-ele="0" data-id="2276191" > <div class="video-cover "> <img class="video-img video-img-lazy" data-original="https://i.h2.pdim.gs/90/6f46def671c207a8e4750a3c2ad2d092/w338/h190.jpg" alt="求订阅,artifact还可以"> <div class="video-overlay"></div> <div class="video-play"></div> <div class="lottery-icon-list"> </div> </div> <div class="video-info"> <span class="video-title" title="求订阅,artifact还可以">求订阅,artifact还可以</span> <span class="video-nickname" title="高调的火星人"> <i class="icon-hostlevel icon-hostlevel-0" data-level="0"></i> 高调的火星人 </span> <span class="video-number">846</span> <span class="video-station-info"> <i class="video-station-num">0人</i> </span> </div> </a> <div class="video-label"> <div class="video-label-content"> <a class="video-label-item label-color-0" 
href="/cate/lol"data-pdt-ele="1"> 英雄联盟 </a> </div> </div> </li> <li class="video-list-item video-no-tag video-no-cate " data-pdt-block="sd1-111" data-id="1193989"> <a target="_blank" href="1193989" class="video-list-item-wrap" data-pdt-ele="0" data-id="1193989" > <div class="video-cover "> <img class="video-img video-img-lazy" data-original="https://i.h2.pdim.gs/90/fa49782338c10a678915ecfb07891119/w338/h190.jpg" alt="刀妹专场这个中单刀妹最无敌不接受反驳"> <div class="video-overlay"></div> <div class="video-play"></div> <div class="lottery-icon-list"> </div> </div> <div class="video-info"> <span class="video-title" title="刀妹专场这个中单刀妹最无敌不接受反驳">刀妹专场这个中单刀妹最无敌不接受反驳</span> <span class="video-nickname" title="爱唱歌的小南丶"> <i class="icon-hostlevel icon-hostlevel-1" data-level="1"></i> 爱唱歌的小南丶 </span> <span class="video-number">825</span> <span class="video-station-info"> <i class="video-station-num">3人</i> </span> </div> </a> <div class="video-label"> <div class="video-label-content"> <a class="video-label-item label-color-0" href="/cate/lol"data-pdt-ele="1"> 英雄联盟 </a> </div> </div> </li> <li class="video-list-item video-no-tag video-no-cate " data-pdt-block="sd1-112" data-id="796585"> <a target="_blank" href="796585" class="video-list-item-wrap" data-pdt-ele="0" data-id="796585" > <div class="video-cover "> <img class="video-img video-img-lazy" data-original="https://i.h2.pdim.gs/90/3ca5b2f3cbeb62b9a443fa73e4186d44/w338/h190.jpg" alt="青铜皇帝在线锤号!一礼炮=锤号➕房管"> <div class="video-overlay"></div> <div class="video-play"></div> <div class="lottery-icon-list"> </div> </div> <div class="video-info"> <span class="video-title" title="青铜皇帝在线锤号!一礼炮=锤号➕房管">青铜皇帝在线锤号!一礼炮=锤号➕房管</span> <span class="video-nickname" title="熊猫尼古拉斯胖虎"> <i class="icon-hostlevel icon-hostlevel-7" data-level="7"></i> 熊猫尼古拉斯胖虎 </span> <span class="video-number">820</span> <span class="video-station-info"> <i class="video-station-num">12人</i> </span> </div> </a> <div class="video-label"> <div class="video-label-content"> <a 
class="video-label-item label-color-0" href="/cate/lol"data-pdt-ele="1"> 英雄联盟 </a> </div> </div> </li
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。