import requests
from bs4 import BeautifulSoup as BS
from fontTools.ttLib import TTFont  # 前提是先安装fontTools：pip install fontTools
import re
 
'''
起点中文网的数字数据为动态字体加密，通过观察每次刷新页面字体文件都会改变
这是一种通过字体反爬虫的手段，猫眼、大众点评也使用类似的方法阻止爬虫获取数字类信息
比如：点评数量、点评分数、电影票房等数据
解码方法是将字体文件下载到本地，通过第三方字体处理库fontTools打开，生成xml文件获取映射
将原始网页的由'&#'开头的编码根据映射关系替换即可获得具体数字
参考：https://www.bilibili.com/video/av967854064/
https://blog.csdn.net/jianmoumou233/article/details/81267055
https://www.cnblogs.com/chenlove/p/14858742.html
注：可将功能封装为函数
'''
 
# Browser-like User-Agent sent with the ranking-page request.
hds = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}

# Fetch the ranking page. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() fails fast instead of trying to
# parse an HTTP error page.
resp = requests.get('https://www.qidian.com/rank/yuepiao/year2022-month01/', headers=hds, timeout=10)
resp.raise_for_status()
ct = resp.content.decode('utf-8')
bs = BS(ct, 'lxml')

# <style> tags nested inside the vote-count spans; the font URLs are extracted
# from the first one.
c = bs.select('.rank-body .book-right-info .total p span style')
if not c:
    raise RuntimeError('no <style> tag matched; the page layout may have changed')

# Collect every https URL up to (and including) the closing single quote.
font_url = re.findall('https.*?\'', str(c[0]))
if len(font_url) < 2:
    raise RuntimeError('expected at least two font URLs in the <style> tag')

# Per the original author's note, the second URL is the .woff file.
woff_resp = requests.get(font_url[1].replace('\'', ''), timeout=10)
woff_resp.raise_for_status()
woff = woff_resp.content

# Keep a local copy of the font; the with-block closes the file on exit.
with open(r'.\fonts.woff', 'wb') as f:
    f.write(woff)

# Parse the font and also dump it as XML so the mapping can be inspected.
online_fonts = TTFont(r'.\fonts.woff')
online_fonts.saveXML(r'.\fonts.xml')

# English glyph names -> the characters they stand for.
_dic = {
    "period": ".",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": '7',
    "eight": "8",
    "nine": "9",
}

# Code point -> character. Glyph names missing from _dic are skipped rather
# than raising KeyError; getBestCmap() may return None, hence the `or {}`.
font_map = {
    code: _dic[glyph]
    for code, glyph in (online_fonts.getBestCmap() or {}).items()
    if glyph in _dic
}
print(font_map)

# Replace each numeric character reference (&#<code>;) in the raw page text
# with the character it represents.
for code, digit in font_map.items():
    ct = ct.replace('&#' + str(code) + ';', digit)

# Re-parse the decoded page and print the text of every vote-count tag.
bs = BS(ct, 'lxml')
votes = bs.select('.rank-body .book-right-info .total p span span')

for vote in votes:
    print(vote.get_text())

3、网络爬虫

1、直接爬取页面html，从html中就能获取到结果（30%）

2、爬取的html页面只是一个空壳子，里面没有想要的内容。（50%）
页面中我们看到的内容，是“前端页面通过请求后台接口（ajax请求），得到数据”后，又填充到html中的。
重点：通过工具，找到提供数据的那个后台接口。

3.1、直接爬取页面html-爬取每个章节的内容

昨天的内容：爬取每个章节的内容。


import requests
from bs4 import BeautifulSoup as BS
 
# Pose as a desktop browser.
hds = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}

# Landing page of the novel (catalog tab).
url = 'https://book.qidian.com/info/1027669580/#Catalog'

# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# the script before an HTTP error page gets parsed as a catalog.
resp = requests.get(url, headers=hds, timeout=10)
resp.raise_for_status()
ct = resp.content.decode('utf-8')

# Original author's note: the fetched HTML already contains the chapter
# titles, so parsing it with BeautifulSoup is enough (no API call needed).
bs = BS(ct, 'lxml')
sa = bs.select('.volume-wrap .volume:first-child .book_name a')  # links of the first volume

# Chapter title -> chapter URL ('https:' is prepended to each href).
zjinfo = {a.get_text(): 'https:' + a['href'] for a in sa}

# Download every chapter and save it as a text file named after its title.
for k, v in zjinfo.items():
    resp = requests.get(v, headers=hds, timeout=10)
    resp.raise_for_status()
    ct = resp.content.decode('utf-8')

    bs = BS(ct, 'lxml')
    sps = bs.select('.main-text-wrap .read-content p')  # one <p> per paragraph
    print(len(sps))  # number of paragraphs found in this chapter

    cts = [p.get_text() for p in sps]

    # Write with an explicit UTF-8 encoding so the result does not depend on
    # the platform's default codec, and let the with-block close the file even
    # if the write fails.
    with open(r'C:\Users\lwx\Desktop\星门\{}'.format(k + '.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(cts))  # paragraphs separated by newlines
    print("已下载：" + k)

3.2、爬取数据接口-英魂之刃头像


import requests
from bs4 import BeautifulSoup as BS
 
# Sanity check: the HTML returned for this page is only an empty shell and
# carries no hero information.

# Pose as a desktop browser.
browser_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}
page_url = 'https://cos.99.com/data/'

page_html = requests.get(page_url, headers=browser_headers).content.decode('utf-8')

# Original author's note: page_html.index('东方月初') raised an error, i.e. the
# hero name is not present in the fetched HTML.

# Parse the HTML and look for the hero avatars anyway.
soup = BS(page_html, 'lxml')
hero_imgs = soup.select('#heroList .img img')
# Original author's note: len(hero_imgs) was 0.

# The data is loaded by the page through ajax calls to a backend endpoint, so
# the crawler has to find that endpoint and request it directly.
# Key step: locate the endpoint with the browser's developer tools, combining
# the request name, the Fetch/XHR filter and the response content.
#
# Endpoint serving the hero information:
# https://wjdown.99.com/games/cos/upload/yhzrheroattr/yhzr_hero_list.js?_=1641864636638
 
 
# Request the data endpoint, parse the hero list and download every avatar.
import json

hds = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}

url = 'https://wjdown.99.com/games/cos/upload/yhzrheroattr/yhzr_hero_list.js?_=1641864636638'

# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# before an HTTP error body is handed to the JSON parser.
resp = requests.get(url, headers=hds, timeout=10)
resp.raise_for_status()

# The response is a JSON string rather than an HTML page, so BeautifulSoup is
# not needed: decode the text and turn it into Python objects.
ct = resp.content.decode('utf-8')
ls = json.loads(ct)

# Output path template; {} is filled with the hero's name.
dr = r'C:\Users\lwx\Desktop\英魂之刃\{}.jpg'

for h in ls:
    # h['headimg'] is the avatar URL, e.g.
    # https://wjdown.99.com/games/cos/upload/yhzrheroheadimg/123001.jpg
    img_resp = requests.get(h['headimg'], headers=hds, timeout=10)
    if not img_resp.ok:
        # Skip this hero instead of saving an HTTP error body as a .jpg.
        print('下载失败：', h['name'])
        continue

    # Binary payload: written as-is, no decoding. The with-block closes the
    # file even if the write fails.
    with open(dr.format(h['name']), 'wb') as f:
        f.write(img_resp.content)
    print('下载完成：', h['name'])

小结，根据爬取后的响应，分为如下3种情况：

爬取到的是html页面(文本信息)，需要decode解码+beautifulsoup解析。
爬取到的是接口数据(文本信息)，需要decode解码+json解析。
爬取到的是文件(图片、音视频、woff、pdf、doc等)，不能decode解码，直接io写入本地即可。

3.3、爬取数据接口-课堂练习-王者头像

课堂练习：将王者中108个英雄的头像全部爬取下来，文件名以英雄的名字来命名：https://pvp.qq.com/web201605/herolist.shtml

提醒：

确认是否需要爬取接口，可能html页面中就包含你需要的信息；
如果decode('utf-8')不行，那就decode('gbk')尝试。


# 课堂练习：将王者中108个英雄的头像全部爬取下来，文件名以英雄的名字来命名：
# https://pvp.qq.com/web201605/herolist.shtml
 
import requests
from bs4 import BeautifulSoup as BS
 
# First attempt: read the hero avatars straight from the hero-list HTML page.
browser_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}

page = requests.get('https://pvp.qq.com/web201605/herolist.shtml', headers=browser_headers)

# This page is decoded as GBK rather than UTF-8.
page_html = page.content.decode('gbk')

# Original author's note: the HTML does contain hero information, so it is
# parsed directly.
soup = BS(page_html, 'lxml')
hero_imgs = soup.select('.herolist img')

# Original author's note: this printed 93, i.e. the list in the HTML is
# incomplete.
print(len(hero_imgs))
 
# The HTML list is incomplete, so the JSON endpoint is used instead:
# https://pvp.qq.com/web201605/js/herolist.json
import requests
import json

hds = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}

# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# before an HTTP error body is handed to the JSON parser.
resp = requests.get('https://pvp.qq.com/web201605/js/herolist.json', headers=hds, timeout=10)
resp.raise_for_status()

ct = resp.content.decode('utf-8')

# Original author's note: the parsed list had 108 entries (complete), each
# shaped like
# {'ename': 105, 'cname': '廉颇', 'title': '正义爆轰', 'pay_type': 10, 'new_type': 0, 'hero_type': 3, 'skin_name': '正义爆轰|地狱岩魂'}
ls = json.loads(ct)

# Avatar URL template: both placeholders are filled with the hero id ('ename').
url_img = 'https://game.gtimg.cn/images/yxzj/img201606/heroimg/{}/{}.jpg'

# Output path template; {} is filled with the hero's name ('cname').
dr = r'C:\Users\lwx\Desktop\王者\{}.jpg'

for h in ls:
    hid = h['ename']
    hname = h['cname']

    img_resp = requests.get(url_img.format(hid, hid), headers=hds, timeout=10)
    if not img_resp.ok:
        # Skip this hero instead of saving an HTTP error body as a .jpg.
        print('下载失败：', hname)
        continue

    # Binary payload: written as-is, no decoding. The with-block closes the
    # file even if the write fails.
    with open(dr.format(hname), 'wb') as f:
        f.write(img_resp.content)

    print('下载完成：', hname)

心为形役，尘世马牛；身被名牵，樊笼鸡鹜。——明·陈继儒《小窗幽记》

勿自暴，勿自弃，圣与贤，可驯致。——《弟子规》

正气内存，邪不可干。——《黄帝内经》

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/1004875