python爬虫[简易版]

作者：IT小白 | 2024-05-19 07:57:34

踩

python爬虫[简易版]

python爬数据[简易版]

各个网站的爬取原理基本相同, 具体代码写法的区别主要在于: 如何从爬取到的数据中解析出我们想要的内容。

以爬取有道词典中的图片为例:

第一步:打开网站,分析图片的数据源来自哪里,

https://dict-subsidiary.youdao.com/home/content?invalid=&previewEnvTest=

发现我们要的数据源来自这个接口, 它返回的 JSON 如下:

{
  "data": {
     .....略.....
    "secondList": [
      {
        "name": "网易有道词典APP",
        "picture": "https://ydlunacommon-cdn.nosdn.127.net/4e7ca43db1a83f11c467105181e9badb.png",
        "desc": "智能学习更高效",
        "buttonList": [
          {
            "text": "下载",
            "type": 0,
            "url": "https://cidian.youdao.com/download-app/"
          }
        ]
      },
      {
        "name": "有道词典笔",
        "picture": "https://ydlunacommon-cdn.nosdn.127.net/c30638638a393dc38464600caf4888fb.jpg",
        "desc": "更专业的词典笔",
        "buttonList": [
          {
            "text": "查看详情",
            "type": 0,
            "url": "https://smart.youdao.com/dictPenX6Pro"
          }
        ]
      },
    ......略....
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

接下来就是分析返回的数据,解析数据~拿出图片的url

# Fetch the JSON document that lists the pictures.
# BUG FIX: the second positional parameter of requests.get() is `params`, so
# the original call sent the header dict as query-string parameters and never
# set the User-Agent.  The headers must be passed by keyword.
response = requests.get(url, headers=headers, timeout=10)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()

# json.loads() already returns plain dicts and lists, so nested values can be
# indexed directly; the original dumps()/loads() round trips were no-ops.
json_load = json.loads(response.text)
dt = json_load['data']
listS = dt["secondList"]
# Collect the picture URL of every entry in "secondList".
arr = [entry["picture"] for entry in listS]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

爬取数据:

import os
from urllib.parse import urlsplit

# Directory the pictures are written to.  It is created on demand so that
# open() does not fail with FileNotFoundError on a fresh machine.
save_dir = "d:/PythonData/pic/"
os.makedirs(save_dir, exist_ok=True)

# enumerate() supplies the running index used as the file name.
for i, item in enumerate(arr):
    # Close the HTTP response as soon as the bytes have been read.
    with request.urlopen(item) as resp:
        requests_get = resp.read()
    # Keep the extension found in the URL path (the list mixes .png and .jpg)
    # instead of labelling every file .png; fall back to .png if there is none.
    ext = os.path.splitext(urlsplit(item).path)[1] or ".png"
    png = save_dir + str(i) + ext
    with open(png, "wb") as f:
        f.write(requests_get)
1
2
3
4
5
6
7

完整的代码:

"""Download the pictures listed by the Youdao dictionary home-page API."""
import json
import os
import re
from urllib import request
from urllib.parse import urlsplit

import requests

# JSON endpoint whose "data" -> "secondList" entries carry the picture URLs.
url = "https://dict-subsidiary.youdao.com/home/content?invalid=&previewEnvTest="
headers = {
    # User-Agent: basic browser identity sent with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 '
                  'Safari/537.36'
}

# BUG FIX: the second positional parameter of requests.get() is `params`, so
# the original call sent the header dict as query-string parameters and never
# set the User-Agent.  The headers must be passed by keyword.
response = requests.get(url, headers=headers, timeout=10)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()

# json.loads() already returns plain dicts and lists, so nested values can be
# indexed directly; the original dumps()/loads() round trips were no-ops.
json_load = json.loads(response.text)
dt = json_load['data']
listS = dt["secondList"]
# Collect the picture URL of every entry in "secondList".
arr = [entry["picture"] for entry in listS]

# Directory the pictures are written to.  It is created on demand so that
# open() does not fail with FileNotFoundError on a fresh machine.
save_dir = "d:/PythonData/pic/"
os.makedirs(save_dir, exist_ok=True)

# enumerate() supplies the running index used as the file name.
for i, item in enumerate(arr):
    # Close the HTTP response as soon as the bytes have been read.
    with request.urlopen(item) as resp:
        requests_get = resp.read()
    # Keep the extension found in the URL path (the list mixes .png and .jpg)
    # instead of labelling every file .png; fall back to .png if there is none.
    ext = os.path.splitext(urlsplit(item).path)[1] or ".png"
    png = save_dir + str(i) + ext
    with open(png, "wb") as f:
        f.write(requests_get)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

example:

"""
# 导入数据请求模块 --> 第三方模块, 需要安装 pip install requests
# 导入正则模块 --> 内置模块, 不需要安装
"""
import os
import re

import requests

# Crawl outline (the numbered steps are marked inline below):
#   1. Send a request to the listing page while presenting browser headers.
#      Without a browser-like User-Agent the request may be recognised as a
#      crawler and the data may not be returned.  The header value can be
#      copied from the browser's developer tools.
#   2. Read the page source from response.text.
#   3. Extract the picture IDs with a regular expression.
#   4-5. Request each picture's detail page and read its source.
#   6. Extract the picture URL and title from the detail page.
#   7. Download the picture bytes and write them to disk.
#
# Listing-page URL pattern:
#   page 1: http://www.netbian.com/dongman/
#   page 2: http://www.netbian.com/dongman/index_2.htm
#   page 3: http://www.netbian.com/dongman/index_3.htm
# Page 1 has no index suffix, which is why the loop below starts at page 2.

# Patterns are compiled once, outside the loops.  Raw strings keep "\d" a
# regex escape rather than an (invalid) string escape.
DETAIL_ID_RE = re.compile(r'<a href="/desk/(\d+).htm"')
IMAGE_RE = re.compile(r'<img src="(.*?)" alt="(.*?)"')
# Characters that are not allowed in Windows file names.
UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')

# Present the request as coming from a browser.
headers = {
    # User-Agent: basic browser identity sent with every request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
}

# Output directory, created on demand so that open() cannot fail because the
# folder is missing.
save_dir = 'img'
os.makedirs(save_dir, exist_ok=True)

for page in range(2, 11):
    print(f'=================正在采集第{page}页的数据内容=================')
    # 1. Request the listing page.
    url = f'http://www.netbian.com/dongman/index_{page}.htm'
    response = requests.get(url=url, headers=headers)

    # 2-3. Pull every picture ID out of the page source.
    # BUG FIX: in the original this code sat outside the page loop, so only
    # the last page fetched was ever processed.
    img_id_list = DETAIL_ID_RE.findall(response.text)
    for img_id in img_id_list:
        print(img_id)
        # 4-5. Request the detail page of this picture.
        link = f'http://www.netbian.com/desk/{img_id}.htm'
        response_1 = requests.get(url=link, headers=headers)
        # The original author found the text garbled under the guessed
        # encoding, so it is decoded as GBK explicitly.
        response_1.encoding = 'gbk'

        # 6. Extract (picture URL, title); skip pages where nothing matches
        # instead of crashing with IndexError.
        matches = IMAGE_RE.findall(response_1.text)
        if not matches:
            continue
        img_url, img_title = matches[0]

        # 7. Download the picture and save it under a file-system-safe name.
        img_content = requests.get(url=img_url, headers=headers).content
        safe_title = UNSAFE_FILENAME_CHARS_RE.sub('_', img_title)
        with open(os.path.join(save_dir, safe_title + '.jpg'), mode='wb') as f:
            f.write(img_content)
        print(img_url, img_title)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

python中的拷贝

import copy
from typing import override

"""对于简单的 object，用 shallow copy 和 deep copy 没区别"""
"""构造一个类,然后构造一个实例,然后拷贝这个实例,就会发现,浅拷贝和深拷贝效果一样"""
"""如果是拷贝的list,深拷贝会拷贝list里面的内容,浅拷贝不会拷贝list里面的内容,而是拷贝list的地址"""

"""浅拷贝"""


class Dog:
    """Minimal object with a single ``name`` attribute.

    No comparison methods are defined, so two instances are equal only when
    they are the same object.
    """

    # To make dogs with the same name count as the same dog, the comparison
    # methods would have to be overridden, for example:
    #
    #     def __eq__(self, other):
    #         return self.name == other.name
    #
    #     def __gt__(self, other):
    #         return self.name > other.name

    def __init__(self, name):
        self.name = name


# Build one Dog, then take a shallow copy and a deep copy of it.
a = Dog("a")

copy_obj_dog = copy.copy(a)

deep_copy_obj_dog = copy.deepcopy(a)

print(copy_obj_dog)  # e.g. <__main__.Dog object at 0x000002A0BFA56210>
print(deep_copy_obj_dog)  # e.g. <__main__.Dog object at 0x000002A0BFA56ED0>
# Dog defines no __eq__, so == compares identity and the two copies differ.
print(copy_obj_dog == deep_copy_obj_dog)  # False

# Change one of them: modifying the deep copy does not affect the source object.
deep_copy_obj_dog.name = "b"
print(copy_obj_dog.name)  # a
print(deep_copy_obj_dog.name)  # b
print(a.name)  # a
"""深拷贝"""


class Cat:
    """Plain object that stores only the ``name`` it was created with."""

    def __init__(self, name) -> None:
        self.name = name


# Take a shallow and a deep copy of a simple object.
c = Cat("c")
copy_obj_cat = copy.copy(c)
deep_copy_obj_cat = copy.deepcopy(c)

# Rename the shallow copy only; the deep copy and the source keep "c".
copy_obj_cat.name = "d"
print(copy_obj_cat.name)  # d
print(deep_copy_obj_cat.name)  # c
print(c.name)  # c

# For a simple object like this, shallow and deep copy behave the same here:
# each one yields a new object.

cat = Cat("list")
arr = [cat]
# Shallow-copy a list: the new list refers to the same element objects.
copy_arr = copy.copy(arr)
print(copy_arr == arr)  # True

print(arr[0])
print(copy_arr[0])

# Changing an element through the shallow copy is visible through the source list.
copy_arr[0].name = "修改了"
print(arr[0].name)  # 修改了

# Deep-copy a list: changes made through the copy do not affect the source.
cat2 = Cat("list2")
arr2 = [cat2]

deepcopy_arr2 = copy.deepcopy(arr2)
# Cat defines no __eq__ and the elements are distinct objects.
print(deepcopy_arr2 == arr2)  # False

print(arr2[0])
print(deepcopy_arr2[0])

deepcopy_arr2[0].name = "change"
print(arr2[0].name)  # list2

"""浅拷贝list中填充的是地址"""
"""深拷贝list中填充的是原始对象的副本"""


class CustomCopy:
    """Object that customises how ``copy.copy`` and ``copy.deepcopy`` copy it.

    Each hook prints a marker and returns a new ``CustomCopy``.
    """

    def __init__(self, name):
        self.name = name

    def __copy__(self):
        """Return a new instance that reuses the same ``name`` object."""
        print("copy")
        duplicate = CustomCopy(self.name)
        return duplicate

    def __deepcopy__(self, memo):
        """Return a new instance whose ``name`` is itself deep-copied."""
        print("deepcopy")
        cloned_name = copy.deepcopy(self.name, memo)
        return CustomCopy(cloned_name)


# Each call goes through the matching CustomCopy hook, which prints
# "copy" / "deepcopy" before returning the new instance.
a = copy.copy(CustomCopy("a"))
ab = copy.deepcopy(CustomCopy("a"))
# CustomCopy defines no __eq__, so two separate instances compare unequal.
print(a == ab)  # False

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/IT小白/article/detail/591943