当前位置:   article > 正文

python爬虫获取新华字典的数据_利用python盗取汉语词典基础数据的方法 csdn

利用 Python 爬取汉语词典基础数据的方法

数据库字段设计
在这里插入图片描述
word_id:自动递增(插入时无需赋值);其余字段为 word、oldword、pinyin、radicals、explanation,与下方 insert 语句中的列一致。

python代码如下:


import requests
import pymysql
from bs4 import BeautifulSoup

def _fetch_html(url, timeout):
    """Fetch *url* and return its HTML decoded as GBK, or None on failure.

    A failure is either a network-level error raised by ``requests`` or any
    HTTP status other than 200; both are reported on stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable page should not abort the whole crawl.
        print(f'{url} is failed!', e)
        return None

    if response.status_code != 200:
        print(f'{url} is failed!')
        return None

    # Undecodable bytes are dropped instead of raising UnicodeDecodeError.
    return response.content.decode('gbk', errors='ignore')


def _parse_word(page_text):
    """Extract (word, oldword, pinyin, radicals, explanation) from a word page.

    Returns None when the page lacks the expected table/cell layout.
    """
    # Turn <br> tags into newlines so line breaks survive the .text extraction.
    page_text = page_text.replace('<br/>', '\n').replace('<br>', '\n')
    soup = BeautifulSoup(page_text, "lxml")
    try:
        # The fields are read from fixed cells of the fifth <table> on the page.
        cells = soup.find_all('table')[4].find_all('td')
        return tuple(cells[i].text.strip() for i in (1, 4, 8, 10, 12))
    except IndexError:
        return None


def _insert_word(conn, record):
    """Insert one parsed record into the ``word`` table, rolling back on error."""
    insert_sql = "insert into word(word,oldword,pinyin,radicals,explanation) values(%s,%s,%s,%s,%s)"
    # Deliberately broad: a single bad row is reported and skipped so the
    # crawl keeps going (same best-effort behaviour as before).
    try:
        with conn.cursor() as cur:
            cur.execute(insert_sql, record)
        conn.commit()
        print("插入数据成功;")
    except Exception as e:
        print("插入数据失败:", e)
        conn.rollback()


def downloader(url, timeout=10):
    """Download every word linked from an index page and store it in MySQL.

    The index page at *url* is fetched, every ``<a target="_blank">`` link on
    it is followed, and the fields parsed from each word page are inserted
    into the ``word`` table. Pages that cannot be fetched or parsed are
    reported on stdout and skipped.

    Args:
        url: URL of the index page listing the words.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    index_text = _fetch_html(url, timeout)
    if index_text is None:
        return

    print(f'{url} is parsing')
    html = BeautifulSoup(index_text, "lxml")

    prefix = 'http://www.zd9999.com'
    # href=True skips anchors that carry no href, which would otherwise make
    # the string concatenation fail on None.
    word_urls = [
        prefix + a.get('href')
        for a in html.find_all('a', target="_blank", href=True)
    ]
    if not word_urls:
        return

    # One connection per index page, closed in ``finally``; previously a new
    # connection was opened for every word and never closed.
    conn = pymysql.connect(
        host='localhost',
        user='root',
        # password
        password='******',
        # database name
        db='*****',
        charset='utf8'
    )
    try:
        for word_url in word_urls:
            print(f'{[word_url]} is parsing')
            word_text = _fetch_html(word_url, timeout)
            if word_text is None:
                continue

            record = _parse_word(word_text)
            if record is None:
                # Unexpected page layout: skip this word instead of crashing.
                print(f'{word_url} is failed!')
                continue

            _insert_word(conn, record)
    finally:
        conn.close()

if __name__ == '__main__':
    # The first index page has no numeric suffix; the remaining pages are
    # index_2.htm through index_101.htm. Crawl them in that order.
    index_pages = ['http://www.zd9999.com/zi/index.htm']
    index_pages.extend(
        f'http://www.zd9999.com/zi/index_{i}.htm' for i in range(2, 102)
    )
    for page_url in index_pages:
        downloader(page_url)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70

效果如下:
在这里插入图片描述

在这里插入图片描述

参考:https://github.com/pwxcoo/chinese-xinhua

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/爱喝兽奶帝天荒/article/detail/843563
推荐阅读
相关标签
  

闽ICP备14008679号