赞
踩
- # coding=utf-8
-
- import urllib.request
- import ssl
- import re
-
- # 开始调用
def getHtml(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urllib.request.urlopen`` (for example
        ``urllib.error.URLError``) is propagated unchanged.
    """
    # NOTE(security): this replaces the process-wide default HTTPS context
    # factory with an unverified one, so certificate checks are disabled for
    # this and every later urllib HTTPS request. Kept so existing behavior
    # does not change; review before pointing this at sensitive endpoints.
    ssl._create_default_https_context = ssl._create_unverified_context
    # Use the response as a context manager so it is closed even when
    # read() fails; the original never closed it.
    with urllib.request.urlopen(url) as page:
        html = page.read()
    return html.decode('utf-8')
-
-
-
def scriptHtmlKind(data, reg):
    """Return every non-overlapping match of the pattern ``reg`` in ``data``.

    The result has the shape produced by ``re.findall``: a list of strings
    when the pattern has at most one group, or a list of tuples when it has
    several groups.

    Example:
        data = '11爱woni000'
        reg = r'\d{2}[\u4e00-\u9fa5]+\w{4}\d{3}'
        gives ['11爱woni000']
    """
    return re.compile(reg).findall(data)
-
-
-
-
- # 处理总列表
def kindAllBoss(url):
    """Handle one category page; currently it only prints ``url``.

    Placeholder: downloading and parsing the category page is not
    implemented yet.
    """
    category_url = url
    print(category_url)
-
-
-
if __name__ == '__main__':
    # Site root; the category paths extracted below are appended to it.
    url = 'https://www.bxwxorg.com/'
    # Download the home page.
    data = getHtml(url)
    # Cut out the navigation <div class="nav"> block that holds the
    # category links.
    reg = r'<div class="nav">\s*<ul>[\u4e00-\u9fa50-9a-zA-Z\<\>\\\"\s\=\:\/\/\.]*?</div>'
    kindDom = scriptHtmlKind(data, reg)
    if not kindDom:
        # The original evaluated a bare `AssertionError` expression here,
        # which raised nothing and let kindDom[0] fail with IndexError.
        raise AssertionError('navigation block not found on ' + url)
    # Each match is a (path, label) tuple, e.g. a path such as 'name/'.
    reg2 = r'.com/(\w*\/)">([\u4e00-\u9fa5a-zA-Z]*)'
    kindList = scriptHtmlKind(kindDom[0], reg2)
    # Keep only the path component of every category.
    kinds = [kind[0] for kind in kindList]

    # Build each category URL and hand it to the category handler.
    for kind in kinds:
        kindAllBoss(url + kind)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。