Watching other people fly through downloads with asynchronous requests, I couldn't resist trying it myself. The process was not pretty, though: one careless step and the screen fills with red warnings.
Still, the joy of finally getting it to work is intoxicating; that is part of the charm of learning.
Some chapter titles were missed, and the first few chapters ended up containing the content of the newest chapters; this is probably the regular expression matching going astray. Keep checking further down.
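The chapter-name regex went through several revisions; the earlier patterns and the titles they missed are kept as comments in the script below. Here is a minimal sketch of the mismatch, using the title variants mentioned in those comments (the anchor markup and the first, all-Chinese title are made up for illustration): only the first title satisfies the strict pattern, the other three slip through, which matches the misses noted in the script.

import re

anchors = [
    '<a href="7063460.html">第1章 回家</a>',            # hypothetical "normal" title
    '<a href="7063460.html">第373章 B级任务</a>',        # title starts with a Latin letter
    '<a href="7063460.html">010 章 搂腰算非礼吗?</a>',   # no 第, extra space before 章
    '<a href="7063460.html">137章</a>',                  # number only, no title
]

strict = r'第\d{1,4}章 [\u4e00-\u9fa5]+'        # first attempt: only the first anchor matches
loose = r'.html.>(第?\d{0,4} ?章? ?.*?)</a'     # final pattern in the script: matches all four

for a in anchors:
    print(re.findall(strict, a), re.findall(loose, a))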
Why did Chapter 1 end up this far down in the file? Reason unknown; keep checking.
Chapter 2 is empty.
I checked a few more chapters further down and they look fine now. Overall: a few extra chapters appear before Chapter 1, some chapters are empty, and the rest is normal. The likely cause is that those leading entries have a different HTML structure (see the sketch below).
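A guess at the cause, and a possible filter: many biquge-style catalog pages repeat the newest chapters at the top of the #list block before the full chapter list, so soup.find(id='list') picks those duplicates up first. A minimal sketch under that assumption (not verified against xbiquge.cc) that drops the leading duplicates by keeping only the last occurrence of each link:

from bs4 import BeautifulSoup

def chapter_anchors(catalog_html):
    # Assumption: the extra entries before Chapter 1 are "latest chapter" links
    # repeated at the top of #list, i.e. duplicates of links that also appear
    # later in the full list. Walk the anchors backwards so the full list wins,
    # then restore the original order.
    box = BeautifulSoup(catalog_html, 'lxml').find(id='list')
    seen, kept = set(), []
    for a in reversed(box.find_all('a')):
        href = a.get('href')
        if href not in seen:
            seen.add(href)
            kept.append(a)
    kept.reverse()
    return kept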
All chapter content has been saved to "<novel name>.txt"; after some cleanup, the processed result is saved to "modify.txt".
A quick skim through the whole file: the chapter content looks fine and the order matches.
# -*- coding:utf8 -*-
# Download novels from https://www.xbiquge.cc/
# https://www.xbiquge.cc/book/9860/
# https://www.xbiquge.cc/book/9860/7063460.html
# catalog = 目录, chapter = 章节
# r'[\u4e00-\u9fa5]+'  one or more Chinese characters
# r'\d{1,10}'  chapter link number; the chapter links sit in the 2nd div with class box_con
# r'[\u4e00-\u9fa5]+\d{1,4}[\u4e00-\u9fa5]+ [\u4e00-\u9fa5]+'  novel chapter name
import requests
import asyncio
import aiohttp
import json
import re
import time
import os
import sys
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Cm

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
url = input('please input url:')
if len(url) < 24:
    # url = 'https://www.xbiquge.cc/book/9860/'  # default address for convenience while testing
    url = 'https://www.xbiquge.cc/book/14779/'
rootPath = r'C:\Users\QQ\Desktop\ls\py\{}'
url = url.replace(' ', '')
# name = '我的微信连三界 狼烟新书'  # exactly one of `name` and saveCatalog() must be commented out
# name = '一世兵王 我本疯狂新书'


def getCatalog():
    def saveCatalog():
        rep = requests.get(url, headers=headers)
        print(rep.text[:10])
        rep.encoding = 'gbk'
        soup = BeautifulSoup(rep.text, 'lxml')  # parse the catalog page
        title = soup.title.contents[0]
        print(title)
        global name
        name = (re.findall('(.+?) ', title))[0] + ' ' + (re.findall('_(.+?)_', title))[0]  # novel name
        print(name)
        mkDir(path=rootPath.format(name))  # create the folder for the files saved later
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '目录')
        with open(f1, 'w') as f:
            f.write(rep.text)

    saveCatalog()  # only needs to run once

    def findAllChapter():
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '目录')
        f2 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '章节链接')
        with open(f1, 'r') as f:
            rep = f.read()
        soup = BeautifulSoup(rep, 'lxml')
        s = str(soup.find(id='list'))
        soup = BeautifulSoup(s, 'lxml')
        ss = soup.findAll('a')[:]
        print(ss[:10], ss[-10:])
        global cul, cnl
        # print(str(s))
        cul = re.findall(r'\d{6,8}.html', str(s))  # ChapterUrlList
        # cnl = re.findall(r'第\d{1,4}章 [\u4e00-\u9fa5]+', str(ss))  # ChapterNameList, 我的微信连三界, missed 第373章 B级任务
        # cnl = re.findall(r'>(第{0,1}\d{1,4}章 .+?)<', str(s))  # ChapterNameList, 一世兵王, missed 010 章 搂腰算非礼吗?
        # cnl = re.findall(r'>(第{0,1}\d{1,4} {0,1}章 .+?)<', str(s))  # ChapterNameList, 一世兵王, missed 137章 (no title)
        cnl = re.findall(r'.html.>(第?\d{0,4} ?章? ?.*?)</a', str(s))  # only supports numeric chapter numbers
        print(len(ss), len(cul), len(cnl))
        print(cul[:10], cul[-10:], cnl[:10], cnl[-10:])
        print('len(cul):', len(cul), 'len(cnl):', len(cnl))
        '''for i in range(0, len(ss)):  # check the regular expression; comment out again after checking
            c = str(ss[i])
            cu = re.search(r'\d{7,8}.html', str(c)).group()
            cn = c[c.index('.html') + 7:-4]
            if cu != cul[i] or cn != cnl[i]:
                print(cu, cul[i], cu == cul[i], cn, cnl[i], cn == cnl[i])
                break'''
        if len(cul) == len(cnl):
            with open(f2, 'w') as f:
                for u, n in zip(cul, cnl):
                    f.write(u + n + '\n')
            print('All url and name of chapters from source have been saved in this file:{}'.format(f2))
        else:
            print('Rules require changes to the regular expression')  # the regex must be adapted to the page

    # if the catalog has not been saved yet, fetch and save it first; otherwise start extracting the chapter information
    findAllChapter()


def mkDir(path):
    if not os.path.exists(path):
        os.makedirs(path)


def missingChapter():  # does not work for Chinese-numeral chapter numbers; improvement: add a Chinese-numeral-to-Arabic converter
    new = int(re.search(r'\d{1,4}', cnl[-1]).group())
    # print('newest chapter: ', cnl[-1])
    nl = [0]  # chapter number list
    ml = []  # missing chapter number list
    for i in range(len(cnl)):
        nl.append(int(re.search(r'\d{1,4}', cnl[i]).group()))
        d = nl[i] - nl[i - 1] - 1
        while d > 0:
            ml.append(nl[i] - d)
            # print("missing chapters' number:{}!!!".format(ml[-1]), d)
            d -= 1
    return nl
    '''
    for i in ml:
        if str(i) in str(cnl):
            print(i, True)
        else:
            print(i, False)
    '''


def saveChapter():
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, name)
    # print(list(zip(cul[1900:], cnl[1900:])))
    lencnl = len(cnl)
    nlc = modify()  # Number of loaded chapters
    with open(f3, 'a') as f:
        # for cu, cn in zip(cul[nlc:], cnl[nlc:]):  # adjust the start position to the actual situation
        # the synchronous loop above was replaced with the asynchronous code below
        async def getreptext(session, cu, cn, serialcontent):
            async with session.get(url + cu, headers=headers) as rep:
                rep = await rep.text(encoding='gbk')
                serialcontent.update(await cache(rep, cu, cn))
                return serialcontent

        async def cache(rep, cu, cn, nlc=nlc):
            start = time.perf_counter()
            content = ''
            for s in rep.splitlines():
                if 'span' in s:
                    # print('this line contains <span>')
                    continue  # this line of the page is problematic
                test1 = re.findall(r' (.+)<', s)
                if test1:
                    content += test1[0] + '\n'
            if len(content) > 1200:  # chapters shorter than 1200 characters are not written to the file
                content += '\n'
                print('contents have been written to cache from : {} {}'.format(cu, cn))
            else:
                print(content)
                content = '\n'
                print('There are problems in this chapter : {} {} !!!'.format(cu, cn))
                # continue
            end = time.perf_counter()
            rt = end - start  # run time
            trt = rt * (lencnl - nlc)  # total remaining time
            print('estimated remaining runtime : {} minutes {} seconds'.format(trt // 60, trt % 60))
            nlc += 1  # note: only updates the coroutine-local copy bound as the default argument
            return {cu: content}

        # group the tasks so that too many requests in a short time do not get the crawler banned
        pervolume = 20  # workload: how many chapters are downloaded per iteration
        size = lencnl - nlc
        tail = size % pervolume
        iterations = size // pervolume

        async def getandcache(cu, cn, serialcontent):
            async with aiohttp.ClientSession() as session:
                await getreptext(session, cu, cn, serialcontent)

        for i in range(iterations):
            serialcontent = {}
            tasks = [asyncio.ensure_future(getandcache(cu, cn, serialcontent))
                     for cu, cn in zip(cul[nlc:nlc + pervolume], cnl[nlc:nlc + pervolume])]
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))
            for i in cul[nlc:nlc + pervolume]:
                f.write(serialcontent[i])
            # print(len(serialcontent), serialcontent)
            nlc = nlc + pervolume
            # del serialcontent
        pervolume = tail
        if pervolume:
            serialcontent = {}
            tasks = [asyncio.ensure_future(getandcache(cu, cn, serialcontent))
                     for cu, cn in zip(cul[nlc:nlc + pervolume], cnl[nlc:nlc + pervolume])]
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))
            for i in cul[nlc:nlc + pervolume]:
                f.write(serialcontent[i])
            # print(len(serialcontent), serialcontent)
            del serialcontent


def runlog():  # log each run: duration, run time, chapters already saved, missing chapters, newly added chapters, etc.
    pass


def modify():  # check the file for ads, stray characters and empty chapters; refine saveChapter() based on the findings
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, name)
    f4 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, 'modify')
    if not os.path.exists(f3):
        with open(f3, 'w') as f:
            pass
        print('created empty file : {}'.format(f3))
    else:
        print('file already exists : {}'.format(f3))
    with open(f3, 'r') as f, open(f4, 'w') as fs:
        cc(f)
        c = 0
        li = f.readlines()
        # print(type(li), len(li))
        for n, i in enumerate(li):
            if 'span' in i:
                continue
            fs.write(i)
            if i == '\n' and n < len(li) - 1:
                c += 1
                if '第' not in li[n + 1] and '章' not in li[n + 1]:
                    # print(cnl[c])
                    fs.write(cnl[c] + '\n')
                    pass
                    # print('c :', c, 'cnl[c] :', cnl[c], 'cnl[c-1] :', cnl[c - 1])
    return c


def cc(file):  # count characters
    f00 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, 'other characters')
    hs0 = {
        3: '·、【】!¥—~……();‘’:“”《》,。?、',
        4: ''' `~!@#$%^&*()_+-={}|:%"<>?[]\;',./×'''
    }
    hs = {
        1: 0,  # Chinese characters
        2: 0,  # english letters
        3: 0,  # Chinese punctuation marks
        4: 0,  # english punctuation marks
        5: 0,  # digits
        6: 0,  # lines
        7: 0,  # share of Chinese characters among all characters
    }
    string = file.read()
    with open(f00, 'w') as f:
        for i in string:
            if 19968 <= ord(i) <= 40869:
                hs[1] += 1
            elif 65 <= ord(i) <= 90 or 97 <= ord(i) <= 122:
                hs[2] += 1
            elif i in hs0[3]:
                hs[3] += 1
            elif i in hs0[4]:
                hs[4] += 1
            elif 48 <= ord(i) <= 57:
                hs[5] += 1
            elif i == '\n':
                hs[6] += 1
            else:
                f.write(i)  # collect any other special characters; there should be none, otherwise the text is probably garbled
    length = len(string)
    hs[7] = hs[1] / (length + 1)  # len+1 avoids ZeroDivisionError: division by zero
    file.seek(0)
    l = ['中文', 'english letter', '中文标点符号', 'english punctuation marks', '数字', '行数', '中文字数占总字符数的比例']
    for i in range(7):
        if i == 6:
            print('{} : {:.2%}'.format(l[i], hs[i + 1]))
        else:
            print('{} : {:.2f}万'.format(l[i], hs[i + 1] / 10000))
    print('\n总字符数:{:.0f}万.平均每章节{:.0f}字,平均每个段落{:.0f}字\n'.format(
        length / 10000, length / (len(cnl) + 1), length / (hs[6] + 1)))  # +1 so we never divide by zero


def main():
    start = time.perf_counter()
    getCatalog()
    # missingChapter()
    saveChapter()
    modify()
    end = time.perf_counter()
    print('total time consumed : ', (end - start) // 60, 'minutes', (end - start) % 60, 'seconds')


main()
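The script throttles itself by downloading in fixed batches of 20 tasks and driving the event loop once per batch with run_until_complete. A sketch of an alternative throttle for the same download step: cap concurrency with a semaphore and use a single asyncio.run. The helper names below are illustrative, not part of the original script, and the result would still need to go through the cache()/regex step before being written to the file.

import asyncio
import aiohttp

async def fetch_chapters(base_url, chapter_urls, headers, limit=20):
    # Download every chapter page, with at most `limit` requests in flight.
    sem = asyncio.Semaphore(limit)
    pages = {}

    async def fetch_one(session, cu):
        async with sem:  # the semaphore replaces the fixed batches of 20
            async with session.get(base_url + cu, headers=headers) as rep:
                pages[cu] = await rep.text(encoding='gbk')

    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch_one(session, cu) for cu in chapter_urls))
    return pages

# usage sketch: pages = asyncio.run(fetch_chapters(url, cul[nlc:], headers))
# the pages can then be written in the order of cul, as the script already does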