赞
踩
目标网页:当当网书籍畅销榜 http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-1
爬取结果:
代码:
- import requests,re,json
-
def request_dangdang(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
-
# One match per <li> entry. The seven capture groups feed, in order, the keys
# 'range', 'iamge', 'title', 'recommend', 'author', 'times' and 'price'.
# NOTE(review): the price prefix also accepts the HTML entity "&yen;" because
# the raw page presumably encodes the yen sign that way -- confirm against the
# live markup.
_BOOK_ITEM_PATTERN = re.compile(
    r'<li>.*?list_num.*?(\d+).</div>'
    r'.*?<img src="(.*?)"'
    r'.*?class="name".*?title="(.*?)">'
    r'.*?class="star">.*?class="tuijian">(.*?)</span>'
    r'.*?class="publisher_info">.*?target="_blank">(.*?)</a>'
    r'.*?class="biaosheng">.*?<span>(.*?)</span></div>'
    r'.*?<p><span\sclass="price_n">(?:¥|&yen;)(.*?)</span>'
    r'.*?</li>',
    re.S,  # entries span several lines, so "." must match newlines too
)


def parse_result(html):
    """Yield one dict per book entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        Dicts with the string-valued keys ``range``, ``iamge``, ``title``,
        ``recommend``, ``author``, ``times`` and ``price``, in the order the
        entries appear in the markup.
    """
    if not html:
        return
    for item in _BOOK_ITEM_PATTERN.findall(html):
        # Key spellings (including 'iamge') are kept unchanged so records
        # already written with them stay consistent.
        yield {
            'range': item[0],
            'iamge': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6],
        }
-
def write_item_to_file(item, path='book.txt'):
    """Append *item* to *path* as one JSON document per line.

    Args:
        item: JSON-serialisable object (here: the dict for one book).
        path: File to append to; opened in append mode with UTF-8 encoding.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
    line = json.dumps(item, ensure_ascii=False) + '\n'
    # The with-statement closes the file, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
-
def main(page):
    """Download one ranking page and append every parsed book to the output file.

    Args:
        page: Page number appended to the end of the list URL.
    """
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dangdang(url)
    if html is None:
        # The download failed or returned a non-200 status; nothing to parse.
        return
    # Extract the wanted fields from the markup and store each record.
    for item in parse_result(html):
        write_item_to_file(item)
-
if __name__ == "__main__":
    # Crawl list pages 1 through 25, one after another.
    for page_number in range(1, 26):
        main(page_number)
参考网址:https://blog.csdn.net/weixin_42469142/article/details/89856325
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。