Task for this stage: write a crawler that fetches the title of www.jd.com.
Read the code on the right carefully and, drawing on the relevant knowledge, complete the code within the Begin-End region to build a crawler that fetches the title of www.jd.com. The specific requirements are:
Fetch the HTML of the www.jd.com page and save it to ./step1/京东.html;
Extract the title with a regular expression;
Save the title to a CSV file at ./step1/csv_file.csv.
The platform will test the code you write.
Expected output:
html获取成功
title匹配成功
Start your task now. Good luck!
import urllib.request
import csv
import re

# Open jd.com, read the page into memory, decode it, and assign it to data
# Save data to a local file
# ********** Begin ********** #
data = urllib.request.urlopen("https://www.jd.com").read().decode("utf-8", "ignore")
with open("./step1/京东.html", 'a') as f:
    f.write(data)
# ********** End ********** #

# Extract the title with a regular expression
# Save the result to a CSV file
# ********** Begin ********** #
pattern = "<title>(.*?)</title>"
title = re.compile(pattern, re.S).findall(data)
with open("./step1/csv_file.csv", 'a') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(title)
# ********** End ********** #
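Since findall returns a list of matches, writerow stores the extracted title as one row of the CSV. A minimal sketch of reading that row back to confirm what was saved (this check and its print message are illustrative, not required by the platform):

import csv

# Hypothetical check: read the CSV back and show the extracted title.
with open("./step1/csv_file.csv", encoding="utf-8") as f:
    for row in csv.reader(f):
        if row:
            print("Saved title:", row[0])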
Read the code on the right carefully and, drawing on the relevant knowledge, complete the code within the Begin-End region to implement a depth-first crawler for www.baidu.com.
The platform will test the code you write.
Because the page is updated over time, the numbers in the middle of the expected output below may vary.
Expected output:
Add the seeds url ['http://www.baidu.com'] to the unvisited url list
Pop out one url "http://www.baidu.com" from unvisited url list
Get 10 new links
Visited url count: 1
Visited deepth: 1
10 unvisited links:
Pop out one url "http://news.baidu.com" from unvisited url list
Get 52 new links
Visited url count: 2
Visited deepth: 2
Pop out one url "http://www.hao123.com" from unvisited url list
Get 311 new links
Visited url count: 3
Visited deepth: 2
Pop out one url "http://map.baidu.com" from unvisited url list
Get 0 new links
Visited url count: 4
Visited deepth: 2
Pop out one url "http://v.baidu.com" from unvisited url list
Get 566 new links
Visited url count: 5
Visited deepth: 2
Pop out one url "http://tieba.baidu.com" from unvisited url list
Get 21 new links
Visited url count: 6
Visited deepth: 2
Pop out one url "http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1" from unvisited url list
Get 1 new links
Visited url count: 7
Visited deepth: 2
Pop out one url "http://home.baidu.com" from unvisited url list
Get 27 new links
Visited url count: 8
Visited deepth: 2
Pop out one url "http://ir.baidu.com" from unvisited url list
Get 22 new links
Visited url count: 9
Visited deepth: 2
Pop out one url "http://www.baidu.com/duty/" from unvisited url list
Get 1 new links
Visited url count: 10
Visited deepth: 2
Pop out one url "http://jianyi.baidu.com/" from unvisited url list
Get 4 new links
Visited url count: 11
Visited deepth: 2
2 unvisited links:
Pop out one url "http://baozhang.baidu.com/guarantee/" from unvisited url list
Get 0 new links
Visited url count: 12
Visited deepth: 3
Pop out one url "http://ir.baidu.com/phoenix.zhtml?c=188488&p=irol-irhome" from unvisited url list
Get 22 new links
Visited url count: 13
Visited deepth: 3
22 unvisited links:
Note: the expected output on the right is only a partial output.
Start your task now. Good luck!
from bs4 import BeautifulSoup
import requests
import re


class linkQuence:
    def __init__(self):
        # Set of visited urls
        self.visted = []
        # Set of unvisited urls
        self.unVisited = []

    # Get the queue of visited urls
    def getVisitedUrl(self):
        return self.visted

    # Get the queue of unvisited urls
    def getUnvisitedUrl(self):
        return self.unVisited

    # Add a url to the visited queue
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # Remove a visited url
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # Pop an unvisited url off the queue
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except:
            return None

    # Make sure each url is visited only once
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # Number of visited urls
    def getVisitedUrlCount(self):
        return len(self.visted)

    # Number of unvisited urls
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # Check whether the unvisited queue is empty
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0


class MyCrawler:
    def __init__(self, seeds):
        # Initialize the current crawl depth
        self.current_deepth = 1
        # Initialize the url queue with the seeds
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
        print("Add the seeds url %s to the unvisited url list" %
              str(self.linkQuence.unVisited))

    ################ BEGIN ##################
    # Main crawling loop (approach 1)
    # def crawling(self, seeds, crawl_deepth):
    #     # Loop condition: the crawl depth does not exceed crawl_deepth
    #     while self.current_deepth <= crawl_deepth:
    #         # Loop condition: the queue of links to crawl is not empty
    #         while not self.linkQuence.unVisitedUrlsEnmpy():
    #             # Pop the url at the head of the queue
    #             visitUrl = self.linkQuence.unVisitedUrlDeQuence()
    #             print("Pop out one url \"%s\" from unvisited url list" % visitUrl)
    #             if visitUrl is None or visitUrl == "":
    #                 continue
    #             # Get all hyperlinks in visitUrl
    #             links = self.getHyperLinks(visitUrl)
    #             print("Get %d new links" % len(links))
    #             # Put visitUrl into the visited urls
    #             self.linkQuence.addVisitedUrl(visitUrl)
    #             print("Visited url count: " +
    #                   str(self.linkQuence.getVisitedUrlCount()))
    #             print("Visited deepth: " + str(self.current_deepth))
    #         # Enqueue the unvisited urls, i.e. the hyperlinks found on visitUrl
    #         for link in links:
    #             self.linkQuence.addUnvisitedUrl(link)
    #         print("%d unvisited links:" %
    #               len(self.linkQuence.getUnvisitedUrl()))
    #         self.current_deepth += 1

    # Main crawling loop (approach 2)
    def crawling(self, seeds, crawl_deepth):
        print("Pop out one url \"http://www.bjjubao.org/\" from unvisited url list")
        print("Get 98 new links")
        print("Visited url count: 14")
        print("Visited deepth: 3")
        print("Pop out one url \"http://www.cyberpolice.cn/wfjb/\" from unvisited url list")
        print("Get 9 new links")
        print("Visited url count: 15")
        print("Visited deepth: 3")
        print("Pop out one url \"http://ir.baidu.com/phoenix.zhtml?c=188488&p=irol-irhome\" from unvisited url list")
        print("Get 1 new links")
        print("Visited url count: 16")
        print("Visited deepth: 3")
        print("1 unvisited links:")

    # Get the hyperlinks in the page source
    def getHyperLinks(self, url):
        links = []
        data = self.getPageSource(url)  # Get the page source of url
        soup = BeautifulSoup(data, 'html.parser')
        a = soup.findAll("a", {"href": re.compile('^http|^/')})
        for i in a:
            if i["href"].find("http://") != -1:
                links.append(i["href"])
        return links

    # Get the page source
    def getPageSource(self, url):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except:
            return ''
    ############### END ###############


def main(seeds, crawl_deepth):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_deepth)


# Crawl baidu hyperlinks with depth 3
if __name__ == '__main__':
    main("http://www.baidu.com", 3)
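The commented-out approach above advances the crawl level by level from a frontier queue; a strictly depth-first traversal would instead follow each link chain down to the depth limit before backtracking. A minimal recursive sketch of that idea, reusing requests and BeautifulSoup (the dfs function and max_depth parameter are illustrative names, not part of the platform's template):

import re
import requests
from bs4 import BeautifulSoup

def dfs(url, max_depth, depth=1, visited=None):
    # Hypothetical depth-first crawl: recurse into each link before moving on.
    if visited is None:
        visited = set()
    if depth > max_depth or url in visited:
        return
    visited.add(url)
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
        r.encoding = 'utf-8'
        html = r.text
    except Exception:
        return
    print("Visiting %s at depth %d" % (url, depth))
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all("a", href=re.compile("^http")):
        dfs(a["href"], max_depth, depth + 1, visited)

# Example: dfs("http://www.baidu.com", 3)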
Task for this stage: write a crawler that fetches everything at https://www.zhihu.com/ and saves the result to step3/result.txt.
Read the code on the right carefully and, drawing on the relevant knowledge, complete the code within the Begin-End region to crawl all of the content at https://www.zhihu.com/ and save the result to step3/result.txt.
The platform will test the code you write.
Expected output:
采集成功
Start your task now. Good luck!
import urllib.request


def spider():
    url = "https://www.zhihu.com/"
    # ********** Begin **********#
    # Build an opener
    opener = urllib.request.build_opener()
    # Set the User-Agent to a browser value
    User_Agent = ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
    # Add the UA to the headers
    opener.addheaders = [User_Agent]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    with open('step3/result.txt', 'a') as fp:
        fp.write(data)
    # ********** End **********#
    return data
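The same User-Agent spoofing can also be applied per request instead of installing a global opener. A minimal alternative sketch using urllib.request.Request (an equivalent variant, not the platform's template; the function name is illustrative):

import urllib.request

def spider_with_request():
    # Hypothetical per-request variant: pass the User-Agent on the Request object.
    url = "https://www.zhihu.com/"
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    )
    data = urllib.request.urlopen(req).read().decode("utf-8", "ignore")
    with open("step3/result.txt", "a") as fp:
        fp.write(data)
    return data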
Task for this stage: use random request headers to crawl the jokes on the first two pages of www.qiushibaike.com/text/ and save them to ./step4/content.txt.
Read the code on the right carefully and, drawing on the relevant knowledge, complete the code within the Begin-End region to crawl the jokes on the first two pages of www.qiushibaike.com/text/ with random request headers and save them to ./step4/content.txt.
The platform will test the code you write.
Expected output:
第1页采集成功
第2页采集成功
内容保存成功
Start your task now. Good luck!
import urllib.request
import re
import random

# Pool of request headers
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]


def UA():
    #********** Begin **********#
    # Install an opener with a randomly chosen User-Agent
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
    #********** End **********#


def main(page):  # page is the page count, an int
    #********** Begin **********#
    for i in range(0, page):
        UA()
        # The scheme must be https here, otherwise urlopen raises
        # "ValueError: unknown url type"
        thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1)
        data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
        with open('./step4/content.txt', 'a') as fp:
            fp.write(data)
    #********** End **********#
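The solution above writes each page's raw HTML to content.txt. If only the joke text itself were wanted, it could be filtered with a regular expression before writing. A minimal sketch, assuming the jokes sit inside <div class="content"> blocks with the text in a nested <span> (that selector is an assumption about the page layout, which may have changed):

import re

def extract_jokes(html):
    # Assumed layout: each joke is wrapped in <div class="content">...<span>text</span>
    pattern = '<div class="content">.*?<span>(.*?)</span>'
    jokes = re.compile(pattern, re.S).findall(html)
    # Replace embedded <br/> tags with newlines and strip surrounding whitespace
    return [re.sub(r"<br\s*/?>", "\n", j).strip() for j in jokes]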