Solving Problems with Python
Write and run the code on your own, and write up the lab report following the template.
1 Crawl the product page of one book on dangdang.com and save it as an HTML file
2 Crawl the first 50 short comments of a book on douban.com and compute the average rating (self-study of regular expressions)
3 Crawl the second-hand housing listings of one Changsha neighbourhood (名都花园, Mingdu Garden) from https://cs.lianjia.com/ and save them to an Excel file
""" 爬取并下载当当网某一本书的网页内容,并保存为html格式 """ import os from urllib import request header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'} url = 'http://product.dangdang.com/24029955.html' req = request.Request(url, headers=header) html = str(request.urlopen(req).read) is_exist = os.path.exists('DangDang.html') if not is_exist: with open('DangDang.html', 'w+') as f: f.write(html) else: print('File already exsist')
""" 在豆瓣网上爬取某本书的前50条短评内容并计算评分的平均值(自学正则表达式) """ import re from urllib import request from bs4 import BeautifulSoup comments = [] list = [] def get_commment(comment): count = 0 for i in comment: count = count + 1 # print(count, i.string) # 也可以使用正则 comments.append(i.string) def get_score(score): pattern = re.compile('<span class="user-stars allstar(.*?) rating"') res = re.findall(pattern, str(score)) for irr in res: list.append(float(irr)) header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'} p = 0 for i in range(0, 3): url = f'https://book.douban.com/subject/26912767/comments/?start={i * 20}&limit={(i + 1) * 20}&status=P&sort=new_score' req = request.Request(url, headers=header) html = request.urlopen(req).read() soup = BeautifulSoup(html, 'html.parser') # get_commment(html.find_all("span", class_="short")) get_score(soup) get_commment(soup.find_all("span", class_="short")) for j in range(0, 50): print(comments[j]) sum = 0.0 for j in range(0, 50): sum = sum + float(list[j]) print(sum / 50 * 2 / 10)
""" 从https://cs.lianjia.com/上爬取长沙某小区的二手房信息(以名都花园为例),并将其保存到EXCEL文件当中 """ from urllib import request import xlwt from bs4 import BeautifulSoup def getHouseList(url): house = [] header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'} req = request.Request(url, headers = header) html = request.urlopen(req).read() soup = BeautifulSoup(html, 'html.parser') housename_divs = soup.find_all('div', class_='title') for housename_div in housename_divs: housename_as = housename_div.find_all('a') for housename_a in housename_as: housename = [] housename.append(housename_a.get_text()) housename.append(housename_a.get('href')) house.append(housename) huseinfo_divs = soup.find_all('div', class_='houseInfo') for i in range(len(huseinfo_divs)): info = huseinfo_divs[i].get_text() infos = info.split('|') # 小区名称 house[i].append(infos[0]) # 户型 house[i].append(infos[1]) # 平米 house[i].append(infos[2]) # 查询总价 house_prices = soup.find_all('div', class_='totalPrice') for i in range(len(house_prices)): # 价格 price = house_prices[i].get_text() house[i].append(price) return house # 爬取房屋详细信息:所在区域、套内面积 def houseinfo(url): header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'} req = request.Request(url, headers=header) html = request.urlopen(req).read() soup = BeautifulSoup(html, 'html.parser') msg = [] # 所在区域 areainfos = soup.find_all('span', class_='info') for areainfo in areainfos: area = areainfo.find('a') if (not area): continue hrefStr = area['href'] if (hrefStr.startswith('javascript')): continue msg.append(area.get_text()) break infolist = soup.find_all('div', id='infoList') num = [] for info in infolist: cols = info.find_all('div', class_='col') for i in cols: pingmi = i.get_text() try: a = float(pingmi[:-2]) num.append(a) except ValueError: continue msg.append(sum(num)) return msg def writeExcel(excelPath, houses): workbook = xlwt.Workbook() sheet = workbook.add_sheet('git') row0 = ['标题', '链接地址', '户型', '面积', '朝向', '总价', '所属区域', '套内面积'] for i in range(0, len(row0)): sheet.write(0, i, row0[i]) for i in range(0, len(houses)): house = houses[i] print(house) for j in range(0, len(house)): sheet.write(i + 1, j, house[j]) workbook.save(excelPath) # 主函数 def main(): data = [] for i in range(1, 5): print('-----分隔符', i, '-------') if i == 1: url = 'https://cs.lianjia.com/ershoufang/c3511059937033rs%E5%90%8D%E9%83%BD%E8%8A%B1%E5%9B%AD/' else: url = 'https://cs.lianjia.com/ershoufang/pg' + str( i) + 'c3511059937033rs%E5%90%8D%E9%83%BD%E8%8A%B1%E5%9B%AD/' houses = getHouseList(url) for house in houses: link = house[1] if (not link or not link.startswith('http')): continue mianji = houseinfo(link) house.extend(mianji) data.extend(houses) writeExcel('C:/Users/Lunatic/Desktop/cs.xls', data) if __name__ == '__main__': main()
Web scraping is one of Python's important application areas. Using these techniques well requires not only familiarity with the relevant Python libraries, but also careful analysis of the target pages to find the structural patterns to scrape, which is what makes the automation possible in the first place.