赞
踩
选取一个网站,例如新闻类、影评类、小说、股票金融类、就业类等等。
(1) 初步分析,数据抓取方式:静态或动态页面,get/post方式;
(2) 确定信息抽取方式:BeautifulSoup、re、xpath或json格式提取;
(3) 初步分析网页结构:需要获取几层页面;每层页面提取的URL特征等等。
要求:
1.完成内链接提取和数据提取
2.设置合适的异常处理,保证程序正确运行
3.将提取的数据存储到文件:txt、csv或json等
网站url:
外链:【酒店预订】宾馆预订_网上订酒店_旅馆住宿价格查询-去哪儿酒店
在搜索框输入地点和入住时间,点击搜索按钮
找到信息的爬取位置
使用Selenium中的函数获取此时网页的url
获取html
使用BeautifulSoup与正则表达式爬取酒店名称、评分、地址、价格信息
将结果分别放入列表中
将列表合并为数据框
查看data
import datetime
import time
from pathlib import Path
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Drive a real Chrome window through Qunar's hotel search form and record the
# URL of the resulting page in `html`.
#
# Elements are located with `find_element(By.<strategy>, value)`; the older
# `find_element_by_*` helpers were removed in Selenium 4.3.  Each element gets
# its own descriptive name so the builtin `input` is no longer shadowed.
driver = webdriver.Chrome()
try:
    driver.get('https://www.qunar.com/')

    # Click the "酒店" (hotel) link to enter the hotel search page.
    driver.find_element(By.LINK_TEXT, '酒店').click()
    time.sleep(1)

    # Destination: type "上海" (Shanghai) into the destination box.
    destination_box = driver.find_element(By.CLASS_NAME, "inputText")
    destination_box.clear()
    destination_box.send_keys("上海")
    time.sleep(1)

    # Check-in date: 2021-10-23 (str() of a date is ISO "YYYY-MM-DD").
    day1 = datetime.date(2021, 10, 23)
    checkin_box = driver.find_element(By.XPATH, "//input[@tabindex='2']")
    checkin_box.clear()
    checkin_box.send_keys(str(day1))
    time.sleep(1)

    # Check-out date: 2021-11-02.
    day2 = datetime.date(2021, 11, 2)
    checkout_box = driver.find_element(By.XPATH, "//input[@tabindex='3']")
    checkout_box.clear()
    checkout_box.send_keys(str(day2))
    time.sleep(1)

    # Click the search button (located by its class name "main").
    driver.find_element(By.CLASS_NAME, 'main').click()
    time.sleep(1)

    # URL of the page the browser is on after the search.  NOTE: despite its
    # name, `html` holds a URL string here; the name is kept because the
    # code that follows reads it.
    html = driver.current_url
except Exception:
    # Do not leave the browser window running when navigation fails; close
    # it and let the original error propagate.
    driver.quit()
    raise
# Four separate, initially empty accumulators that the scraping loop fills in
# parallel: hotel names, ratings, addresses and prices.
name, allnum, alladress, allprice = [], [], [], []
def _field_text(card, css_class):
    """Return the text of the first ``<p class=css_class>`` inside *card*.

    An empty string is returned when the tag is absent, so every hotel card
    contributes exactly one value to each column list and the lists stay the
    same length.
    """
    node = card.find('p', {'class': css_class})
    return node.text if node is not None else ''


# Pass the live browser session's cookies to requests instead of a cookie
# string pasted into the source: the pasted value carried session and CSRF
# tokens from one past visit (and pinned checkInDate=2021-11-30 rather than
# the date searched for).  The browser is not used after this point, so it is
# closed here even if reading the cookies fails.
try:
    session_cookies = {c['name']: c['value'] for c in driver.get_cookies()}
finally:
    driver.quit()

# Browser-like request headers.
#  * 'br' is not advertised: requests only decodes Brotli when an optional
#    package is installed, otherwise the body would arrive still compressed.
#  * No 'if-none-match': a conditional request may be answered with an empty
#    304 response, leaving nothing to parse.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'referer': 'https://hotel.qunar.com/',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}

try:
    # `html` holds the result-page URL captured from the browser.  A timeout
    # is set because requests would otherwise wait indefinitely.
    resp = requests.get(html, headers=headers, cookies=session_cookies,
                        timeout=30)
    # Turn 4xx/5xx answers into exceptions instead of parsing an error page.
    resp.raise_for_status()
except requests.exceptions.HTTPError as e:
    # requests raises its own exception types, not urllib's.
    print(e)
except requests.exceptions.ConnectionError:
    print('The server could not be found')
except requests.exceptions.RequestException as e:
    # Timeouts, too many redirects, invalid URL, ...
    print(e)
else:
    soup = BeautifulSoup(resp.content, 'html.parser')
    # One <div class="inner clearfix"> per hotel in the result list.
    for card in soup.find_all('div', {'class': 'inner clearfix'}):
        # Hotel name
        hotel_name = _field_text(card, 'name')
        name.append(hotel_name)
        print("酒店名:" + hotel_name)
        # Rating
        rating = _field_text(card, 'comm')
        allnum.append(rating)
        print("评分:" + rating)
        # Address ('adress' is the class name used by the page markup)
        address = _field_text(card, 'adress')
        alladress.append(address)
        print("地址:" + address)
        # Price (previously printed under the rating label by mistake)
        price = _field_text(card, 'price_new')
        allprice.append(price)
        print("价格:" + price)
    print('It Worked!')
# Assemble the four column lists into a DataFrame; the dict keys become the
# CSV column titles.
dc = {
    "名称": name,
    "评分": allnum,
    "地址": alladress,
    "价格": allprice,
}
data = pd.DataFrame(dc)

# Append the rows to the CSV file.  Because the file is opened in append mode,
# the header row is written only when the file is new or empty; otherwise
# every run would insert another header line in the middle of the data.
out_path = Path('D:\\网络大数据采集\\data.csv')
# Create the target folder if it is missing so the write does not fail.
out_path.parent.mkdir(parents=True, exist_ok=True)
write_header = not out_path.exists() or out_path.stat().st_size == 0
data.to_csv(out_path, encoding="utf-8", index=False, mode='a',
            header=write_header)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。