赞
踩
爬取携程酒店的信息:将酒店名称,地址,电话,开业时间,客房数,图片链接,点评,图片保存路径生成excel,再将图片保存在对应酒店的image文件夹中
第一步:
需要先对数据进行抓包:
可以用Burp Suite或者Fiddler进行抓包,(Burp Suite下载教程可以参考此链接BurpSuite全平台破解通用-至今可用 - SaberCC Blog)
那么现在打开Burp Suite,在携程搜索框输入一个城市点击搜索,再看看Burp Suite,找到有全部酒店id的包,发现请求数据中有个pageIndex,这个就是说你将页面往下滑时会加载出来更多的酒店,那么这个pageIndex可以作为参数,同样还有城市参数(cityid)。开始写得到hotelid的代码。写一下请求数据,得到响应数据,先创建一个gethotelid.py
- #gethotelid.py
- # Fetch one page of Ctrip's mini-program hotel-list API for a city.
- # pageIndex: 1-based list page; cityid: Ctrip numeric city id.
- # The request URL/cookies/headers/body below were exported verbatim from Burp Suite.
- from 检查是否有重复字段 import remove_duplicate_lines
- import requests
- def get_hotel_list(pageIndex,cityid):
- burp0_url = "https://m.ctrip.com:443/restapi/soa2/22370/gethotellist?_fxpcqlniredt=52271165296426527463"
- burp0_cookies = {"Union": "OUID=mini1053&AllianceID=1314167&SID=4258862&SourceID=55555549", "DUID": "u=656D90DBCBB453190FC35AE0E82113CC&v=0", "GUID": "52271165296426527463"}
- burp0_headers = {"X-Ctx-Locale": "zh-CN", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8447", "X-Ctx-Group": "ctrip", "Content-Type": "application/json", "X-Wx-Openid": "33d6ec3b-d887-4cf8-8fc4-c29c77f47342", "X-Ctx-Personal-Recommend": "1", "Xweb_xhr": "1", "X-Ctx-Region": "CN", "X-Ctx-Currency": "CNY", "Duid": "u=656D90DBCBB453190FC35AE0E82113CC&v=0", "Accept": "*/*", "Sec-Fetch-Site": "cross-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://servicewechat.com/wx0e6ed4f51db9d078/800/page-frame.html", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9"}
- # NOTE(review): the auth/cid/session values in the body are capture-specific and will expire.
- # pageIndex and cityid are spliced into the captured JSON body below.
- burp0_json={"channel": 1, "checkinDate": "2023-10-18", "checkoutDate": "2023-10-26", "cityId": cityid, "districtId": 0, "filterInfo": {"filterItemList": [], "highestPrice": 0, "keyword": "", "lowestPrice": 0, "starItemList": []}, "head": {"auth": "1369D5C2A3508C1F5B13853ACC15CAEE9FB5FC99929FEB6DA28D19CC13AC3AEB", "cid": "52271165296426527463", "ctok": "", "cver": "1.1.188", "extension": [{"name": "sdkversion", "value": "3.0.0"}, {"name": "openid", "value": "33d6ec3b-d887-4cf8-8fc4-c29c77f47342"}, {"name": "pageid", "value": "10650012159"}, {"name": "supportWebP", "value": "true"}, {"name": "ubt", "value": "{\"vid\":\"1697027931003.8mu33n\",\"sid\":2,\"pvid\":18,\"ts\":1697030756781,\"create\":1697027931003,\"pid\":\"10320613574\"}"}, {"name": "supportFuzzyPrice", "value": "1"}, {"name": "appId", "value": "wx0e6ed4f51db9d078"}, {"name": "scene", "value": "1053"}], "lang": "01", "sauth": "", "sid": "", "syscode": "30"}, "hiddenHotelIds": None, "isHourRoomSearch": False, "isMorning": 0, "nearbySearch": 0, "pageCode": "hotel_miniprogram_list", "pageIndex":pageIndex, "pageSize": 10, "preCount": 0, "preHotelIds": "", "session": {"key": "6bf9afb8c7f191d9e9db936b511103b1386625ab2ab18cf064b7427af73e9077", "sessionKey": "6720161d-0cb7-4dd0-be39-df37110067d0"}, "sessionId": "", "sourceFromTag": "inquire_preload", "topHotelIds": [], "userCoordinate": None}
- res = requests.post(burp0_url, headers=burp0_headers, cookies=burp0_cookies, json=burp0_json)
- # data is the decoded JSON response; the hotel ids are extracted from it further below.
- data = res.json()
那么我们可以调用函数把这个data给打印一下,分析出这个hotelid在哪里
在控制台看着比较乱,我们可以用json格式化网站(在线JSON校验格式化工具(Be JSON))帮助我们快速找到这个hotelid的位置,可以发现hotelInfoList是个列表,列表里包含字典,每个字典包含每个酒店信息,hotelId就在这个字典里。那么我可以用循环取得每个id。
- #gethotelid.py
- # Continuation of get_hotel_list: pull every hotelId out of the response.
- # print(data)
- # hotelInfoList is a list of dicts, one per hotel; all entries share one cityName.
- cityName = data['hotelInfoList'][0]['cityName']
- # print(cityName)
- hotel_ids = []
- for hotel_info in data['hotelInfoList']:
- hotel_id = hotel_info['hotelId']
-
- hotel_ids.append(hotel_id)
- # cityName is returned so the caller can name the output txt file after the city.
- return hotel_ids, cityName
这个cityName拿过来方便用于创建txt名字
然后得到hotelid之后我们可以把它写入txt文件
# gethotelid.py
def save_hotel_ids_to_txt(cityid, max_pages):
    """Collect hotel ids for up to max_pages list pages and write them to <cityName>.txt.

    Calls get_hotel_list() page by page, writes one id per line, then removes
    duplicate lines via remove_duplicate_lines().

    Returns:
        The name of the txt file that was written.
    """
    all_hotel_ids = []
    city_name = None  # set on the first successful page; used to name the txt file
    try:
        for page_index in range(1, max_pages + 1):
            hotel_ids, city_name = get_hotel_list(page_index, cityid)
            print(f"Page {page_index} Hotel Info: {hotel_ids}")
            all_hotel_ids.extend(hotel_ids)
    except Exception as e:
        # The API stops returning well-formed pages past the last one; any
        # error is treated as "no more pages".
        print("读取完成")

    # Bug fix: cityName was unbound (NameError) when the very first request
    # failed; fall back to the numeric city id so the file can still be written.
    if city_name is None:
        city_name = str(cityid)

    # Write all collected hotel ids into a txt file named after the city.
    txt_file_name = f'{city_name}.txt'
    with open(txt_file_name, 'w') as file:
        for hotel_id in all_hotel_ids:
            file.write(str(hotel_id) + '\n')

    # The list pages can repeat ids, so deduplicate the file in place.
    remove_duplicate_lines(txt_file_name)
    print(f"Hotel IDs saved to {txt_file_name}")
    return txt_file_name
这串代码中的remove_duplicate_lines函数的添加是因为我在测试中发现生成的txt文件可能会有一些hotelid重复,于是弄了这个函数来检查txt文件,这个函数是从另外一个叫做“检查是否有重复字段.py”文件中导入的
# File: 检查是否有重复字段.py (duplicate-line checker)
def remove_duplicate_lines(file_path):
    """Rewrite file_path so only the first occurrence of each line remains.

    Lines are compared after stripping surrounding whitespace; every
    duplicate found is reported on stdout before the file is rewritten.
    """
    unique_lines = []
    already_seen = set()

    with open(file_path, 'r') as fh:
        for raw_line in fh:
            candidate = raw_line.strip()
            if candidate in already_seen:
                print(f"重复字段: {candidate}")
                continue
            already_seen.add(candidate)
            unique_lines.append(candidate)

    # Rewrite the file with the surviving lines (no trailing newline).
    with open(file_path, 'w') as fh:
        fh.write('\n'.join(unique_lines))

    print("重复字段已删除并文件已更新。")
现在已经得到hotelid了。那么现在我们去读取酒店的其余信息
现在随便点进一个酒店,我们来抓包分析一下里面内容,用刚刚的方法同样我们去得到响应数据。
找到有hotel 地址的响应包,开始写代码,在新的py文件中写(hotelinfo.py)
- # Fetch a hotel's address, picture urls and city name from Ctrip's GraphQL
- # detail endpoint (request exported verbatim from Burp Suite).
- def get_addressApic(hotelId):
- burp0_url = "https://m.ctrip.com:443/restapi/soa2/26187/graphql?_fxpcqlniredt=52271115296426477628"
- burp0_cookies = {"Union": "OUID=&AllianceID=262684&SID=711465&SourceID=55552689", "DUID": "u=B34F49EE0A13E8463539186D23B00A8B&v=0", "GUID": "52271115296426477628"}
- burp0_headers = {"X-Ctx-Locale": "zh-CN", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8447", "X-Ctx-Group": "ctrip", "Content-Type": "application/json", "X-Wx-Openid": "b9787617-697b-4fe6-9942-f5fddd4b8063", "X-Ctx-Personal-Recommend": "1", "Xweb_xhr": "1", "X-Ctx-Region": "CN", "X-Ctx-Currency": "CNY", "Duid": "u=B34F49EE0A13E8463539186D23B00A8B&v=0", "Accept": "*/*", "Sec-Fetch-Site": "cross-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://servicewechat.com/wx0e6ed4f51db9d078/800/page-frame.html", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9"}
- # The GraphQL query string below embeds hotelId; checkIn/checkOut dates are hard-coded from the capture.
- burp0_json={"head": {"auth": "8A38C2D39415D5002658D2670FDD4F8D369AEE5E25A10B5F129505F540395603", "cid": "52271115296426477628", "ctok": "", "cver": "1.1.188", "extension": [{"name": "sdkversion", "value": "3.0.0"}, {"name": "openid", "value": "b9787617-697b-4fe6-9942-f5fddd4b8063"}, {"name": "pageid", "value": "10320654891"}, {"name": "supportWebP", "value": "true"}, {"name": "ubt", "value": "{\"vid\":\"1697008079504.6vx93z\",\"sid\":1,\"pvid\":38,\"ts\":1697011325377,\"create\":1697008079504,\"pid\":\"10320654891\"}"}, {"name": "supportFuzzyPrice", "value": "1"}, {"name": "appId", "value": "wx0e6ed4f51db9d078"}, {"name": "scene", "value": "1007"}], "lang": "01", "sauth": "", "sid": "", "syscode": "30"}, "query": " { hotel(id: "+str(hotelId)+", checkIn: \"2023-10-11\", checkOut: \"2023-10-12\") { getBaseInfo { hotelName hotelEnName zoneName address openYear fitmentYear fuzzyAddressTip commentScore commentDesc commentCount bestCommentSentence isOversea cityId cityName totalPictureCount mgrGroupId hotelCategoryOutlineImages { categoryName pictureList { url urlBody urlExtend } } coordinate { latitude longitude } starInfo { star } topAwardInfo { listSubTitle listUrl awardIconUrl lableId rankId annualListAwardIconUrl annualListTagUrl } } getTrafficDetail(filterValue: \"\") { defaultTrafficText } getDetailTag { starTag { icon } dStarTag { icon } medalTag { icon } primeTag { icon } facilityTags(limit: 3) { title } categoryTag { title } } } } ", "source": "hotel_detail_head"}
- res = requests.post(burp0_url, headers=burp0_headers, cookies=burp0_cookies, json=burp0_json)
- data = res.json()
- print(data)
打印data我们发现
这里有address还有酒店图片。
- # Continuation of get_addressApic: pick the fields out of the GraphQL response.
- address = data['data']['hotel']['getBaseInfo']['address']
- # hotelCategoryOutlineImages is a list of picture categories, each holding a pictureList.
- picList = data['data']['hotel']['getBaseInfo']['hotelCategoryOutlineImages']
- cityName = data['data']['hotel']['getBaseInfo']['cityName']
- picList = [item['pictureList'] for item in picList if 'pictureList' in item]
- # Flatten every category's pictureList into one flat list of url strings.
- pictureUrls = [item['url'] for sublist in picList for item in sublist if 'url' in item]
- return address,pictureUrls,cityName
接下来接着看包,找到一个有酒店名称,电话,酒店开业,装修时间这些信息的包,用同样的方法。
- # Scrape hotel name / phone numbers / opening info from the "selling point"
- # H5 page: the data sits in the <script id="__NEXT_DATA__"> JSON blob that
- # Next.js renders into the page.
- def get_info(hotelId):
- burp0_url = "https://m.ctrip.com:443/webapp/hotels/sellingpoint?hotelid="+str(hotelId)+"&checkin=2023-10-11&checkout=2023-10-12&fromminiapp=weixin&allianceid=262684&sid=711465&sourceid=55552689&_cwxobj=%7B%22cid%22%3A%2252271115296426477628%22%2C%22appid%22%3A%22wx0e6ed4f51db9d078%22%2C%22mpopenid%22%3A%22b9787617-697b-4fe6-9942-f5fddd4b8063%22%2C%22mpunionid%22%3A%22oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%22%2C%22allianceid%22%3A%22262684%22%2C%22sid%22%3A%22711465%22%2C%22ouid%22%3A%22%22%2C%22sourceid%22%3A%2255552689%22%2C%22exmktID%22%3A%22%7B%5C%22openid%5C%22%3A%5C%22b9787617-697b-4fe6-9942-f5fddd4b8063%5C%22%2C%5C%22unionid%5C%22%3A%5C%22oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%5C%22%2C%5C%22channelUpdateTime%5C%22%3A%5C%221697011325536%5C%22%2C%5C%22serverFrom%5C%22%3A%5C%22WAP%2FWECHATAPP%5C%22%2C%5C%22innersid%5C%22%3A%5C%22%5C%22%2C%5C%22innerouid%5C%22%3A%5C%22%5C%22%2C%5C%22pushcode%5C%22%3A%5C%22%5C%22%2C%5C%22txCpsId%5C%22%3A%5C%22%5C%22%2C%5C%22amsPid%5C%22%3A%5C%22%5C%22%2C%5C%22gdt_vid%5C%22%3A%5C%22%5C%22%7D%22%2C%22scene%22%3A1007%2C%22personalRecommendSwitch%22%3Atrue%2C%22localRecommendSwitch%22%3Atrue%2C%22marketSwitch%22%3Atrue%2C%22pLen%22%3A3%7D&_obt=1697012693781"
- burp0_cookies = {"GUID": "52271115296426477628", "nfes_isSupportWebP": "1", "nfes_isSupportWebP": "1", "UBT_VID": "1697008100680.1e1cmmiBfNEU", "librauuid": "", "_RF1": "111.22.74.62", "_RSG": "2Ha6Cu221L4XAzI9a6MT2B", "_RDG": "282a01bdfbf8df2e6720e1b8de10ac1c86", "_RGUID": "4541082e-0c1a-4442-b09d-5869094bd2a1", "login_type": "0", "login_uid": "B34F49EE0A13E8463539186D23B00A8B", "DUID": "u=B34F49EE0A13E8463539186D23B00A8B&v=0", "IsNonUser": "F", "AHeadUserInfo": "VipGrade=10&VipGradeName=%BB%C6%BD%F0%B9%F3%B1%F6&UserName=&NoReadMessageCount=0", "cticket": "8A38C2D39415D5002658D2670FDD4F8D369AEE5E25A10B5F129505F540395603", "_resDomain": "https%3A%2F%2Fbd-s.tripcdn.cn", "_pd": "%7B%22_o%22%3A4%2C%22s%22%3A13%2C%22_s%22%3A0%7D", "MKT_Pagesource": "H5", "Union": "OUID=&AllianceID=262684&SID=711465&SourceID=55552689&AppID=wx0e6ed4f51db9d078&OpenID=b9787617-697b-4fe6-9942-f5fddd4b8063&exmktID={\"openid\":\"b9787617-697b-4fe6-9942-f5fddd4b8063\",\"unionid\":\"oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM\",\"channelUpdateTime\":\"1697011325536\",\"serverFrom\":\"WAP/WECHATAPP\",\"innersid\":\"\",\"innerouid\":\"\",\"pushcode\":\"\",\"txCpsId\":\"\",\"amsPid\":\"\",\"gdt_vid\":\"\"}&createtime=1697012402&Expires=1697617201862", "MKT_OrderClick": 
"ASID=262684711465&AID=262684&CSID=711465&OUID=&CT=1697012401865&CURL=https%3A%2F%2Fm.ctrip.com%2Fwebapp%2Fservicechatv2%2F%3FisHideNavBar%3DYES%26isFreeLogin%3D0%26platform%3Dwechat%26appId%3Dwx0e6ed4f51db9d078%26sceneCode%3D2%26channel%3DEBK%26bizType%3D1356%26isPreSale%3D1%26pageCode%3D10320654891%26thirdPartytoken%3Dae1f21df-6e4b-4473-bcf3-8f19a78ac594%26source%3Dminipro_app%26orderInfo%3D%257B%2522amount%2522%253A%2522%2522%252C%2522bu%2522%253A%2522EBK%2522%252C%2522cid%2522%253A%25220%2522%252C%2522ctype%2522%253A%2522%2522%252C%2522currency%2522%253A%2522%2522%252C%2522supplierId%2522%253A1632483%252C%2522supplierName%2522%253A%2522%25E6%2598%259F%25E8%25BE%25B0%25E9%2585%2592%25E5%25BA%2597(%25E6%25A0%25AA%25E6%25B4%25B2%25E6%25B9%2596%25E5%258D%2597%25E5%25B7%25A5%25E4%25B8%259A%25E5%25A4%25A7%25E5%25AD%25A6%25E5%25BA%2597)%2522%252C%2522title%2522%253A%2522%25E6%2598%259F%25E8%25BE%25B0%25E9%2585%2592%25E5%25BA%2597(%25E6%25A0%25AA%25E6%25B4%25B2%25E6%25B9%2596%25E5%258D%2597%25E5%25B7%25A5%25E4%25B8%259A%25E5%25A4%25A7%25E5%25AD%25A6%25E5%25BA%2597)%2522%257D%26q%3DeyJtaW5lIjowLCJ3ZW1jIwoxfQ%3D%3DTW%26mktopenid%3Db9787617-697b-4fe6-9942-f5fddd4b8063%26fromminiapp%3Dweixin%26allianceid%3D262684%26sid%3D711465%26sourceid%3D55552689%26_cwxobj%3D%257B%2522cid%2522%253A%252252271115296426477628%2522%252C%2522appid%2522%253A%2522wx0e6ed4f51db9d078%2522%252C%2522mpopenid%2522%253A%2522b9787617-697b-4fe6-9942-f5fddd4b8063%2522%252C%2522mpunionid%2522%253A%2522oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%2522%252C%2522allianceid%2522%253A%2522262684%2522%252C%2522sid%2522%253A%2522711465%2522%252C%2522ouid%2522%253A%2522%2522%252C%2522sourceid%2522%253A%252255552689%2522%252C%2522exmktID%2522%253A%2522%257B%255C%2522openid%255C%2522%253A%255C%2522b9787617-697b-4fe6-9942-f5fddd4b8063%255C%2522%252C%255C%2522unionid%255C%2522%253A%255C%2522oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%255C%2522%252C%255C%2522channelUpdateTime%255C%2522%253A%255C%25221697011325536%255C%2522%252C%255C%2522serverFrom
%255C%2522%253A%255C%2522WAP%252FWECHATAPP%255C%2522%252C%255C%2522innersid%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522innerouid%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522pushcode%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522txCpsId%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522amsPid%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522gdt_vid%255C%2522%253A%255C%2522%255C%2522%257D%2522%252C%2522scene%2522%253A1007%252C%2522personalRecommendSwitch%2522%253Atrue%252C%2522localRecommendSwitch%2522%253Atrue%252C%2522marketSwitch%2522%253Atrue%252C%2522pLen%2522%253A3%257D&VAL={\"h5_vid\":\"1697008100680.1e1cmmiBfNEU\"}", "_bfa": "1.1697008100680.1e1cmmiBfNEU.1.1697010109935.1697012401870.1.20.10650084702", "_ubtstatus": "%7B%22vid%22%3A%221697008100680.1e1cmmiBfNEU%22%2C%22sid%22%3A1%2C%22pvid%22%3A20%2C%22pid%22%3A10650084702%7D"}
- # NOTE(review): the cookie dict above was captured from Burp Suite and has been
- # wrapped across lines by the page scrape - re-join it into one line before running.
- burp0_headers = {"Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8447", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Sec-Fetch-Site": "none", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", "Sec-Ch-Ua": "\"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"Windows\"", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9"}
- res = requests.get(burp0_url, headers=burp0_headers, cookies=burp0_cookies)
- # Parse the server-rendered HTML and extract the Next.js state JSON.
- soup = BeautifulSoup(res.text, "html.parser")
- script_tag = soup.find("script", {"id": "__NEXT_DATA__"})
- json_data = json.loads(script_tag.string)
- initialState = json_data["props"]["pageProps"]["initialState"]
- # NOTE(review): this reads hotelEnName (the English name); confirm the Chinese
- # hotelName is not wanted instead.
- hotelName = initialState["hotelEnName"]
- tels = [tel["calTel"] for tel in initialState["contactInfo"]["telInfoList"]]
- # baseInfo is a list of {name, text} entries; fields are picked by their Chinese label.
- openTime = next(info["text"] for info in initialState["baseInfo"] if info["name"] == "开业时间")
- renewTime = next(info["text"] for info in initialState["baseInfo"] if info["name"] == "装修时间")
- roomNums = next(info["text"] for info in initialState["baseInfo"] if info["name"] == "房间数量")
- return hotelName,tels,openTime,renewTime,roomNums
同理得到评论的数据
def get_comments(hotelId, pageIndex):
    """Fetch one page (20 items) of user comments for a hotel.

    Returns a list of comment content strings; an empty list on any
    non-200 response or when the hotel has no comment groups.
    """
    url = "https://m.ctrip.com/restapi/soa2/24626/commentlist"

    headers = {
        "User-Agent": "Your User Agent",  # fill in a real browser UA before running
    }

    params = {
        "hotelId": hotelId,
        "pageIndex": pageIndex,
        "pageSize": 20,  # 20 comments per page
    }

    # Bug fix: added a timeout so one stalled request cannot hang the whole run.
    response = requests.get(url, headers=headers, params=params, timeout=10)

    if response.status_code != 200:
        return []

    data = response.json()
    # Bug fix: the original indexed groupList[0] unconditionally and crashed
    # with IndexError when groupList was empty (hotel with no comments).
    group_list = data.get('groupList') or []
    if not group_list:
        return []
    comment_list = group_list[0].get('commentList', [])
    return [comment['content'] for comment in comment_list]
-
def get_200_comments(hotelId):
    """Collect up to ~200 comments for the hotel (10 pages of 20 each)."""
    collected = []
    max_pages = 10  # 10 pages x 20 comments per page

    for page_no in range(1, max_pages + 1):
        collected.extend(get_comments(hotelId, page_no))
        # Stop early once we have gathered at least 200 comments.
        if len(collected) >= 200:
            break

    return collected
最后我们来调用以上这些函数,写入excel
def saveE(name, data):
    """Write hotel rows into <name>/<name>.xlsx and return the Excel file path.

    A folder called `name` is created (if missing), a workbook with sheet
    "表1" gets the fixed Chinese header on row 1, and each row of `data`
    fills the following rows cell by cell.
    """
    # The folder named after the city holds the workbook (and later the images).
    os.makedirs(name, exist_ok=True)

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "表1"

    # Fixed header row (row 1).
    sheet.append([
        "酒店名称",
        "地址",
        "电话",
        "开业时间",
        "装修时间",
        "客房数",
        "图片",
        "点评",
    ])

    # Data rows start at row 2; one cell per column, in order.
    for row_idx, row in enumerate(data, start=2):
        for col_idx, cell_value in enumerate(row, start=1):
            sheet.cell(row=row_idx, column=col_idx, value=cell_value)

    excel_file_path = os.path.join(name, f"{name}.xlsx")
    workbook.save(filename=excel_file_path)
    return excel_file_path
-
-
def read_hotel_ids_from_file(file_path):
    """Read one integer hotel id per line from file_path; return them as a list."""
    with open(file_path, 'r') as fh:
        return [int(line.strip()) for line in fh]
-
def process_hotel_data(cityid, max_pages):
    """Scrape every hotel of a city into an Excel file and download its images.

    Collects hotel ids page by page, then for each hotel fetches
    name/phones/opening info, address/pictures, and up to 200 comments.
    Rows are written to Excel via saveE() and the images downloaded by
    download_images_and_update_excel().
    """
    # Generate and persist the hotel ids; the txt name doubles as the city name.
    txt_file_name = save_hotel_ids_to_txt(cityid, max_pages)
    city_name = os.path.splitext(txt_file_name)[0]
    # Read the (deduplicated) hotel ids back.
    hotel_ids = read_hotel_ids_from_file(txt_file_name)

    data = []
    errorHotelId = None  # last id attempted, reported when an exception occurs
    for hotelId in hotel_ids:
        try:
            errorHotelId = hotelId
            hotelName, tels, openTime, renewTime, roomNums = get_info(hotelId)
            print(hotelName, tels, openTime, renewTime, roomNums)
            address, pics, cityName = get_addressApic(hotelId)
            print(address, len(pics), "张图片已经获取")
            # Bug fix: the original called get200comments, which is not defined
            # anywhere - the helper is named get_200_comments.
            comments = get_200_comments(hotelId)
            print(len(comments), "条点评已经获取")
            data.append([hotelName, address, ",".join(tels), openTime, renewTime,
                         roomNums, "\n".join(pics), "\n".join(comments)])
        except Exception as e:
            print("发生了一个异常:", str(e))
            print("已经紧急保存excel文件", "报错的酒店id:", errorHotelId)
            # Emergency save so the rows scraped so far are not lost.
            download_images_and_update_excel(saveE(city_name, data))
    # Normal save after all hotels have been processed.
    download_images_and_update_excel(saveE(city_name, data))
这个download_images...函数是从excel中下载图片保存到image文件夹中,写在新的py文件final.py中
- import os
- import pandas as pd
- import requests
- from tqdm import tqdm # 导入 tqdm 库
- from gethotelid import get_hotel_list
-
def download_images_and_update_excel(excel_file_path):
    """Download every image listed in the Excel file into an `image` folder
    next to it, append an 'Image Paths' column, and save *_updated.xlsx.

    The original Excel file is deleted afterwards.

    Returns:
        Path of the updated Excel file.
    """
    # Directory that contains the Excel file; images go into a sibling folder.
    excel_dir = os.path.dirname(excel_file_path)

    df = pd.read_excel(excel_file_path)

    image_dir = os.path.join(excel_dir, "image")
    os.makedirs(image_dir, exist_ok=True)

    # For every hotel row, download the images named in the 7th column.
    for index, row in df.iterrows():
        hotel_name = row.iloc[0]  # column 0: hotel name
        raw_urls = row.iloc[6]    # column 6: newline-separated image urls
        # Bug fix: a hotel with no pictures leaves this cell empty, which
        # pandas reads back as float NaN - row[6].split('\n') then raised
        # AttributeError and aborted the whole run. Guard first.
        if not isinstance(raw_urls, str) or not raw_urls.strip():
            df.at[index, 'Image Paths'] = ''
            continue
        image_urls = raw_urls.split('\n')

        image_paths = []
        for i, image_url in enumerate(tqdm(image_urls, desc=f"Downloading {hotel_name} images")):
            try:
                # Bug fix: no timeout meant a single stalled download hung forever.
                response = requests.get(image_url, stream=True, timeout=30)
            except requests.RequestException:
                print(f"Failed to download image {i + 1} for {hotel_name} ({image_url})")
                continue
            if response.status_code == 200:
                # Name the file 酒店名_编号.<original extension>.
                filename = image_url.split("/")[-1]
                new_filename = f'{hotel_name}_{i + 1}{os.path.splitext(filename)[1]}'
                file_path = os.path.join(image_dir, new_filename)
                image_paths.append(file_path)
                with open(file_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            file.write(chunk)
                print(f"Downloaded image {i + 1}/{len(image_urls)}")
            else:
                print(f"Failed to download image {i + 1} for {hotel_name} ({image_url})")

        # Record where the row's images were stored.
        df.at[index, 'Image Paths'] = '\n'.join(image_paths)

    # Save the augmented sheet next to the original, then drop the original.
    updated_excel_file_path = os.path.join(
        excel_dir,
        os.path.splitext(os.path.basename(excel_file_path))[0] + '_updated.xlsx')
    df.to_excel(updated_excel_file_path, index=False)
    os.remove(excel_file_path)
    print("Images downloaded and Excel file updated successfully.")
    return updated_excel_file_path
最后在main.py运行总程序
# main.py - interactive driver: keep scraping cities until the user quits.
from hotelinfo import process_hotel_data

while True:
    # Bug fix: a non-numeric entry used to crash the whole program with
    # ValueError; re-prompt instead.
    try:
        city_id = int(input("输入city_id:"))          # Ctrip numeric city id
        max_pages = int(input("输入最大采取页面:"))    # max list pages to fetch
    except ValueError:
        print("请输入数字")
        continue

    process_hotel_data(city_id, max_pages)

    # Bug fix: only a lowercase 'n' stopped the loop; accept 'N' and stray spaces too.
    done = input("是否继续?(y/n)")
    if done.strip().lower() == 'n':
        break
-
-
-
输入城市id和你想要爬取的页数即可
谢谢观看
有疑问请留言
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。