当前位置:   article > 正文

爬取携程酒店信息

爬取携程酒店信息

目标:

爬取携程酒店的信息:将酒店名称,地址,电话,开业时间,客房数,图片链接,点评,图片保存路径生成excel,再将图片保存在对应酒店的image文件夹中

开始动手:

第一步:

需要先对数据进行抓包:

可以用Burp Suite或者Fiddler进行抓包。(Burp Suite下载教程可以参考此链接:BurpSuite全平台破解通用-至今可用 - SaberCC Blog)

那么现在打开Burp Suite,在携程搜索框输入一个城市点击搜索,再看看Burp Suite,找到有全部酒店id的包,发现请求数据中有个pageIndex,这个就是说你将页面往下滑时会加载出来更多的酒店,那么这个pageIndex可以作为参数,同样还有城市参数(cityId)。开始写得到hotelId的代码。写一下请求数据,得到响应数据,先创建一个gethotelid.py

  1. #gethotelid.py
  2. from 检查是否有重复字段 import remove_duplicate_lines
  3. import requests
  4. def get_hotel_list(pageIndex,cityid):
  5. burp0_url = "https://m.ctrip.com:443/restapi/soa2/22370/gethotellist?_fxpcqlniredt=52271165296426527463"
  6. burp0_cookies = {"Union": "OUID=mini1053&AllianceID=1314167&SID=4258862&SourceID=55555549", "DUID": "u=656D90DBCBB453190FC35AE0E82113CC&v=0", "GUID": "52271165296426527463"}
  7. burp0_headers = {"X-Ctx-Locale": "zh-CN", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8447", "X-Ctx-Group": "ctrip", "Content-Type": "application/json", "X-Wx-Openid": "33d6ec3b-d887-4cf8-8fc4-c29c77f47342", "X-Ctx-Personal-Recommend": "1", "Xweb_xhr": "1", "X-Ctx-Region": "CN", "X-Ctx-Currency": "CNY", "Duid": "u=656D90DBCBB453190FC35AE0E82113CC&v=0", "Accept": "*/*", "Sec-Fetch-Site": "cross-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://servicewechat.com/wx0e6ed4f51db9d078/800/page-frame.html", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9"}
  8. burp0_json={"channel": 1, "checkinDate": "2023-10-18", "checkoutDate": "2023-10-26", "cityId": cityid, "districtId": 0, "filterInfo": {"filterItemList": [], "highestPrice": 0, "keyword": "", "lowestPrice": 0, "starItemList": []}, "head": {"auth": "1369D5C2A3508C1F5B13853ACC15CAEE9FB5FC99929FEB6DA28D19CC13AC3AEB", "cid": "52271165296426527463", "ctok": "", "cver": "1.1.188", "extension": [{"name": "sdkversion", "value": "3.0.0"}, {"name": "openid", "value": "33d6ec3b-d887-4cf8-8fc4-c29c77f47342"}, {"name": "pageid", "value": "10650012159"}, {"name": "supportWebP", "value": "true"}, {"name": "ubt", "value": "{\"vid\":\"1697027931003.8mu33n\",\"sid\":2,\"pvid\":18,\"ts\":1697030756781,\"create\":1697027931003,\"pid\":\"10320613574\"}"}, {"name": "supportFuzzyPrice", "value": "1"}, {"name": "appId", "value": "wx0e6ed4f51db9d078"}, {"name": "scene", "value": "1053"}], "lang": "01", "sauth": "", "sid": "", "syscode": "30"}, "hiddenHotelIds": None, "isHourRoomSearch": False, "isMorning": 0, "nearbySearch": 0, "pageCode": "hotel_miniprogram_list", "pageIndex":pageIndex, "pageSize": 10, "preCount": 0, "preHotelIds": "", "session": {"key": "6bf9afb8c7f191d9e9db936b511103b1386625ab2ab18cf064b7427af73e9077", "sessionKey": "6720161d-0cb7-4dd0-be39-df37110067d0"}, "sessionId": "", "sourceFromTag": "inquire_preload", "topHotelIds": [], "userCoordinate": None}
  9. res = requests.post(burp0_url, headers=burp0_headers, cookies=burp0_cookies, json=burp0_json)
  10. data = res.json()

那么我们可以调用函数把这个data给打印一下,分析出这个hotelid在哪里

在控制台看着比较乱,我们可以用json格式化网站(在线JSON校验格式化工具(Be JSON))帮助我们快速找到这个hotelId的位置,可以发现hotelInfoList是个列表,列表里包含字典,每个字典包含每个酒店信息,hotelId就在这个字典里。那么我可以用循环取得每个id。

  1. #gethotelid.py
  2. # print(data)
  3. cityName = data['hotelInfoList'][0]['cityName']
  4. # print(cityName)
  5. hotel_ids = []
  6. for hotel_info in data['hotelInfoList']:
  7. hotel_id = hotel_info['hotelId']
  8. hotel_ids.append(hotel_id)
  9. return hotel_ids, cityName

这个cityName拿过来方便用于创建txt名字

然后得到hotelid之后我们可以把它写入txt文件

  1. #gethotelid.py
  2. def save_hotel_ids_to_txt(cityid, max_pages):
  3. all_hotel_ids = []
  4. try:
  5. for page_index in range(1, max_pages + 1):
  6. hotel_ids, cityName = get_hotel_list(page_index, cityid)
  7. print(f"Page {page_index} Hotel Info: {hotel_ids}")
  8. all_hotel_ids.extend(hotel_ids)
  9. except Exception as e:
  10. print("读取完成")
  11. # 将所有的 hotel_ids 写入 txt 文件,以 cityName 命名
  12. txt_file_name = f'{cityName}.txt'
  13. with open(txt_file_name, 'w') as file:
  14. for hotel_id in all_hotel_ids:
  15. file.write(str(hotel_id) + '\n')
  16. file_path = txt_file_name # 使用新生成的 txt 文件的路径
  17. remove_duplicate_lines(file_path)
  18. print(f"Hotel IDs saved to {txt_file_name}")
  19. return txt_file_name

这串代码中的remove_duplicate_lines函数的添加是因为我在测试中发现生成的txt文件可能会有一些hotelid重复,于是弄了这个函数来检查txt文件,这个函数是从另外一个叫做“检查是否有重复字段.py”文件中导入的

  1. #检查是否有重复字段.py
  2. def remove_duplicate_lines(file_path):
  3. lines = []
  4. seen = set()
  5. with open(file_path, 'r') as file:
  6. for line in file:
  7. line = line.strip()
  8. if line not in seen:
  9. seen.add(line)
  10. lines.append(line)
  11. else:
  12. print(f"重复字段: {line}")
  13. with open(file_path, 'w') as file:
  14. file.write('\n'.join(lines))
  15. print("重复字段已删除并文件已更新。")

现在已经得到hotelId了。那么现在我们去读取酒店的其余信息。

现在随便点进一个酒店,我们来抓包分析一下里面内容,用刚刚的方法同样我们去得到响应数据。

找到有hotel 地址的响应包,开始写代码,在新的py文件中写(hotelinfo.py)

  1. def get_addressApic(hotelId):
  2. burp0_url = "https://m.ctrip.com:443/restapi/soa2/26187/graphql?_fxpcqlniredt=52271115296426477628"
  3. burp0_cookies = {"Union": "OUID=&AllianceID=262684&SID=711465&SourceID=55552689", "DUID": "u=B34F49EE0A13E8463539186D23B00A8B&v=0", "GUID": "52271115296426477628"}
  4. burp0_headers = {"X-Ctx-Locale": "zh-CN", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8447", "X-Ctx-Group": "ctrip", "Content-Type": "application/json", "X-Wx-Openid": "b9787617-697b-4fe6-9942-f5fddd4b8063", "X-Ctx-Personal-Recommend": "1", "Xweb_xhr": "1", "X-Ctx-Region": "CN", "X-Ctx-Currency": "CNY", "Duid": "u=B34F49EE0A13E8463539186D23B00A8B&v=0", "Accept": "*/*", "Sec-Fetch-Site": "cross-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://servicewechat.com/wx0e6ed4f51db9d078/800/page-frame.html", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9"}
  5. burp0_json={"head": {"auth": "8A38C2D39415D5002658D2670FDD4F8D369AEE5E25A10B5F129505F540395603", "cid": "52271115296426477628", "ctok": "", "cver": "1.1.188", "extension": [{"name": "sdkversion", "value": "3.0.0"}, {"name": "openid", "value": "b9787617-697b-4fe6-9942-f5fddd4b8063"}, {"name": "pageid", "value": "10320654891"}, {"name": "supportWebP", "value": "true"}, {"name": "ubt", "value": "{\"vid\":\"1697008079504.6vx93z\",\"sid\":1,\"pvid\":38,\"ts\":1697011325377,\"create\":1697008079504,\"pid\":\"10320654891\"}"}, {"name": "supportFuzzyPrice", "value": "1"}, {"name": "appId", "value": "wx0e6ed4f51db9d078"}, {"name": "scene", "value": "1007"}], "lang": "01", "sauth": "", "sid": "", "syscode": "30"}, "query": " { hotel(id: "+str(hotelId)+", checkIn: \"2023-10-11\", checkOut: \"2023-10-12\") { getBaseInfo { hotelName hotelEnName zoneName address openYear fitmentYear fuzzyAddressTip commentScore commentDesc commentCount bestCommentSentence isOversea cityId cityName totalPictureCount mgrGroupId hotelCategoryOutlineImages { categoryName pictureList { url urlBody urlExtend } } coordinate { latitude longitude } starInfo { star } topAwardInfo { listSubTitle listUrl awardIconUrl lableId rankId annualListAwardIconUrl annualListTagUrl } } getTrafficDetail(filterValue: \"\") { defaultTrafficText } getDetailTag { starTag { icon } dStarTag { icon } medalTag { icon } primeTag { icon } facilityTags(limit: 3) { title } categoryTag { title } } } } ", "source": "hotel_detail_head"}
  6. res = requests.post(burp0_url, headers=burp0_headers, cookies=burp0_cookies, json=burp0_json)
  7. data = res.json()
  8. print(data)

打印data我们发现

这里有address还有酒店图片。

  1. address = data['data']['hotel']['getBaseInfo']['address']
  2. picList = data['data']['hotel']['getBaseInfo']['hotelCategoryOutlineImages']
  3. cityName = data['data']['hotel']['getBaseInfo']['cityName']
  4. picList = [item['pictureList'] for item in picList if 'pictureList' in item]
  5. pictureUrls = [item['url'] for sublist in picList for item in sublist if 'url' in item]
  6. return address,pictureUrls,cityName

接下来接着看包,找到一个有酒店名称,电话,酒店开业,装修时间这些信息的包,用同样的方法。

  1. def get_info(hotelId):
  2. burp0_url = "https://m.ctrip.com:443/webapp/hotels/sellingpoint?hotelid="+str(hotelId)+"&checkin=2023-10-11&checkout=2023-10-12&fromminiapp=weixin&allianceid=262684&sid=711465&sourceid=55552689&_cwxobj=%7B%22cid%22%3A%2252271115296426477628%22%2C%22appid%22%3A%22wx0e6ed4f51db9d078%22%2C%22mpopenid%22%3A%22b9787617-697b-4fe6-9942-f5fddd4b8063%22%2C%22mpunionid%22%3A%22oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%22%2C%22allianceid%22%3A%22262684%22%2C%22sid%22%3A%22711465%22%2C%22ouid%22%3A%22%22%2C%22sourceid%22%3A%2255552689%22%2C%22exmktID%22%3A%22%7B%5C%22openid%5C%22%3A%5C%22b9787617-697b-4fe6-9942-f5fddd4b8063%5C%22%2C%5C%22unionid%5C%22%3A%5C%22oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%5C%22%2C%5C%22channelUpdateTime%5C%22%3A%5C%221697011325536%5C%22%2C%5C%22serverFrom%5C%22%3A%5C%22WAP%2FWECHATAPP%5C%22%2C%5C%22innersid%5C%22%3A%5C%22%5C%22%2C%5C%22innerouid%5C%22%3A%5C%22%5C%22%2C%5C%22pushcode%5C%22%3A%5C%22%5C%22%2C%5C%22txCpsId%5C%22%3A%5C%22%5C%22%2C%5C%22amsPid%5C%22%3A%5C%22%5C%22%2C%5C%22gdt_vid%5C%22%3A%5C%22%5C%22%7D%22%2C%22scene%22%3A1007%2C%22personalRecommendSwitch%22%3Atrue%2C%22localRecommendSwitch%22%3Atrue%2C%22marketSwitch%22%3Atrue%2C%22pLen%22%3A3%7D&_obt=1697012693781"
  3. burp0_cookies = {"GUID": "52271115296426477628", "nfes_isSupportWebP": "1", "nfes_isSupportWebP": "1", "UBT_VID": "1697008100680.1e1cmmiBfNEU", "librauuid": "", "_RF1": "111.22.74.62", "_RSG": "2Ha6Cu221L4XAzI9a6MT2B", "_RDG": "282a01bdfbf8df2e6720e1b8de10ac1c86", "_RGUID": "4541082e-0c1a-4442-b09d-5869094bd2a1", "login_type": "0", "login_uid": "B34F49EE0A13E8463539186D23B00A8B", "DUID": "u=B34F49EE0A13E8463539186D23B00A8B&v=0", "IsNonUser": "F", "AHeadUserInfo": "VipGrade=10&VipGradeName=%BB%C6%BD%F0%B9%F3%B1%F6&UserName=&NoReadMessageCount=0", "cticket": "8A38C2D39415D5002658D2670FDD4F8D369AEE5E25A10B5F129505F540395603", "_resDomain": "https%3A%2F%2Fbd-s.tripcdn.cn", "_pd": "%7B%22_o%22%3A4%2C%22s%22%3A13%2C%22_s%22%3A0%7D", "MKT_Pagesource": "H5", "Union": "OUID=&AllianceID=262684&SID=711465&SourceID=55552689&AppID=wx0e6ed4f51db9d078&OpenID=b9787617-697b-4fe6-9942-f5fddd4b8063&exmktID={\"openid\":\"b9787617-697b-4fe6-9942-f5fddd4b8063\",\"unionid\":\"oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM\",\"channelUpdateTime\":\"1697011325536\",\"serverFrom\":\"WAP/WECHATAPP\",\"innersid\":\"\",\"innerouid\":\"\",\"pushcode\":\"\",\"txCpsId\":\"\",\"amsPid\":\"\",\"gdt_vid\":\"\"}&createtime=1697012402&Expires=1697617201862", "MKT_OrderClick": 
"ASID=262684711465&AID=262684&CSID=711465&OUID=&CT=1697012401865&CURL=https%3A%2F%2Fm.ctrip.com%2Fwebapp%2Fservicechatv2%2F%3FisHideNavBar%3DYES%26isFreeLogin%3D0%26platform%3Dwechat%26appId%3Dwx0e6ed4f51db9d078%26sceneCode%3D2%26channel%3DEBK%26bizType%3D1356%26isPreSale%3D1%26pageCode%3D10320654891%26thirdPartytoken%3Dae1f21df-6e4b-4473-bcf3-8f19a78ac594%26source%3Dminipro_app%26orderInfo%3D%257B%2522amount%2522%253A%2522%2522%252C%2522bu%2522%253A%2522EBK%2522%252C%2522cid%2522%253A%25220%2522%252C%2522ctype%2522%253A%2522%2522%252C%2522currency%2522%253A%2522%2522%252C%2522supplierId%2522%253A1632483%252C%2522supplierName%2522%253A%2522%25E6%2598%259F%25E8%25BE%25B0%25E9%2585%2592%25E5%25BA%2597(%25E6%25A0%25AA%25E6%25B4%25B2%25E6%25B9%2596%25E5%258D%2597%25E5%25B7%25A5%25E4%25B8%259A%25E5%25A4%25A7%25E5%25AD%25A6%25E5%25BA%2597)%2522%252C%2522title%2522%253A%2522%25E6%2598%259F%25E8%25BE%25B0%25E9%2585%2592%25E5%25BA%2597(%25E6%25A0%25AA%25E6%25B4%25B2%25E6%25B9%2596%25E5%258D%2597%25E5%25B7%25A5%25E4%25B8%259A%25E5%25A4%25A7%25E5%25AD%25A6%25E5%25BA%2597)%2522%257D%26q%3DeyJtaW5lIjowLCJ3ZW1jIwoxfQ%3D%3DTW%26mktopenid%3Db9787617-697b-4fe6-9942-f5fddd4b8063%26fromminiapp%3Dweixin%26allianceid%3D262684%26sid%3D711465%26sourceid%3D55552689%26_cwxobj%3D%257B%2522cid%2522%253A%252252271115296426477628%2522%252C%2522appid%2522%253A%2522wx0e6ed4f51db9d078%2522%252C%2522mpopenid%2522%253A%2522b9787617-697b-4fe6-9942-f5fddd4b8063%2522%252C%2522mpunionid%2522%253A%2522oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%2522%252C%2522allianceid%2522%253A%2522262684%2522%252C%2522sid%2522%253A%2522711465%2522%252C%2522ouid%2522%253A%2522%2522%252C%2522sourceid%2522%253A%252255552689%2522%252C%2522exmktID%2522%253A%2522%257B%255C%2522openid%255C%2522%253A%255C%2522b9787617-697b-4fe6-9942-f5fddd4b8063%255C%2522%252C%255C%2522unionid%255C%2522%253A%255C%2522oHkqHt8Zg6-uBtF-Y2UBUVVk1MlM%255C%2522%252C%255C%2522channelUpdateTime%255C%2522%253A%255C%25221697011325536%255C%2522%252C%255C%2522serverFrom
%255C%2522%253A%255C%2522WAP%252FWECHATAPP%255C%2522%252C%255C%2522innersid%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522innerouid%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522pushcode%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522txCpsId%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522amsPid%255C%2522%253A%255C%2522%255C%2522%252C%255C%2522gdt_vid%255C%2522%253A%255C%2522%255C%2522%257D%2522%252C%2522scene%2522%253A1007%252C%2522personalRecommendSwitch%2522%253Atrue%252C%2522localRecommendSwitch%2522%253Atrue%252C%2522marketSwitch%2522%253Atrue%252C%2522pLen%2522%253A3%257D&VAL={\"h5_vid\":\"1697008100680.1e1cmmiBfNEU\"}", "_bfa": "1.1697008100680.1e1cmmiBfNEU.1.1697010109935.1697012401870.1.20.10650084702", "_ubtstatus": "%7B%22vid%22%3A%221697008100680.1e1cmmiBfNEU%22%2C%22sid%22%3A1%2C%22pvid%22%3A20%2C%22pid%22%3A10650084702%7D"}
  4. burp0_headers = {"Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8447", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Sec-Fetch-Site": "none", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", "Sec-Ch-Ua": "\"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"Windows\"", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9"}
  5. res = requests.get(burp0_url, headers=burp0_headers, cookies=burp0_cookies)
  6. soup = BeautifulSoup(res.text, "html.parser")
  7. script_tag = soup.find("script", {"id": "__NEXT_DATA__"})
  8. json_data = json.loads(script_tag.string)
  9. initialState = json_data["props"]["pageProps"]["initialState"]
  10. hotelName = initialState["hotelEnName"]
  11. tels = [tel["calTel"] for tel in initialState["contactInfo"]["telInfoList"]]
  12. openTime = next(info["text"] for info in initialState["baseInfo"] if info["name"] == "开业时间")
  13. renewTime = next(info["text"] for info in initialState["baseInfo"] if info["name"] == "装修时间")
  14. roomNums = next(info["text"] for info in initialState["baseInfo"] if info["name"] == "房间数量")
  15. return hotelName,tels,openTime,renewTime,roomNums

同理得到评论的数据

  1. def get_comments(hotelId, pageIndex):
  2. url = "https://m.ctrip.com/restapi/soa2/24626/commentlist"
  3. headers = {
  4. "User-Agent": "Your User Agent",
  5. }
  6. params = {
  7. "hotelId": hotelId,
  8. "pageIndex": pageIndex,
  9. "pageSize": 20, # 每页20条评论
  10. }
  11. response = requests.get(url, headers=headers, params=params)
  12. if response.status_code == 200:
  13. data = response.json()
  14. comment_list = data.get('groupList', [])[0].get('commentList', [])
  15. comments = [comment['content'] for comment in comment_list]
  16. return comments
  17. else:
  18. return []
  19. def get_200_comments(hotelId):
  20. comments = []
  21. total_pages = 10 # 获取10页的评论,每页20条
  22. for page in range(1, total_pages + 1):
  23. comment_list = get_comments(hotelId, page)
  24. comments.extend(comment_list)
  25. if len(comments) >= 200:
  26. break
  27. return comments

最后我们来调用以上这些函数,写入excel

  1. def saveE(name, data):
  2. # 创建一个新的文件夹
  3. folder_name = name # 文件夹名称与传入的 name 参数一致
  4. os.makedirs(folder_name, exist_ok=True)
  5. # 创建一个新的Excel文件
  6. workbook = Workbook()
  7. # 创建第一个表
  8. sheet1 = workbook.active
  9. sheet1.title = "表1"
  10. # 设置第一个表的表头标题栏
  11. headers = [
  12. "酒店名称",
  13. "地址",
  14. "电话",
  15. "开业时间",
  16. "装修时间",
  17. "客房数",
  18. "图片",
  19. "点评"
  20. ]
  21. sheet1.append(headers)
  22. def add1(x, y, value):
  23. # 在第一个表的表头x列的第y个数据赋值为value (表头算入其中,表头是y=1的位置)
  24. sheet1.cell(row=y, column=x, value=value)
  25. y = 2
  26. for v in data:
  27. x = 1
  28. for vv in v:
  29. add1(x, y, vv)
  30. x += 1
  31. y += 1
  32. # 保存Excel文件
  33. excel_file_path = os.path.join(folder_name, f"{name}.xlsx")
  34. workbook.save(filename=excel_file_path)
  35. return excel_file_path
  36. def read_hotel_ids_from_file(file_path):
  37. hotel_ids = []
  38. with open(file_path, 'r') as file:
  39. for line in file:
  40. hotel_id = int(line.strip())
  41. hotel_ids.append(hotel_id)
  42. return hotel_ids
  43. def process_hotel_data(cityid, max_pages):
  44. # 生成和保存酒店ID到txt文件
  45. txt_file_name = save_hotel_ids_to_txt(cityid, max_pages)
  46. city_name = os.path.splitext(txt_file_name)[0]
  47. # 读取酒店ID
  48. hotel_ids = read_hotel_ids_from_file(txt_file_name)
  49. # 处理酒店信息
  50. data = []
  51. for hotelId in hotel_ids:
  52. try:
  53. errorHotelId = hotelId
  54. hotelName, tels, openTime, renewTime, roomNums = get_info(hotelId)
  55. print(hotelName, tels, openTime, renewTime, roomNums)
  56. address, pics,cityName = get_addressApic(hotelId)
  57. print(address, len(pics), "张图片已经获取")
  58. comments = get200comments(hotelId)
  59. print(len(comments), "条点评已经获取")
  60. data.append([hotelName, address, ",".join(tels), openTime, renewTime, roomNums, "\n".join(pics), "\n".join(comments)])
  61. except Exception as e:
  62. print("发生了一个异常:", str(e))
  63. print("已经紧急保存excel文件", "报错的酒店id:", errorHotelId)
  64. download_images_and_update_excel(saveE(city_name, data))
  65. download_images_and_update_excel(saveE(city_name, data))

这个download_images...函数是从excel中下载图片保存到image文件夹中,写在新的py文件final.py中

  1. import os
  2. import pandas as pd
  3. import requests
  4. from tqdm import tqdm # 导入 tqdm 库
  5. from gethotelid import get_hotel_list
  6. def download_images_and_update_excel(excel_file_path):
  7. # 获取 Excel 文件的目录
  8. excel_dir = os.path.dirname(excel_file_path)
  9. # 读取Excel文件
  10. df = pd.read_excel(excel_file_path)
  11. # 创建 image 文件夹
  12. image_dir = os.path.join(excel_dir, "image")
  13. os.makedirs(image_dir, exist_ok=True)
  14. # 遍历每一行,根据酒店名称和图片链接下载图片
  15. for index, row in df.iterrows():
  16. hotel_name = row[0] # 第一列为酒店名称
  17. image_urls = row[6].split('\n') # 第七列为图片链接,按换行符分割成多个链接
  18. # 遍历图片链接并下载图片
  19. image_paths = []
  20. for i, image_url in enumerate(tqdm(image_urls, desc=f"Downloading {hotel_name} images")):
  21. response = requests.get(image_url, stream=True)
  22. if response.status_code == 200:
  23. # 从链接中提取文件名
  24. filename = image_url.split("/")[-1]
  25. # 构建新文件名
  26. new_filename = f'{hotel_name}_{i + 1}{os.path.splitext(filename)[1]}' # 重命名为酒店名_编号.扩展名
  27. # 构建文件路径
  28. file_path = os.path.join(image_dir, new_filename)
  29. image_paths.append(file_path) # 保存文件路径
  30. # 保存图片
  31. with open(file_path, 'wb') as file:
  32. for chunk in response.iter_content(chunk_size=1024):
  33. if chunk:
  34. file.write(chunk)
  35. print(f"Downloaded image {i + 1}/{len(image_urls)}")
  36. else:
  37. print(f"Failed to download image {i + 1} for {hotel_name} ({image_url})")
  38. # 将文件路径添加到新的一列
  39. df.at[index, 'Image Paths'] = '\n'.join(image_paths)
  40. # 保存更新后的Excel文件在同一个目录下
  41. updated_excel_file_path = os.path.join(excel_dir, os.path.splitext(os.path.basename(excel_file_path))[0] + '_updated.xlsx')
  42. df.to_excel(updated_excel_file_path, index=False)
  43. # 删除原有的 Excel 文件
  44. os.remove(excel_file_path)
  45. print("Images downloaded and Excel file updated successfully.")
  46. return updated_excel_file_path

最后在main.py运行总程序

  1. from hotelinfo import process_hotel_data
  2. while True:
  3. city_id = int(input("输入city_id:")) # 替换成你的城市ID
  4. max_pages = int(input("输入最大采取页面:")) # 替换成你需要的最大页面数
  5. # 调用处理酒店数据的函数
  6. process_hotel_data(city_id, max_pages)
  7. done = input("是否继续?(y/n)")
  8. if done == 'n':
  9. break

输入城市id和你想要爬取的页数即可

结语

谢谢观看

有疑问请留言

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/笔触狂放9/article/detail/379836
推荐阅读
相关标签
  

闽ICP备14008679号