当前位置:   article > 正文





这里我们使用命令 pip install selenium 进行安装。安装过程可能会比较慢,可以通过指定镜像源来加速,命令如下:

pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple









Microsoft Edge WebDriver - Microsoft Edge Developer




  1. from selenium import webdriver
  2. from lxml import html
  3. import time
  4. import re
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver import ChromeOptions
  7. import json











  # Open the hotel listing in Chrome and save the session cookies to cookie.txt.
  option = ChromeOptions()
  # Exclude the "enable-automation" switch (intended to make the automated
  # browser harder for the site to detect).
  option.add_experimental_option('excludeSwitches', ['enable-automation'])
  # Set the browser language / charset argument.
  option.add_argument('lang=zh_CN.UTF-8')
  browser = webdriver.Chrome(options=option)
  try:
      browser.get('https://hotel.qunar.com/cn/fuzhou_fujian?fromDate=2023-04-15&toDate=2023-04-16&cityName=%E7%A6%8F%E5%B7%9E')
      # 30-second pause before the cookies are read
      # (presumably the window for logging in by hand - confirm).
      time.sleep(30)
      dictCookies = browser.get_cookies()  # list of cookie dicts
      jsonCookies = json.dumps(dictCookies)  # serialise to a JSON string
      # Explicit encoding so the file matches the utf8 reader in crack_permissions.
      with open('cookie.txt', 'w', encoding='utf8') as f:
          f.write(jsonCookies)
      print('cookies保存成功!')
  finally:
      # Always shut the browser and its driver process down, even on failure.
      browser.quit()




  def crack_permissions(browser, cookie_path='cookie.txt', load_wait=5, refresh_wait=2):
      """Replay previously saved cookies into *browser* and reload the page.

      Args:
          browser: WebDriver-like object providing ``add_cookie`` and ``refresh``.
              The caller must already have opened a page before calling this.
          cookie_path: JSON file holding a list of cookie dicts.
          load_wait: Seconds to sleep before touching the browser.
          refresh_wait: Seconds to sleep after the refresh.

      Raises:
          OSError: If *cookie_path* cannot be opened.
          json.JSONDecodeError: If the file does not contain valid JSON.
      """
      # Give a slow page time to finish loading.
      time.sleep(load_wait)
      with open(cookie_path, 'r', encoding='utf8') as f:
          saved_cookies = json.load(f)
      for cookie in saved_cookies:
          # Only name and value are taken from the file; the other fields are
          # fixed, using the WebDriver spellings ('secure', 'httpOnly').
          browser.add_cookie({
              'domain': '.qunar.com',
              'name': cookie.get('name'),
              'value': cookie.get('value'),
              'path': '/',
              'httpOnly': False,
              'secure': False,
          })
      # Reload so the page is requested again with the injected cookies.
      browser.refresh()
      time.sleep(refresh_wait)



  # Jump to the bottom of the page three times, pausing one second after each jump.
  for _ in range(3):
      browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
      time.sleep(1)



  # lxml's etree module, used to build a tree that can be queried with XPath.
  etree = html.etree
  # HTML of the page as the browser currently has it.
  resp = browser.page_source
  xml = etree.HTML(resp)


  # Matches the "共N条评论" review-count label and captures N.
  _COMMENT_COUNT_RE = re.compile(r'共(.*?)条评论')


  def _first_text(tree, path, default=''):
      """Return the first result of XPath *path* on *tree*, or *default* if none."""
      found = tree.xpath(path)
      return found[0] if found else default


  def _to_number(text, cast, default):
      """Convert *text* with *cast*, returning *default* when it is missing or not numeric."""
      try:
          return cast(text)
      except (TypeError, ValueError):
          return default


  for k in range(1, 21):
      # Every field of the k-th hotel card is looked up below this list item.
      item = f'//*[@id="hotel_lst_body"]/li[{k}]/div'
      info = f'{item}/div[3]'

      # Hotel name.
      mess_dict['name'] = _first_text(xml, f'{info}/p[1]/a/text()')
      # Price; 0 when absent or not an integer.
      mess_dict['price'] = _to_number(_first_text(xml, f'{item}/div[2]/p[1]/a/text()', None), int, 0)
      # Category label, e.g. comfort / upscale.
      mess_dict['dangciText'] = _first_text(xml, f'{info}/p[1]/span[2]/text()')
      # Rating; 0.0 when absent or not a number.
      mess_dict['score'] = _to_number(_first_text(xml, f'{info}/p[2]/span[1]/text()', None), float, 0.0)

      # span[2] normally holds the overall review text, but on some cards it
      # holds the review-count label instead, so it has to be checked.
      second_span = _first_text(xml, f'{info}/p[2]/span[2]/text()', None)
      third_span = _first_text(xml, f'{info}/p[2]/span[3]/text()', None)
      if second_span is None or _COMMENT_COUNT_RE.search(second_span):
          mess_dict['commentDesc'] = ''
      else:
          mess_dict['commentDesc'] = second_span

      # Review count: read span[3], falling back to span[2] when span[3] is missing.
      count_text = third_span if third_span is not None else second_span
      count_match = _COMMENT_COUNT_RE.search(count_text) if count_text is not None else None
      mess_dict['commentCount'] = _to_number(count_match.group(1), int, 0) if count_match else 0

      # Rough location of the hotel.
      mess_dict['locationInfo'] = _first_text(xml, f'{info}/p[3]/text()')



  def Connect_Sql(data_name: str):
      """Open a PyMySQL connection to the MySQL server on localhost:3306.

      Args:
          data_name: Name of the database to select.

      Returns:
          The connection object returned by ``pymysql.connect``; the caller is
          responsible for closing it.
      """
      # Imported locally so the helper does not depend on an import elsewhere.
      import pymysql

      # NOTE(review): credentials are hard-coded; move them to configuration
      # before using this outside a local test database.
      return pymysql.connect(
          host='localhost',
          user='root',
          password='root',
          # 'database' is the current parameter name; 'db' is a deprecated alias.
          database=data_name,
          port=3306,
          # Explicit charset so Chinese text does not depend on the
          # library or server default.
          charset='utf8mb4',
      )
  def save_data_sql(data):
      """Insert one row into the ``hotel_mess`` table of the ``ptu`` database.

      Best-effort: failures are reported on stdout and never raised.

      Args:
          data: Sequence of eight values, one per placeholder in the INSERT.
      """
      from contextlib import closing

      sql = "insert into hotel_mess values (%s,%s,%s,%s,%s,%s,%s,%s)"
      try:
          # closing() guarantees cursor and connection are released even when
          # execute() or commit() raises.
          with closing(Connect_Sql('ptu')) as conn, closing(conn.cursor()) as cursor:
              try:
                  cursor.execute(sql, data)
              except Exception:
                  # The row could not be inserted; report it and carry on.
                  print("缺失")
              conn.commit()
      except Exception:
          # Connecting, committing or closing failed.
          print("失败!")




  1. from selenium import webdriver
  2. from lxml import html
  3. import time
  4. import re
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver import ChromeOptions
  7. import json
  8. from save_data import save_data_sql
  9. import datetime
  # Open the hotel listing in Chrome and save the session cookies to cookie2.txt.
  option = ChromeOptions()
  # Exclude the "enable-automation" switch (intended to make the automated
  # browser harder for the site to detect).
  option.add_experimental_option('excludeSwitches', ['enable-automation'])
  # Set the browser language / charset argument.
  option.add_argument('lang=zh_CN.UTF-8')
  browser = webdriver.Chrome(options=option)
  try:
      browser.get('https://hotel.qunar.com/cn/fuzhou_fujian?fromDate=2023-04-15&toDate=2023-04-16&cityName=%E7%A6%8F%E5%B7%9E')
      # 30-second pause before the cookies are read
      # (presumably the window for logging in by hand - confirm).
      time.sleep(30)
      dictCookies = browser.get_cookies()  # list of cookie dicts
      jsonCookies = json.dumps(dictCookies)  # serialise to a JSON string
      # Explicit encoding so the file matches the utf8 reader in crack_permissions.
      with open('cookie2.txt', 'w', encoding='utf8') as f:
          f.write(jsonCookies)
      print('cookies保存成功!')
  finally:
      # Always shut the browser and its driver process down, even on failure.
      browser.quit()


  1. from selenium import webdriver
  2. from lxml import html
  3. import time
  4. import re
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver import ChromeOptions
  7. import json
  8. from save_data import save_data_sql
  # Build the browser driver.
  def get_driver():
      """Create and return a Chrome WebDriver configured for the crawl.

      The "enable-automation" switch is excluded (intended to make the
      automated browser harder to detect) and the language argument is set to
      ``zh_CN.UTF-8``.
      """
      chrome_options = ChromeOptions()
      chrome_options.add_argument('lang=zh_CN.UTF-8')
      chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
      return webdriver.Chrome(options=chrome_options)
  # Reuse saved login cookies so the crawl runs as a logged-in user.
  def crack_permissions(browser, cookie_path='cookie2.txt', load_wait=5, refresh_wait=2):
      """Replay previously saved cookies into *browser* and reload the page.

      Args:
          browser: WebDriver-like object providing ``add_cookie`` and ``refresh``.
              The caller must already have opened a page before calling this.
          cookie_path: JSON file holding a list of cookie dicts.
          load_wait: Seconds to sleep before touching the browser.
          refresh_wait: Seconds to sleep after the refresh.

      Raises:
          OSError: If *cookie_path* cannot be opened.
          json.JSONDecodeError: If the file does not contain valid JSON.
      """
      # Give a slow page time to finish loading.
      time.sleep(load_wait)
      with open(cookie_path, 'r', encoding='utf8') as f:
          saved_cookies = json.load(f)
      for cookie in saved_cookies:
          # Only name and value are taken from the file; the other fields are
          # fixed, using the WebDriver spellings ('secure', 'httpOnly').
          browser.add_cookie({
              'domain': '.qunar.com',
              'name': cookie.get('name'),
              'value': cookie.get('value'),
              'path': '/',
              'httpOnly': False,
              'secure': False,
          })
      # Reload so the page is requested again with the injected cookies.
      browser.refresh()
      time.sleep(refresh_wait)
  # Matches the "共N条评论" review-count label and captures N.
  _COMMENT_COUNT_RE = re.compile(r'共(.*?)条评论')


  def _first_text(tree, path, default=''):
      """Return the first result of XPath *path* on *tree*, or *default* if none."""
      found = tree.xpath(path)
      return found[0] if found else default


  def _to_number(text, cast, default):
      """Convert *text* with *cast*, returning *default* when it is missing or not numeric."""
      try:
          return cast(text)
      except (TypeError, ValueError):
          return default


  def _parse_hotel(xml, k):
      """Extract the fields of the k-th (1-based) hotel card from the parsed page."""
      item = f'//*[@id="hotel_lst_body"]/li[{k}]/div'
      info = f'{item}/div[3]'

      # span[2] normally holds the overall review text, but on some cards it
      # holds the review-count label instead, so it has to be checked.
      second_span = _first_text(xml, f'{info}/p[2]/span[2]/text()', None)
      third_span = _first_text(xml, f'{info}/p[2]/span[3]/text()', None)
      if second_span is None or _COMMENT_COUNT_RE.search(second_span):
          comment_desc = ''
      else:
          comment_desc = second_span

      # Review count: read span[3], falling back to span[2] when span[3] is missing.
      count_text = third_span if third_span is not None else second_span
      count_match = _COMMENT_COUNT_RE.search(count_text) if count_text is not None else None
      comment_count = _to_number(count_match.group(1), int, 0) if count_match else 0

      return {
          # Hotel name.
          'name': _first_text(xml, f'{info}/p[1]/a/text()'),
          # Price; 0 when absent or not an integer.
          'price': _to_number(_first_text(xml, f'{item}/div[2]/p[1]/a/text()', None), int, 0),
          # Category label, e.g. comfort / upscale.
          'dangciText': _first_text(xml, f'{info}/p[1]/span[2]/text()'),
          # Rating; 0.0 when absent or not a number.
          'score': _to_number(_first_text(xml, f'{info}/p[2]/span[1]/text()', None), float, 0.0),
          'commentDesc': comment_desc,
          'commentCount': comment_count,
          # Rough location of the hotel.
          'locationInfo': _first_text(xml, f'{info}/p[3]/text()'),
      }


  # Run the crawl.
  def start_task(browser, pages=120, date_time='2023-05-20'):
      """Walk the hotel result pages, storing every listed hotel in the database.

      Args:
          browser: WebDriver already showing the first result page.
          pages: Number of result pages to process.
          date_time: Date string stored with every row.
      """
      for _ in range(pages):
          # Jump to the bottom three times, one second apart, before reading the page.
          for _ in range(3):
              browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
              time.sleep(1)
          # Parse the rendered page so it can be queried with XPath.
          xml = html.etree.HTML(browser.page_source)
          # Up to 20 hotel cards are read from each page.
          for k in range(1, 21):
              mess_dict = _parse_hotel(xml, k)
              # Write the record to the database.
              save_data_sql((mess_dict['name'], mess_dict['price'], mess_dict['dangciText'], mess_dict['score'],
                             mess_dict['commentDesc'], mess_dict['commentCount'], mess_dict['locationInfo'], date_time))
              print(mess_dict)
          time.sleep(1)
          # Click the pager element (presumably the "next page" control - confirm).
          browser.find_element(By.XPATH, '//*[@id="root"]/div/section/section[1]/aside[1]/div[7]/p[1]').click()
          time.sleep(1)
  # Start the browser driver.
  browser = get_driver()
  try:
      # Open the hotel listing page.
      browser.get(
          'https://hotel.qunar.com/cn/fuzhou_fujian?fromDate=2023-05-20&toDate=2023-05-21&cityName=%E7%A6%8F%E5%B7%9E')
      # Inject the saved login cookies.
      crack_permissions(browser=browser)
      # Run the crawl.
      start_task(browser=browser)
  finally:
      # Shut the browser and its driver process down even if the crawl raises.
      browser.quit()



