
2023 Dangdang book-info scraper: python source code, dynamic scraping with the selenium library! Everyone is welcome to learn and discuss!

Scraping Python books from Dangdang

This post is a small upgrade over my earlier requests-based scraper. The code was actually written back in July; a reader recently asked whether I had the source, and since I still did, I'm posting it here. It's nothing fancy, just basic working-scraper level (please be gentle). It captures more fields than before (e.g. "评论数量" total comments, "好评数" positive reviews, "中评数" neutral reviews, "差评数" negative reviews), but it is slower and easily gets your IP banned or triggers a CAPTCHA, so I recommend scraping at most 5 pages per run, or adding more anti-scraping countermeasures!

Note!!! Before running this, make sure you are using selenium version 3.141.0!!!

Install it with: pip install selenium==3.141.0 . Other 3.x versions may work, but 4.0 and above are not supported, because the script uses the old find_element_by_* helpers that Selenium 4 removed.
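For context, here is the locator call as it appears in this script (Selenium 3 style) next to what the Selenium 4 equivalent would look like if you ever migrate; the selector is the one used later in the source:

# Selenium 3 style, as used in this script (works on 3.141.0):
remark_link = driver.find_element_by_css_selector("li[id='comment_tab']")

# Selenium 4 equivalent (requires the By import):
from selenium.webdriver.common.by import By
remark_link = driver.find_element(By.CSS_SELECTOR, "li[id='comment_tab']")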

Error you may hit: ValueError: Timeout value connect was <object object at 0x0000019A00694540>, but it must be an int, float or None.

Fix: this error comes from a urllib3 release that is too new for selenium 3; downgrade with pip install urllib3==1.26.2
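To handle both version pins in one step, you could keep them in a requirements file; a minimal sketch (the filename requirements.txt is just the pip convention):

selenium==3.141.0
urllib3==1.26.2

Then install everything with pip install -r requirements.txt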

If you hit other errors, it may just be a network issue; adding a time.sleep(3) in the right places can help (see the sketch below).
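A slightly gentler variant of that fixed sleep, and a cheap anti-ban measure, is a random delay between page loads; a minimal sketch you could drop in right after each driver.get(url) call in the main loop below:

import random
import time

# Wait a random 2-5 seconds between page loads to reduce the chance
# of an IP ban or a CAPTCHA; adjust the range to taste.
time.sleep(random.uniform(2, 5))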

Without further ado, here is the full source:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import re


# Parse one search-result item, then open its detail page to scrape the review counts
def get_product_info(driver, div, csv_writer):
    # Basic fields from the search-result snippet
    name = div.find("p", class_="name").get_text().strip()
    price = div.find("span", class_="search_now_price").get_text().strip()
    print("书名:" + name)
    print("价格: " + price.encode('gbk', 'ignore').decode('gbk'))

    isbn_info = div.find("p", class_="search_book_author")
    if isbn_info is not None:
        spans = isbn_info.find_all("span")
        author = spans[0].find("a").get_text().strip().replace("/", "")
        publisher = spans[2].find("a").get_text().strip().replace("/", "")
        publish_date = spans[1].get_text().strip().replace("/", "")
    else:
        author = div.find("a", class_="search_book_author").get_text().strip()
        publisher = div.find_all("a", class_="search_book_author")[1].get_text().strip()
        publish_date = div.find_all("span", class_="search_book_author")[1].get_text().strip()
    print("作者:" + author.encode('gbk', 'ignore').decode('gbk'))
    print("出版社:" + publisher.encode('gbk', 'ignore').decode('gbk'))
    print("出版年份:" + publish_date)

    link = div.find("p", class_="name").find("a").get("href")
    if "http" not in link:
        link = "https:" + link

    # Open the detail page in a new tab and check it actually opened
    old_windows = driver.window_handles
    driver.execute_script(f'window.open("{link}","_blank");')
    windows = driver.window_handles
    if len(windows) == len(old_windows):
        print("无法打开新标签页")
        driver.switch_to.window(windows[0])
        return None
    driver.switch_to.window(windows[-1])

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "li[id='comment_tab']")))
    # Click the comments tab (Selenium 3 API; removed in Selenium 4)
    remark_link = driver.find_element_by_css_selector("li[id='comment_tab']")
    driver.execute_script("arguments[0].click();", remark_link)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "span.on")))
    remark_page_soup = BeautifulSoup(driver.page_source, "html.parser")

    # The total comment count lives in the highlighted tab, e.g. "全部(1234)"
    comment_num = remark_page_soup.find("span", class_="on").get_text().split("(")[1].split(")")[0] \
        if remark_page_soup.find("span", class_="on") is not None else '未找到'
    print("评论数量:" + comment_num)

    good_comment = remark_page_soup.find("span", {"data-type": "2"})
    if good_comment is not None:
        good_count = good_comment.get_text().split("(")[1].split(")")[0]
        print("好评数:" + good_count)
    else:
        good_count = '未找到'  # keep good_count defined so the dict below never raises NameError
        print("未找到好评")
    common_comment = remark_page_soup.find("span", {"data-type": "3"}).get_text().split("(")[1].split(")")[0] \
        if remark_page_soup.find("span", {"data-type": "3"}) is not None else '未找到'
    print("中评数:" + common_comment)
    bad_comment = remark_page_soup.find("span", {"data-type": "4"}).get_text().split("(")[1].split(")")[0] \
        if remark_page_soup.find("span", {"data-type": "4"}) is not None else '未找到'
    print("差评数:" + bad_comment)

    # Close the detail tab and return to the search-results page
    driver.close()
    driver.switch_to.window(windows[0])

    info = {
        "书名": name,
        "价格": price,
        "作者": author,
        "出版社": publisher,
        "出版年份": publish_date,
        "评论数量": comment_num,
        "好评数": good_count,
        "中评数": common_comment,
        "差评数": bad_comment,
    }
    # Write each row to the CSV file as we go
    csv_writer.writerow(info)
    return info


# Headless Chrome setup
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument(
    "user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'")
driver = webdriver.Chrome(options=chrome_options)

page_num = 1          # starting page
keyword = "人工智能"   # search keyword
total_pages = 5       # total number of pages to scrape
filename = f"{keyword}.csv"
fields = ["书名", "价格", "作者", "出版社", "出版年份", "评论数量", "好评数", "中评数", "差评数"]

# Open the CSV file and write the header row
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=fields)
    csv_writer.writeheader()
    while page_num <= total_pages:
        url = f"http://search.dangdang.com/?key={keyword}&act=input&page_index={page_num}"
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Each result is an <li> whose class looks like "line1", "line2", ...
        div_list = soup.find_all("li", class_=re.compile(r"line\d+"))
        for div in div_list:
            get_product_info(driver, div, csv_writer)
        page_num += 1

# Shut down the browser
driver.quit()
print("数据已保存到", filename)

The results look like this: [screenshot omitted]
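Once a run finishes, a quick way to sanity-check the output is to read the CSV back with the standard library; a minimal sketch, assuming the default 人工智能 keyword:

import csv

# Read the scraped file back and print a few fields from the first rows
with open("人工智能.csv", newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row["书名"], row["价格"], row["评论数量"])
        if i >= 4:
            break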

Thanks for reading, everyone. See you next time!!!
