Scraping Baidu Related-Search Results

Background:

We have a large batch of queries and need to collect each one's Baidu search results (specifically, the "related searches" suggestions Baidu shows at the bottom of the result page), then organize them into Excel. Each input workbook yields one output workbook, and each output row holds the query followed by the nine cells of the related-search box.

Code:

The project contains just two script files: paqushuju.py and threadingdata.py.

paqushuju.py is the single-threaded script; threadingdata.py is the multi-threaded one.

paqushuju.py is as follows:

# -*- coding:utf-8 -*-
# paqushuju.py
import os
import time

import xlrd
import xlsxwriter
from selenium import webdriver

# List every file under the query2 folder
query_path = r'D:\babytree\codetest\zuoye\pachong\query2'
wenjianlist = os.listdir(query_path)

# Name the output file for each input file, and record each input's full path
namelist = []
PathList = []
for single in wenjianlist:
    # Output name: "<input stem>的搜索结果.xlsx" ("search results of <stem>")
    newsingle = single.split('.')[0] + '的搜索结果.xlsx'
    namelist.append(newsingle)
    PathList.append(query_path + "\\" + single)

for namei in range(len(wenjianlist)):
    time.sleep(5)
    ###############################################
    # Initialise the browser (one fresh Chrome per input file)
    browser = webdriver.Chrome(r'D:\python3\Lib\chromedriver.exe')
    browser.get('https://www.baidu.com/')
    browser.maximize_window()
    # Throwaway warm-up search so the result-page layout is loaded
    browser.find_element_by_xpath('//*[@id="kw"]').send_keys(u'怎么样看早孕试纸11')
    browser.find_element_by_xpath('//*[@id="su"]').click()
    time.sleep(1)
    browser.find_element_by_id('kw').clear()
    ##################################################
    # Create the output workbook
    workbook = xlsxwriter.Workbook(namelist[namei])
    #####################################################
    # Open the input file and list its sheets
    data = xlrd.open_workbook(PathList[namei])
    sheetlist = data.sheet_names()
    for sheetNamei in range(len(sheetlist)):
        # Mirror each input sheet with a worksheet of the same name
        worksheet = workbook.add_worksheet(sheetlist[sheetNamei])
        table = data.sheet_by_name(sheetlist[sheetNamei])
        # The queries sit in the first column; drop the header row if present
        firstCol = table.col_values(0)
        if firstCol[0] == 'QUERY' or firstCol[0] == 'yunyu':
            firstCol.pop(0)
        for searchKyi in range(len(firstCol)):
            browser.implicitly_wait(15)
            try:
                browser.find_element_by_id('kw').send_keys(firstCol[searchKyi])
            except Exception:
                time.sleep(2)
                browser.find_element_by_id('kw').send_keys(firstCol[searchKyi])
            browser.implicitly_wait(15)
            try:
                browser.find_element_by_id('su').click()
            except Exception:
                time.sleep(2)
                browser.find_element_by_id('su').click()
            browser.implicitly_wait(15)
            # Scroll to the bottom so the related-search block (#rs) renders
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(6)
            # The related-search block is a 3x3 table; read each cell,
            # falling back to '无相关数据' ("no related data") when a cell is missing
            related = []
            for row in range(1, 4):
                for col in range(1, 4):
                    try:
                        cell = browser.find_element_by_xpath(
                            '//*[@id="rs"]/div/table/tbody/tr[%d]/th[%d]/a' % (row, col)).text
                    except Exception:
                        cell = '无相关数据'
                    related.append(cell)
            # One output row: the query followed by its nine related searches
            searchData = [firstCol[searchKyi]] + related
            for m in range(len(searchData)):
                worksheet.write(searchKyi, m, searchData[m])
                print(searchData[m])
            browser.implicitly_wait(15)
            browser.find_element_by_id('kw').clear()
    # Save the workbook and close this browser before the next file
    workbook.close()
    browser.quit()
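
The loop over tr/th indices assumes Baidu always renders the suggestions as a 3x3 table. As a looser sketch, assuming only that the suggestions are links inside the #rs container (the same id the XPaths above rely on), all anchors can be collected in one call; related_searches is a hypothetical helper written against the same Selenium 3-style API as the scripts:

# Sketch: grab every related-search link under #rs in one call
def related_searches(browser, expected=9, placeholder='无相关数据'):
    links = browser.find_elements_by_css_selector('#rs a')
    texts = [link.text for link in links][:expected]
    # Pad so every output row keeps the same width
    texts += [placeholder] * (expected - len(texts))
    return texts

With this helper, searchData becomes [firstCol[searchKyi]] + related_searches(browser), and a page with fewer than nine suggestions no longer needs a separate exception handler per cell.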

threadingdata.py is as follows:

# -*- coding:utf-8 -*-
# threadingdata.py
import os
import threading
import time

import xlrd
import xlsxwriter
from selenium import webdriver


def crawler(query_path):
    # List every file under the given query folder
    wenjianlist = os.listdir(query_path)
    # Name the output file for each input file, and record each input's full path
    namelist = []
    PathList = []
    for single in wenjianlist:
        # Output name: "<input stem>的搜索结果.xlsx" ("search results of <stem>")
        newsingle = single.split('.')[0] + '的搜索结果.xlsx'
        namelist.append(newsingle)
        PathList.append(query_path + "\\" + single)
    for namei in range(len(wenjianlist)):
        time.sleep(5)
        ###############################################
        # Initialise the browser (one fresh Chrome per input file)
        browser = webdriver.Chrome(r'D:\python3\Lib\chromedriver.exe')
        browser.get('https://www.baidu.com/')
        browser.maximize_window()
        # Throwaway warm-up search so the result-page layout is loaded
        browser.find_element_by_xpath('//*[@id="kw"]').send_keys(u'怎么样看早孕试纸11')
        browser.find_element_by_xpath('//*[@id="su"]').click()
        time.sleep(1)
        browser.find_element_by_id('kw').clear()
        ##################################################
        # Create the output workbook
        workbook = xlsxwriter.Workbook(namelist[namei])
        #####################################################
        # Open the input file and list its sheets
        data = xlrd.open_workbook(PathList[namei])
        sheetlist = data.sheet_names()
        for sheetNamei in range(len(sheetlist)):
            # Mirror each input sheet with a worksheet of the same name
            worksheet = workbook.add_worksheet(sheetlist[sheetNamei])
            table = data.sheet_by_name(sheetlist[sheetNamei])
            # The queries sit in the first column; drop the header row if present
            firstCol = table.col_values(0)
            if firstCol[0] == 'QUERY' or firstCol[0] == 'yunyu':
                firstCol.pop(0)
            for searchKyi in range(len(firstCol)):
                browser.implicitly_wait(15)
                try:
                    browser.find_element_by_id('kw').send_keys(firstCol[searchKyi])
                except Exception:
                    time.sleep(2)
                    browser.find_element_by_id('kw').send_keys(firstCol[searchKyi])
                browser.implicitly_wait(15)
                try:
                    browser.find_element_by_id('su').click()
                except Exception:
                    time.sleep(2)
                    browser.find_element_by_id('su').click()
                browser.implicitly_wait(15)
                # Scroll to the bottom so the related-search block (#rs) renders
                browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(6)
                # The related-search block is a 3x3 table; read each cell,
                # falling back to '无相关数据' ("no related data") when a cell is missing
                related = []
                for row in range(1, 4):
                    for col in range(1, 4):
                        try:
                            cell = browser.find_element_by_xpath(
                                '//*[@id="rs"]/div/table/tbody/tr[%d]/th[%d]/a' % (row, col)).text
                        except Exception:
                            cell = '无相关数据'
                        related.append(cell)
                # One output row: the query followed by its nine related searches
                searchData = [firstCol[searchKyi]] + related
                for m in range(len(searchData)):
                    worksheet.write(searchKyi, m, searchData[m])
                    print(searchData[m])
                browser.implicitly_wait(15)
                browser.find_element_by_id('kw').clear()
        # Save the workbook and close this browser before the next file
        workbook.close()
        browser.quit()


# One query folder per thread
query_path1 = r'D:\babytree\codetest\zuoye\pachong\query2'
query_path2 = r'D:\babytree\codetest\zuoye\pachong\query3'
query_path3 = r'D:\babytree\codetest\zuoye\pachong\query4'
query_path4 = r'D:\babytree\codetest\zuoye\pachong\query5'
# query_path5 = r'D:\babytree\codetest\zuoye\pachong\query6'
# query_path6 = r'D:\babytree\codetest\zuoye\pachong\query7'

threads = []
t1 = threading.Thread(target=crawler, args=(query_path1,))
threads.append(t1)
t2 = threading.Thread(target=crawler, args=(query_path2,))
threads.append(t2)
t3 = threading.Thread(target=crawler, args=(query_path3,))
threads.append(t3)
t4 = threading.Thread(target=crawler, args=(query_path4,))
threads.append(t4)
# t5 = threading.Thread(target=crawler, args=(query_path5,))
# threads.append(t5)
# t6 = threading.Thread(target=crawler, args=(query_path6,))
# threads.append(t6)

if __name__ == '__main__':
    for t in threads:
        t.setDaemon(True)
        t.start()
    for t in threads:
        t.join()
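
The hand-built thread list works, but since every thread just runs crawler on one folder, the same fan-out can be expressed more compactly with a pool. A minimal sketch using the standard-library concurrent.futures (crawler and the folder variables are the ones defined above):

# Sketch: one pool worker per query folder
from concurrent.futures import ThreadPoolExecutor

folders = [query_path1, query_path2, query_path3, query_path4]

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=len(folders)) as pool:
        # list() drains the iterator so worker exceptions are re-raised here
        list(pool.map(crawler, folders))

Leaving the with block only after every worker finishes plays the same role as the join() calls above; it is also why the setDaemon(True) flag only matters if the main thread is interrupted before those joins complete.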
