
Graduation Project: A Python-Based Boss直聘 (BOSS Zhipin) Data Visualization System


This is a complete, runnable project; if you need it, contact me by private message.

The spider

import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import pandas as pd
import os
import django
from selenium.webdriver.chrome.service import Service

# Bootstrap Django so the spider can write scraped rows straight into the project's models.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'boss直聘数据可视化分析.settings')
django.setup()
from myApp.models import *


class spider(object):
    def __init__(self, type, page):
        self.type = type  # search keyword (job category)
        self.page = page  # first page to crawl
        self.spiderUrl = "https://www.zhipin.com/web/geek/job?query=%s&city=100010000&page=%s"

    def startBrower(self):
        s = Service("chromedriver.exe")
        browser = webdriver.Chrome(service=s)
        return browser

    def main(self, **info):
        # info['page'] is the last page to crawl; stop once self.page passes it.
        if info['page'] < self.page:
            return
        brower = self.startBrower()
        print('List page URL: ' + self.spiderUrl % (self.type, self.page))
        brower.get(self.spiderUrl % (self.type, self.page))
        time.sleep(15)  # give the page (and any anti-bot check) time to load
        job_list = brower.find_elements(by=By.XPATH, value="//ul[@class='job-list-box']/li")
        for index, job in enumerate(job_list):
            try:
                print("Scraping job card %d" % (index + 1))
                jobData = []
                # title: job name
                title = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-title')]/span[@class='job-name']").text
                # address: city, with the district after the "·" separator when present
                addresses = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-title')]//span[@class='job-area']").text.split('·')
                address = addresses[0]
                # dist: administrative district
                if len(addresses) != 1:
                    dist = addresses[1]
                else:
                    dist = ''
                # type: the search keyword this job was found under
                type = self.type
                # the tag list holds experience/education, sometimes with an extra leading item
                tag_list = job.find_elements(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li")
                if len(tag_list) == 2:
                    workExperience = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li[1]").text
                    educational = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li[2]").text
                else:
                    workExperience = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li[2]").text
                    educational = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li[3]").text
                # hr: the recruiter's role and name
                hrWork = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/div[@class='info-public']/em").text
                hrName = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/div[@class='info-public']").text
                # workTag: skill tags, serialized as a JSON list
                workTag = job.find_elements(by=By.XPATH,
                    value="./div[contains(@class,'job-card-footer')]/ul[@class='tag-list']/li")
                workTag = json.dumps(list(map(lambda x: x.text, workTag)))
                # salary: "20-40K", "15-25K·13薪", or "150-200元/天" for internships
                salaries = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/span[@class='salary']").text
                pratice = 0  # 1 marks an internship (paid per day)
                if salaries.find('K') != -1:
                    salaries = salaries.split('·')
                    salary = list(map(lambda x: int(x) * 1000, salaries[0].replace('K', '').split('-')))
                    if len(salaries) == 1:
                        salaryMonth = '0薪'
                    else:
                        # salaryMonth: extra year-end months, e.g. "13薪"
                        salaryMonth = salaries[1]
                else:
                    salary = list(map(lambda x: int(x), salaries.replace('元/天', '').split('-')))
                    salaryMonth = '0薪'
                    pratice = 1
                # companyTitle: company name
                companyTitle = job.find_element(by=By.XPATH, value=".//h3[@class='company-name']/a").text
                # companyAvatar: company logo URL
                companyAvatar = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-card-right')]//img").get_attribute("src")
                # company tag list: nature / funding status / headcount, funding sometimes missing
                companyInfoList = job.find_elements(by=By.XPATH,
                    value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li")
                if len(companyInfoList) == 3:
                    companyNature = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li[1]").text
                    companyStatus = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li[2]").text
                    try:
                        companyPeople = list(map(lambda x: int(x), job.find_element(by=By.XPATH,
                            value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li[3]")
                            .text.replace('人', '').split('-')))
                    except:
                        companyPeople = [0, 10000]  # headcount text without a range, e.g. "10000人以上"
                else:
                    companyNature = job.find_element(by=By.XPATH,
                        value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li[1]").text
                    companyStatus = "未融资"
                    try:
                        companyPeople = list(map(lambda x: int(x), job.find_element(by=By.XPATH,
                            value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li[2]")
                            .text.replace('人', '').split('-')))
                    except:
                        companyPeople = [0, 10000]
                # companyTag: the benefits line, serialized as a JSON list
                companyTag = job.find_element(by=By.XPATH,
                    value="./div[contains(@class,'job-card-footer')]/div[@class='info-desc']").text
                if companyTag:
                    companyTag = json.dumps(companyTag.split(','))
                else:
                    companyTag = '无'
                # detailUrl: job detail page
                detailUrl = job.find_element(by=By.XPATH,
                    value="./div[@class='job-card-body clearfix']/a").get_attribute('href')
                # companyUrl: company detail page (relative XPath; an absolute "//h3..."
                # here would always match the first card on the page)
                companyUrl = job.find_element(by=By.XPATH,
                    value=".//h3[@class='company-name']/a").get_attribute('href')
                jobData.append(title)
                jobData.append(address)
                jobData.append(type)
                jobData.append(educational)
                jobData.append(workExperience)
                jobData.append(workTag)
                jobData.append(salary)
                jobData.append(salaryMonth)
                jobData.append(companyTag)
                jobData.append(hrWork)
                jobData.append(hrName)
                jobData.append(pratice)
                jobData.append(companyTitle)
                jobData.append(companyAvatar)
                jobData.append(companyNature)
                jobData.append(companyStatus)
                jobData.append(companyPeople)
                jobData.append(detailUrl)
                jobData.append(companyUrl)
                jobData.append(dist)
                self.save_to_csv(jobData)
            except:
                # skip cards whose layout doesn't match the XPaths above
                pass
        brower.quit()  # close this page's browser before fetching the next one
        self.page += 1
        self.main(page=info['page'])

    def save_to_csv(self, rowData):
        with open('./temp.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(rowData)

    def clear_numTemp(self):
        with open('./numTemp.txt', 'w', encoding='utf-8') as f:
            f.write('')

    def init(self):
        # Write the CSV header once, on first run.
        if not os.path.exists('./temp.csv'):
            with open('./temp.csv', 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(["title", "address", "type", "educational", "workExperience", "workTag",
                                 "salary", "salaryMonth", "companyTags", "hrWork", "hrName", "pratice",
                                 "companyTitle", "companyAvatar", "companyNature", "companyStatus",
                                 "companyPeople", "detailUrl", "companyUrl", "dist"])

    def save_to_sql(self):
        data = self.clearData()
        for job in data:
            JobInfo.objects.create(
                title=job[0],
                address=job[1],
                type=job[2],
                educational=job[3],
                workExperience=job[4],
                workTag=job[5],
                salary=job[6],
                salaryMonth=job[7],
                companyTags=job[8],
                hrWork=job[9],
                hrName=job[10],
                pratice=job[11],
                companyTitle=job[12],
                companyAvatar=job[13],
                companyNature=job[14],
                companyStatus=job[15],
                companyPeople=job[16],
                detailUrl=job[17],
                companyUrl=job[18],
                dist=job[19]
            )
        print("Imported into the database successfully")
        os.remove("./temp.csv")

    def clearData(self):
        # Clean the buffered CSV before importing: drop empty and duplicate rows,
        # and strip the "薪" suffix so salaryMonth becomes a plain number.
        df = pd.read_csv('./temp.csv')
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
        df['salaryMonth'] = df['salaryMonth'].map(lambda x: x.replace('薪', ''))
        print("Total rows: %d" % df.shape[0])
        return df.values


if __name__ == '__main__':
    spiderObj = spider("微信小程序", 1)
    spiderObj.init()
    spiderObj.main(page=3)
    spiderObj.save_to_sql()
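
The salary branch in main() handles three listing formats: a plain monthly range like "20-40K", a range with extra year-end months like "15-25K·13薪", and a per-day internship rate like "150-200元/天". As a quick illustration of those rules, here is a minimal standalone sketch; the parse_salary helper and the sample strings are mine, not part of the project:

# A minimal, standalone sketch of the salary rules in main(); parse_salary and
# the sample strings below are illustrative, not part of the original project.
def parse_salary(text):
    if 'K' in text:
        parts = text.split('·')          # "15-25K·13薪" -> ["15-25K", "13薪"]
        salary = [int(x) * 1000 for x in parts[0].replace('K', '').split('-')]
        salary_month = parts[1] if len(parts) > 1 else '0薪'
        practice = 0                     # regular position
    else:
        # internships are quoted per day, e.g. "150-200元/天"
        salary = [int(x) for x in text.replace('元/天', '').split('-')]
        salary_month = '0薪'
        practice = 1
    return salary, salary_month, practice

print(parse_salary('15-25K·13薪'))   # ([15000, 25000], '13薪', 0)
print(parse_salary('20-40K'))        # ([20000, 40000], '0薪', 0)
print(parse_salary('150-200元/天'))  # ([150, 200], '0薪', 1)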
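
The spider imports JobInfo from myApp.models, which the article does not show. For orientation, here is a minimal sketch of what that model might look like, inferred from the fields passed to JobInfo.objects.create(); the field names follow the spider, but every field type here is an assumption:

# Hypothetical myApp/models.py, inferred from save_to_sql(); all types are guesses.
from django.db import models

class JobInfo(models.Model):
    title = models.CharField(max_length=255)
    address = models.CharField(max_length=255)
    type = models.CharField(max_length=255)
    educational = models.CharField(max_length=255)
    workExperience = models.CharField(max_length=255)
    workTag = models.TextField()                 # JSON-encoded list of tags
    salary = models.CharField(max_length=255)    # stringified [low, high] range
    salaryMonth = models.CharField(max_length=255)
    companyTags = models.TextField()             # JSON-encoded list, or '无'
    hrWork = models.CharField(max_length=255)
    hrName = models.CharField(max_length=255)
    pratice = models.IntegerField()              # 0/1; spelling kept to match the spider
    companyTitle = models.CharField(max_length=255)
    companyAvatar = models.TextField()
    companyNature = models.CharField(max_length=255)
    companyStatus = models.CharField(max_length=255)
    companyPeople = models.CharField(max_length=255)  # stringified [min, max] headcount
    detailUrl = models.TextField()
    companyUrl = models.TextField()
    dist = models.CharField(max_length=255)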

[Screenshot: the project's full contents opened in PyCharm]

Running the project
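
Roughly, running it comes down to two steps, assuming a standard Django layout with chromedriver.exe next to the spider script (the script name spiderMain.py matches the path shown in PyCharm, but adjust it to your setup): first run the spider with `python spiderMain.py` to crawl the listings and import them into the database, then start the site with `python manage.py runserver` and open it in a browser.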

