A complete, runnable project is available; if you need it, feel free to contact me by private message.

Crawler section
import csv
import json
import os
import time

import django
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Point Django at the project settings so the crawler can write through the ORM.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'boss直聘数据可视化分析.settings')
django.setup()
from myApp.models import JobInfo

class spider(object):
    def __init__(self, type, page):
        self.type = type  # search keyword for the job query
        self.page = page  # page number to start crawling from
        self.spiderUrl = 'https://www.zhipin.com/web/geek/job?query=%s&city=100010000&page=%s'

    def startBrowser(self):
        # Selenium 4 Service API; chromedriver.exe is expected next to this script.
        s = Service('chromedriver.exe')
        browser = webdriver.Chrome(service=s)
        return browser

    def main(self, page):
        # Recurse over list pages until the requested last page has been crawled.
        if page < self.page:
            return
        browser = self.startBrowser()
        print('List page URL: ' + self.spiderUrl % (self.type, self.page))
        browser.get(self.spiderUrl % (self.type, self.page))
        time.sleep(15)  # leave time for the page to render (and for any verification step)
        job_list = browser.find_elements(by=By.XPATH, value="//ul[@class='job-list-box']/li")
        for index, job in enumerate(job_list):
            try:
                print('Scraping item %d' % (index + 1))
                # title: job name
                title = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-title')]/span[@class='job-name']").text
                # address: city, with an optional district after the '·' separator
                addresses = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-title')]//span[@class='job-area']").text.split('·')
                address = addresses[0]
                # dist: administrative district (empty when the card only lists a city)
                if len(addresses) != 1:
                    dist = addresses[1]
                else:
                    dist = ''
                # type: job category (the search keyword)
                type = self.type
                # Education and work-experience tags; the tag list layout varies per card.
                tag_list = job.find_elements(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li")
                if len(tag_list) == 2:
                    workExperience = tag_list[0].text
                    educational = tag_list[1].text
                else:
                    workExperience = tag_list[1].text
                    educational = tag_list[2].text
                # hr: recruiter's role and name
                hrWork = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/div[@class='info-public']/em").text
                hrName = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/div[@class='info-public']").text

                # workTag: job skill tags, stored as a JSON list
                workTag = job.find_elements(by=By.XPATH,
                    value="./div[contains(@class,'job-card-footer')]/ul[@class='tag-list']/li")
                workTag = json.dumps([tag.text for tag in workTag])

                # salary: e.g. "15-30K·13薪", or "200-300元/天" for internships
                salaries = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-info')]/span[@class='salary']").text
                # pratice: 1 marks an internship (salary quoted per day);
                # the spelling matches the JobInfo model field.
                pratice = 0
                if salaries.find('K') != -1:
                    salaries = salaries.split('·')
                    # "15-30K" -> [15000, 30000] (yuan per month)
                    salary = [int(x) * 1000 for x in salaries[0].replace('K', '').split('-')]
                    if len(salaries) == 1:
                        salaryMonth = '0薪'
                    else:
                        # salaryMonth: year-end extra months of pay, e.g. "13薪"
                        salaryMonth = salaries[1]
                else:
                    # Internship salary, quoted in yuan per day.
                    salary = [int(x) for x in salaries.replace('元/天', '').split('-')]
                    salaryMonth = '0薪'
                    pratice = 1

                # companyTitle: company name
                companyTitle = job.find_element(by=By.XPATH,
                    value=".//h3[@class='company-name']/a").text
                # companyAvatar: company logo URL
                companyAvatar = job.find_element(by=By.XPATH,
                    value=".//div[contains(@class,'job-card-right')]//img").get_attribute('src')
                # Company nature / financing status / headcount; the financing
                # entry is missing from some cards.
                companyInfoList = job.find_elements(by=By.XPATH,
                    value=".//div[contains(@class,'job-card-right')]//ul[@class='company-tag-list']/li")
                if len(companyInfoList) == 3:
                    companyNature = companyInfoList[0].text
                    companyStatus = companyInfoList[1].text
                    try:
                        # "100-499人" -> [100, 499]
                        companyPeople = [int(x) for x in
                                         companyInfoList[2].text.replace('人', '').split('-')]
                    except Exception:
                        # e.g. "10000人以上" has no range to parse
                        companyPeople = [0, 10000]
                else:
                    companyNature = companyInfoList[0].text
                    companyStatus = '未融资'
                    try:
                        companyPeople = [int(x) for x in
                                         companyInfoList[1].text.replace('人', '').split('-')]
                    except Exception:
                        companyPeople = [0, 10000]
                # companyTag: company description tags, stored as a JSON list
                companyTag = job.find_element(by=By.XPATH,
                    value="./div[contains(@class,'job-card-footer')]/div[@class='info-desc']").text
                if companyTag:
                    companyTag = json.dumps(companyTag.split(','))
                else:
                    companyTag = '无'

                # detailUrl: job detail page
                detailUrl = job.find_element(by=By.XPATH,
                    value="./div[@class='job-card-body clearfix']/a").get_attribute('href')
                # companyUrl: company detail page (relative XPath, so each card
                # yields its own company link)
                companyUrl = job.find_element(by=By.XPATH,
                    value=".//h3[@class='company-name']/a").get_attribute('href')

                jobData = [title, address, type, educational, workExperience,
                           workTag, salary, salaryMonth, companyTag, hrWork,
                           hrName, pratice, companyTitle, companyAvatar,
                           companyNature, companyStatus, companyPeople,
                           detailUrl, companyUrl, dist]
                self.save_to_csv(jobData)
            except Exception:
                # Skip cards whose layout does not match the selectors above.
                continue

        browser.quit()
        # Advance to the next page and recurse.
        self.page += 1
        self.main(page=page)

    def save_to_csv(self, rowData):
        with open('./temp.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(rowData)

    def clear_numTemp(self):
        # Reset the temporary counter file.
        with open('./numTemp.txt', 'w', encoding='utf-8') as f:
            f.write('')

    def init(self):
        # Create temp.csv with a header row on the first run.
        if not os.path.exists('./temp.csv'):
            with open('./temp.csv', 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(["title", "address", "type", "educational", "workExperience",
                                 "workTag", "salary", "salaryMonth", "companyTags", "hrWork",
                                 "hrName", "pratice", "companyTitle", "companyAvatar",
                                 "companyNature", "companyStatus", "companyPeople",
                                 "detailUrl", "companyUrl", "dist"])

    def save_to_sql(self):
        data = self.clearData()
        for job in data:
            JobInfo.objects.create(
                title=job[0],
                address=job[1],
                type=job[2],
                educational=job[3],
                workExperience=job[4],
                workTag=job[5],
                salary=job[6],
                salaryMonth=job[7],
                companyTags=job[8],
                hrWork=job[9],
                hrName=job[10],
                pratice=job[11],
                companyTitle=job[12],
                companyAvatar=job[13],
                companyNature=job[14],
                companyStatus=job[15],
                companyPeople=job[16],
                detailUrl=job[17],
                companyUrl=job[18],
                dist=job[19]
            )
        print('Saved to the database.')
        os.remove('./temp.csv')

    def clearData(self):
        # Drop empty and duplicate rows, and normalize salaryMonth ("13薪" -> "13").
        df = pd.read_csv('./temp.csv')
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
        df['salaryMonth'] = df['salaryMonth'].map(lambda x: x.replace('薪', ''))
        print('Total rows: %d' % df.shape[0])
        return df.values

if __name__ == '__main__':
    spiderObj = spider('微信小程序', 1)
    spiderObj.init()
    spiderObj.main(page=3)  # crawl pages 1 through 3
    spiderObj.save_to_sql()
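
The JobInfo model imported from myApp.models is not shown in this section. Below is a minimal sketch of what it would need to look like: the field names are taken from the objects.create(...) call above, but every field type is an assumption, not the project's actual definition.

    # Hypothetical myApp/models.py; field types are guesses inferred from the
    # scraped values, not the project's real schema.
    from django.db import models

    class JobInfo(models.Model):
        title = models.CharField(max_length=255)           # job name
        address = models.CharField(max_length=255)         # city
        type = models.CharField(max_length=255)            # search keyword
        educational = models.CharField(max_length=255)     # education requirement
        workExperience = models.CharField(max_length=255)  # experience requirement
        workTag = models.TextField()                       # JSON list of skill tags
        salary = models.CharField(max_length=255)          # "[low, high]" range in yuan
        salaryMonth = models.CharField(max_length=255)     # extra months of pay
        companyTags = models.TextField()                   # JSON list or '无'
        hrWork = models.CharField(max_length=255)          # recruiter's role
        hrName = models.CharField(max_length=255)          # recruiter's name
        pratice = models.IntegerField()                    # 1 for internships (spelling as in the crawler)
        companyTitle = models.CharField(max_length=255)    # company name
        companyAvatar = models.TextField()                 # logo URL
        companyNature = models.CharField(max_length=255)   # company type
        companyStatus = models.CharField(max_length=255)   # financing status
        companyPeople = models.CharField(max_length=255)   # "[min, max]" headcount
        detailUrl = models.TextField()                     # job detail page URL
        companyUrl = models.TextField()                    # company page URL
        dist = models.CharField(max_length=255)            # district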
Detailed project contents in PyCharm (screenshot)

Running the project (screenshot)
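
After the crawler has finished, a quick sanity check from the Django shell (python manage.py shell) confirms that the rows landed in the database. This only uses the JobInfo fields referenced above:

    # Run inside `python manage.py shell` after an import run.
    from myApp.models import JobInfo

    print(JobInfo.objects.count())         # total imported rows
    for job in JobInfo.objects.all()[:3]:  # peek at a few records
        print(job.title, job.salary, job.companyTitle)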