
Scraping Anjuke Wuxi Second-Hand Housing Data


URL: https://wuxi.anjuke.com/sale/

Scraping with a Selenium-driven browser

# Scrape by driving a real browser with Selenium
# selenium 3.141.0
# Firefox 86.0
# geckodriver 0.29.0
# geckodriver download: https://github.com/mozilla/geckodriver/releases
from selenium import webdriver
import time
import pandas as pd
driver = webdriver.Firefox(executable_path = r'D:\geckodriver.exe') # location of geckodriver.exe
link = "https://wuxi.anjuke.com/sale/"
driver.get(link)
df = pd.DataFrame(columns=('title', 'name', 'price', 'price_area', 'no_room', 'area', 'orientations', 'floor', 'year', 'address', 'tags'))
a = 0
for x in range(50):
    print('Scraping page %s' % (x+1))
    house_list = driver.find_elements_by_css_selector('div.property')
    for i in range(len(house_list)):
        title = house_list[i].find_elements_by_css_selector('h3.property-content-title-name')[0].text
        name = house_list[i].find_elements_by_css_selector('p.property-content-info-comm-name')[0].text
        price = house_list[i].find_elements_by_css_selector('span.property-price-total-num')[0].text
        price_area = house_list[i].find_elements_by_css_selector('p.property-price-average')[0].text
        # XPaths starting with // match across the whole page, so the flat
        # result lists are sliced by listing: 6 room spans and 4 info fields each
        no_rooms = house_list[i].find_elements_by_xpath('//div[@class="property"]/a/div[2]/div[1]/section/div[1]/p[1]/span')
        no_room = ''.join(span.text for span in no_rooms[6*i:6*i+6])
        infos = house_list[i].find_elements_by_xpath('//div[@class="property-content-info"]/p[@class="property-content-info-text"]')
        area = infos[4*i].text
        orientations = infos[4*i+1].text
        floor = infos[4*i+2].text
        try:
            year = infos[4*i+3].text
        except IndexError: # some listings omit the build year
            year = ''
        address = house_list[i].find_elements_by_css_selector('p.property-content-info-comm-address')[0].text[:2]
        # the positional predicate [i+1] scopes the document-wide XPath to listing i
        tag_list = house_list[i].find_elements_by_xpath('//div[@class="property"]['+str(i+1)+']//div[@class="property-content-info"]/span')
        tags = [tag.text for tag in tag_list]
        row = {'title':title, 'name':name, 'price':price, 'price_area':price_area, 'no_room':no_room, 'area':area, 'orientations':orientations, 'floor':floor, 'year':year, 'address':address, 'tags':tags}
        df.loc[a] = row # store the record as row a of df
        a += 1
    time.sleep(5) # wait 5 seconds before moving to the next page
    next_page = driver.find_elements_by_css_selector('a.next')[0] # the "next page" link
    next_page.click()
print('Done, scraped %s records in total' % (a))
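
When the loop finishes, the DataFrame only lives in memory. A minimal follow-up, assuming an arbitrary output filename, saves the results and releases the browser:

# Persist the scraped records and shut the browser down
# ('anjuke_wuxi.csv' is an arbitrary name; utf-8-sig keeps Chinese text readable in Excel)
df.to_csv('anjuke_wuxi.csv', index=False, encoding='utf-8-sig')
driver.quit()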

Fetching pages with requests and parsing with bs4

# Fetch pages with requests and parse them with bs4
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
df = pd.DataFrame(columns=('title', 'name', 'price', 'price_area', 'no_room', 'area', 'orientations', 'floor', 'year', 'address', 'tags', 'broker', 'rate', 'company'))
a = 0
for i in range(1, 51):
    print('Scraping page %s' % (i))
    # the site asks for a CAPTCHA on first visit, so open the page in a
    # browser and solve it manually before running this script
    link = 'https://wuxi.anjuke.com/sale/p' + str(i)
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    house_list = soup.find_all('div', class_="property")
    for house in house_list:
        title = house.find('div', class_='property-content-title').h3.text.strip()
        price = house.find('span', class_='property-price-total-num').text.strip()
        price_area = house.find('p', class_='property-price-average').text.strip()
        # the fields sit at indices 0/2/4/6/8 of .contents, skipping separator nodes
        info = house.find('div', class_='property-content-info')
        no_room = info.contents[0].text.strip()
        area = info.contents[2].text.strip()
        orientations = info.contents[4].text.strip()
        floor = info.contents[6].text.strip()
        try:
            year = info.contents[8].text.strip()
        except Exception: # some listings omit the build year
            year = ''
        name = house.find('p', class_='property-content-info-comm-name').text.strip()
        address = house.find('p', class_='property-content-info-comm-address').text.strip()[:2]
        tag_list = house.find_all('span', class_='property-content-info-tag')
        tags = [tag.text for tag in tag_list]
        broker_list = house.find_all('span', class_='property-extra-text')
        broker = broker_list[0].text
        rate = broker_list[1].text.strip()[:3]
        try:
            company = broker_list[2].text.strip()
        except IndexError: # some listings carry no agency name
            company = ''
        row = {'title':title, 'name':name, 'price':price, 'price_area':price_area,
               'no_room':no_room, 'area':area, 'orientations':orientations,
               'floor':floor, 'year':year, 'address':address, 'tags':tags,
               'broker':broker, 'rate':rate, 'company':company}
        df.loc[a] = row
        a += 1
    time.sleep(5) # wait 5 seconds between pages
print('Done, scraped %s records in total' % (a))
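
A fixed 5-second delay is often not enough to avoid Anjuke's anti-scraping checks. Below is a small hardening sketch; the helper name, back-off values, and the 'property' marker used to detect a real listing page are assumptions, not tested values:

import random

def polite_get(url, headers, retries=3):
    # hypothetical helper: retry with randomized back-off; a response is only
    # accepted if it contains the listing markup (the 'property' class name)
    for attempt in range(retries):
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200 and 'property' in r.text:
            return r
        time.sleep(5 + random.uniform(0, 5)) # back off before retrying
    return r # hand back the last response so the caller can inspect it

With this in place, `r = requests.get(link, headers=headers)` in the loop above can be swapped for `r = polite_get(link, headers)`.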

Parsing pages with lxml (XPath)

# Parse pages with lxml (XPath)
import requests
from lxml import etree
import pandas as pd
import time
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
df = pd.DataFrame(columns=('title', 'name', 'price', 'price_area', 'no_room', 'area', 'orientations', 'floor', 'year', 'address', 'tags', 'broker', 'rate', 'company'))
a = 0
for x in range(50):
    print('Scraping page %s' % (x+1))
    time.sleep(5)
    link = "https://wuxi.anjuke.com/sale/p" + str(x+1)
    r = requests.get(link, headers=headers)
    html = etree.HTML(r.text)
    # every XPath below returns one flat list for the whole page,
    # so the per-listing fields are matched up by position afterwards
    title_list = html.xpath('//h3[@class="property-content-title-name"]/text()')
    name = html.xpath('//p[@class="property-content-info-comm-name"]/text()')
    price = html.xpath('//span[@class="property-price-total-num"]/text()')
    price_area = html.xpath('//p[@class="property-price-average"]/text()')
    no_rooms = html.xpath('//div[@class="property"]/a/div[2]/div[1]/section/div[1]/p[1]/span/text()')
    infos = html.xpath('//div[@class="property-content-info"]/p[@class="property-content-info-text"]/text()')
    info = [i.strip() for i in infos]
    address = html.xpath('//p[@class="property-content-info-comm-address"]/span[1]/text()')
    broker = html.xpath('//span[@class="property-extra-text"][1]/text()')
    rate = html.xpath('//span[@class="property-extra-text"][2]/text()')
    company = html.xpath('//span[@class="property-extra-text"][3]/text()')
    for i in range(len(title_list)):
        tags = html.xpath('//div[@class="property"]['+str(i+1)+']//div[@class="property-content-info"]/span/text()')
        no_room = ''.join(no_rooms[6*i:6*i+6]) # six spans per listing form the layout string
        row = {'title':title_list[i], 'name':name[i], 'price':price[i],
               'price_area':price_area[i], 'no_room':no_room, 'area':info[4*i],
               'orientations':info[4*i+1], 'floor':info[4*i+2], 'year':info[4*i+3],
               'address':address[i], 'tags':tags, 'broker':broker[i],
               'rate':rate[i], 'company':company[i]}
        df.loc[a] = row
        a += 1
print('Done, scraped %s records in total' % (a))
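
The 4*i / 6*i position matching breaks as soon as one listing omits a field: a missing build year shifts every later index. A more robust variant, sketched here reusing the html object from the loop above and assuming the same class names (the relative room-layout path is adapted from the absolute one and is an assumption), scopes each query to a single listing node:

# Sketch: per-listing relative XPath, so a missing field only affects its own row
properties = html.xpath('//div[@class="property"]')
for prop in properties:
    # './/' keeps each query inside this listing's subtree
    fields = [t.strip() for t in prop.xpath('.//p[@class="property-content-info-text"]/text()')]
    year = fields[3] if len(fields) > 3 else '' # the build year is optional
    no_room = ''.join(prop.xpath('.//section/div[1]/p[1]/span/text()'))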