赞
踩
# Scrape second-hand housing listings from wuxi.anjuke.com by driving a real
# browser with selenium.
# Versions used: selenium 3.141.0, Firefox 86.0, geckodriver 0.29.0
# geckodriver download: https://github.com/mozilla/geckodriver/releases
from selenium import webdriver
import time
import pandas as pd


def _css_text(element, css_selector):
    """Return the text of the first descendant of *element* matching *css_selector*.

    Raises IndexError when nothing matches.
    """
    return element.find_elements_by_css_selector(css_selector)[0].text


def _selenium_listing_row(house):
    """Build one result row (a dict keyed by column name) from one listing element.

    *house* is a single ``div.property`` WebElement. Every XPath below starts
    with ``.`` so it is evaluated relative to this listing. An XPath starting
    with ``//`` searches the whole page even when called on an element, which
    would force page-wide index arithmetic that shifts every later row as
    soon as one listing lacks a field.
    """
    room_spans = house.find_elements_by_xpath(
        './a/div[2]/div[1]/section/div[1]/p[1]/span')
    no_room = ''.join(span.text for span in room_spans)

    info_texts = [p.text for p in house.find_elements_by_xpath(
        './/div[@class="property-content-info"]'
        '/p[@class="property-content-info-text"]')]
    # The fields are read positionally: area, orientations, floor, year.
    # Pad with empty strings so a listing with fewer entries (the year is
    # known to be missing sometimes) yields '' instead of raising.
    # NOTE(review): if a field other than the last one is absent, the
    # remaining values still shift left - confirm against the live markup.
    info_texts += [''] * (4 - len(info_texts))
    area, orientations, floor, year = info_texts[:4]

    tag_elements = house.find_elements_by_xpath(
        './/div[@class="property-content-info"]/span')

    return {
        'title': _css_text(house, 'h3.property-content-title-name'),
        'name': _css_text(house, 'p.property-content-info-comm-name'),
        'price': _css_text(house, 'span.property-price-total-num'),
        'price_area': _css_text(house, 'p.property-price-average'),
        'no_room': no_room,
        'area': area,
        'orientations': orientations,
        'floor': floor,
        'year': year,
        # Only the first two characters of the address text are kept.
        'address': _css_text(house, 'p.property-content-info-comm-address')[:2],
        'tags': [tag.text for tag in tag_elements],
    }


driver = webdriver.Firefox(executable_path = r'D:\geckodriver.exe')  # location of geckodriver.exe
link = "https://wuxi.anjuke.com/sale/"
df = pd.DataFrame(columns=('title', 'name', 'price', 'price_area', 'no_room',
                           'area', 'orientations', 'floor', 'year', 'address',
                           'tags'))
a = 0  # number of rows stored so far; also the index label of the next row
try:
    driver.get(link)
    for x in range(50):
        print('正在爬取第%s页'%(x+1))
        for house in driver.find_elements_by_css_selector('div.property'):
            df.loc[a] = _selenium_listing_row(house)  # store as row a+1 of df
            a += 1
        time.sleep(5)  # wait 5 seconds before moving on to the next page
        next_links = driver.find_elements_by_css_selector('a.next')
        if not next_links:
            # No "next page" link: this was the last page, stop cleanly
            # instead of failing with IndexError.
            break
        next_links[0].click()  # go to the next page
finally:
    # Always shut the browser down so the Firefox/geckodriver processes do
    # not stay alive after the script ends or fails.
    driver.quit()
print('爬取完毕,一共爬取%s条数据'%(a))
# Fetch the listing pages with requests and parse them with BeautifulSoup (bs4).
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from functools import reduce

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}


def _bs4_listing_row(house):
    """Build one result row (a dict keyed by column name) from one listing tag.

    *house* is a single ``<div class="property">`` Tag. Lookups for the
    mandatory fields raise (AttributeError / IndexError) when the expected
    tag is absent; year, broker, rate and company fall back to ''.
    """
    title = house.find('div', class_ ='property-content-title').h3.text.strip()
    price = house.find('span', class_ ='property-price-total-num').text.strip()
    price_area = house.find('p', class_='property-price-average').text.strip()

    # Look the info container up once instead of once per field.
    # The fields are read from the even positions of its children.
    # NOTE(review): presumably the odd positions are whitespace/separator
    # nodes - confirm against the live markup.
    fields = house.find('div', class_='property-content-info').contents
    no_room = fields[0].text.strip()
    area = fields[2].text.strip()
    orientations = fields[4].text.strip()
    floor = fields[6].text.strip()
    try:
        year = fields[8].text.strip()
    except (IndexError, AttributeError):
        # Some listings carry no construction year.
        year = ''

    name = house.find('p', class_='property-content-info-comm-name').text.strip()
    # Only the first two characters of the address text are kept.
    address = house.find('p', class_='property-content-info-comm-address').text.strip()[:2]
    tags = [tag.text for tag in house.find_all('span', class_='property-content-info-tag')]

    # The "extra" spans are read positionally: broker, rate, company.
    # Any of them that is missing becomes ''.
    extra_texts = [span.text for span in house.find_all('span', class_='property-extra-text')]
    broker = extra_texts[0] if extra_texts else ''
    rate = extra_texts[1].strip()[:3] if len(extra_texts) > 1 else ''
    company = extra_texts[2].strip() if len(extra_texts) > 2 else ''

    return {'title':title, 'name':name, 'price':price, 'price_area':price_area,
            'no_room':no_room, 'area':area, 'orientations':orientations,
            'floor':floor, 'year':year, 'address':address, 'tags':tags,
            'broker':broker, 'rate':rate, 'company':company}


df = pd.DataFrame(columns=('title', 'name', 'price', 'price_area', 'no_room',
                           'area', 'orientations', 'floor', 'year', 'address',
                           'tags', 'broker', 'rate', 'company'))
a = 0  # number of rows stored so far; also the index label of the next row
for i in range(1,51):
    print('正在爬取第%s页'%(i))
    link = 'https://wuxi.anjuke.com/sale/p'+str(i)
    # The site asks for a captcha on the first visit: open the page manually
    # in a browser and solve the captcha once before running this script.
    # A timeout keeps a stalled connection from hanging the run forever, and
    # raise_for_status() surfaces HTTP errors instead of silently parsing an
    # error page that contains no listings.
    r = requests.get(link, headers = headers, timeout = 30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    for house in soup.find_all('div', class_="property"):
        df.loc[a] = _bs4_listing_row(house)
        a += 1
    time.sleep(5)  # pause between pages
print('爬取完毕,一共爬取%s条数据'%(a))
# Fetch the listing pages with requests and parse them with lxml (XPath).
import requests
from lxml import etree
import pandas as pd
import time

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}


def _lxml_first(values, default=''):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return values[0] if values else default


def _lxml_listing_row(house):
    """Build one result row (a dict keyed by column name) from one listing node.

    *house* is a single ``<div class="property">`` element. Every XPath is
    relative (starts with ``.``), so each value comes from this listing only.
    Page-wide parallel lists indexed by position would shift every later row
    as soon as one listing lacks an optional field such as year or company.
    Missing values become '' (or an empty list for tags).
    """
    # Join the text of each <p> so one entry corresponds to one element,
    # however many text nodes that element holds.
    info = [''.join(p.xpath('text()')).strip() for p in house.xpath(
        './/div[@class="property-content-info"]'
        '/p[@class="property-content-info-text"]')]
    # The fields are read positionally: area, orientations, floor, year.
    # Pad so a listing with fewer entries yields '' instead of raising.
    # NOTE(review): if a field other than the last one is absent, the
    # remaining values still shift left - confirm against the live markup.
    info += [''] * (4 - len(info))

    return {
        'title': _lxml_first(house.xpath('.//h3[@class="property-content-title-name"]/text()')),
        'name': _lxml_first(house.xpath('.//p[@class="property-content-info-comm-name"]/text()')),
        'price': _lxml_first(house.xpath('.//span[@class="property-price-total-num"]/text()')),
        'price_area': _lxml_first(house.xpath('.//p[@class="property-price-average"]/text()')),
        # The room description is split over several <span>s; concatenate them.
        'no_room': ''.join(house.xpath('./a/div[2]/div[1]/section/div[1]/p[1]/span/text()')),
        'area': info[0],
        'orientations': info[1],
        'floor': info[2],
        'year': info[3],
        'address': _lxml_first(house.xpath('.//p[@class="property-content-info-comm-address"]/span[1]/text()')),
        'tags': house.xpath('.//div[@class="property-content-info"]/span/text()'),
        # The "extra" spans are read positionally: broker, rate, company.
        'broker': _lxml_first(house.xpath('.//span[@class="property-extra-text"][1]/text()')),
        'rate': _lxml_first(house.xpath('.//span[@class="property-extra-text"][2]/text()')),
        'company': _lxml_first(house.xpath('.//span[@class="property-extra-text"][3]/text()')),
    }


df = pd.DataFrame(columns=('title', 'name', 'price', 'price_area', 'no_room',
                           'area', 'orientations', 'floor', 'year', 'address',
                           'tags', 'broker', 'rate', 'company'))
a = 0  # number of rows stored so far; also the index label of the next row
for x in range(50):
    print('正在爬取第%s页'%(x+1))
    time.sleep(5)  # pause before each request
    link = "https://wuxi.anjuke.com/sale/p"+str(x+1)
    # A timeout keeps a stalled connection from hanging the run forever, and
    # raise_for_status() surfaces HTTP errors instead of silently parsing an
    # error page that contains no listings.
    r = requests.get(link, headers= headers, timeout= 30)
    r.raise_for_status()
    html = etree.HTML(r.text)
    for house in html.xpath('//div[@class="property"]'):
        df.loc[a] = _lxml_listing_row(house)
        a += 1
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。