赞
踩
本文简单介绍如何使用Python爬取搜房网二手房数据,并保存到MySQL数据库以备深入分析和应用。
Python爬虫有很多第三方库或者框架可使用,本文使用到的库主要有 requests、BeautifulSoup4、MySQLdb。
闲话少说，直接上代码，相关说明已写在代码注释中。
Python代码:
- # -*- coding:utf-8 -*-
- ############################################################################
- '''
- # 程序:上海搜房网爬虫
- # 功能:抓取上海搜房网二手房在售、成交数据
- # 创建时间:2017/01/03
- # 更新历史:2017/01/07 增加多城市处理、随机Header;
- # 增加爬取城市URL信息;封装为类,补充注释和日志
- #
- # 使用库:requests、BeautifulSoup4、MySQLdb
- # 作者:yuzhucu
- '''
- #############################################################################
- import requests
- from bs4 import BeautifulSoup
- import lxml
- import time
- import random
- import MySQLdb
-
def randHeader():
    '''
    Build HTTP request headers with a randomly chosen User-Agent.

    Connection, Accept and Accept-Language are fixed values; only the
    User-Agent differs between calls. It is drawn uniformly from a built-in
    pool of browser identification strings.

    :return: dict with the keys 'Connection', 'Accept', 'Accept-Language'
             and 'User-Agent', suitable for passing as request headers.
    '''
    # Pool of browser identification strings to rotate through. The strings
    # are reproduced verbatim (including trailing spaces), since they are
    # sent over the wire as-is.
    user_agents = (
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    )
    return {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        # random.choice replaces manual indexing via randrange(0, len(...)).
        'User-Agent': random.choice(user_agents),
    }
-
- def getCurrentTime():
- # 获取当前时间
- return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localti
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。