赞
踩
代码原创性问题
以下例程代码全部由本人所编写,供大家学习Python爬虫,以及练习Python操作Mysql数据库使用。
适用范围
这个小项目是本人才接触Python爬虫的时候所编写,出发点是熟悉爬虫数据爬取的特性,以及结合Mysql数据库的简单应用,所以功能上较为简单。代码注释较少但编写以见名知意为原则,并且用到的函数与逻辑均较为简单,本人认为不会过分影响阅读及学习。
适合才接触爬虫的新手学习,但需要一定耐心。
该例程可以做更多的事吗?
完全可以。
虽然该爬虫是一个单线程最基本的爬虫,但是对数据爬取的封装是独立分割成不同函数的,读者可以根据自己的学习以及使用需要对其进行增改,我相信对于准备接触数据挖掘、ML或者DL的小伙伴来说是一个不错的小工具。
import time      # DouBan.__init__ 中使用 time.ctime(),原代码缺少此导入

import MySQLdb   #用来连接Mysql数据库
import requests  #用来爬取数据
import bs4       #用来处理爬取到的数据
注意
`connect` 函数中 `MySQLdb.connect(...)` 的连接参数(主机、用户名、密码、数据库名、端口)需要针对自己的 MySQL 配置进行修改。
# Thin wrapper around the MySQLdb API with a simple connection-state guard.
class ToLocalMysql(object):
    def __init__(self):
        # 0 = not connected, 1 = connected; guards execute()/close().
        self.__flag = 0

    def connect(self):
        """Open a connection to the local MySQL server and create a cursor.

        NOTE(review): host/user/password/db/port are hard-coded for a
        local default setup -- adjust them to your own MySQL configuration.
        """
        self.conn = MySQLdb.connect('127.0.0.1', 'root', '', '', 3306,
                                    charset='utf8')
        self.cursor = self.conn.cursor()
        self.cursor.execute("set names utf8;")
        self.__flag = 1

    def execute(self, sql):
        """Execute *sql* and return the result set as a list of row lists.

        Each fetched row is also printed.  When no connection has been
        established an error is reported and an empty list is returned
        (the original implicitly returned None here, so callers could
        not safely iterate the result).
        """
        if self.__flag != 1:
            print('execute() error : 连接未建立')
            return []
        self.cursor.execute(sql)
        rows = []
        for fetched in self.cursor.fetchall():
            row = list(fetched)  # copy the tuple into a mutable list
            print(row)
            rows.append(row)
        return rows

    def close(self):
        """Close the cursor and the connection if currently connected."""
        if self.__flag == 1:
            # BUG FIX: the cursor was previously leaked -- only the
            # connection itself was closed.
            self.cursor.close()
            self.conn.close()
            self.__flag = 0
# Scraper for one page of the Douban Top-250 movie chart.
class DouBan(object):
    def __init__(self, httpAddress):
        """Fetch *httpAddress* and parse the HTML into a BeautifulSoup tree."""
        print("豆瓣类创建成功", time.ctime())
        originData = requests.get(httpAddress)
        self.soup = bs4.BeautifulSoup(originData.text, 'lxml')

    def find(self, keyWord, class_=""):
        """Return the .string of every <keyWord class=class_> tag,
        skipping entries that start with a non-breaking space (chr 160)."""
        out = []
        for tag in self.soup.find_all(keyWord, class_):
            text = tag.string
            # BUG FIX: tag.string is None for tags with nested children;
            # the original crashed on text[0] in that case.
            if text and text[0] != chr(160):
                out.append(text)
        return out

    def getNamegeOfSiglePage(self):
        """Movie titles on this page (<span class="title">)."""
        return self.find('span', class_="title")

    def getScoreOfSiglePage(self):
        """Rating scores on this page (<span class="rating_num">)."""
        return self.find('span', class_="rating_num")

    def getYearOfSiglePage(self):
        """Release years, sliced out of the info <p> of each entry.

        NOTE(review): the [1:5] slice assumes the fixed whitespace layout
        of Douban's "year / country / genre" line -- brittle, verify
        against the live page markup.
        """
        out = []
        for tag in self.soup.find_all('p', class_=""):
            out.append(tag.contents[2].replace(chr(32), "")[1:5])
        return out

    def getCountryOfSiglePage(self):
        """Production countries, sliced from the same info line.

        NOTE(review): [8:10] assumes a two-character country name at a
        fixed offset -- confirm for entries with longer country fields.
        """
        out = []
        for tag in self.soup.find_all('p', class_=""):
            out.append(tag.contents[2].replace(chr(32), "")[8:10])
        return out

    def getTypeOfSiglePage(self):
        """Genres: third '/'-separated field of the info line, trimmed.

        NOTE(review): the [1:-25] slice is tied to the exact trailing
        text length of the page -- verify against the live markup.
        """
        out = []
        for tag in self.soup.find_all('p', class_=""):
            out.append((tag.contents[2].split("/")[2])[1:-25])
        return out

    def getDirectorOfSiglePage(self):
        """Director names: text between '导演' and '主演' in the info <p>."""
        out = []
        for tag in self.soup.find_all('p', class_=""):
            text = tag.contents[0]
            start = text.find("导演")
            end = text.find("主演")
            out.append(text[start:end].split(" ")[1])
        return out

    def getNumOfJudgeOfSiglePage(self):
        """Vote counts: <span> texts ending in '评价', with that suffix cut."""
        out = []
        for tag in self.soup.find_all('span', class_=""):
            contents = tag.contents
            if contents != []:
                text = str(contents[0])
                # keep only strings like "1234567人评价" -> "1234567"
                if len(text) > 3 and text[-1] == "价":
                    out.append(text[:-3])
        return out
这里需要提前在Mysql数据库中建好表,表的列信息参考列名,也可以自行增改。
#传入库名和表名(已经存在)
#列名为(name, year, country, kind, drictor, score, judges)
def get_douban_top250_data(db, table):
    """Scrape all 10 pages of the Douban Top-250 chart and insert each
    movie into MySQL table *table* inside database *db* (both must
    already exist, with columns name/year/country/kind/drictor/score/judges).
    """
    # BUG FIX: the original began with `del mysql` on a name that did not
    # exist yet (immediate NameError), printed the object, and called
    # close() before any connection was opened; all three are removed.
    mysql = ToLocalMysql()
    mysql.connect()
    mysql.execute("use %s;" % db)
    for loop in range(10):
        # first page has no query string; later pages paginate by 25
        if loop == 0:
            address = 'https://movie.douban.com/top250'
        else:
            address = "https://movie.douban.com/top250?start=%d&filter=" % (loop * 25)
        douban = DouBan(address)
        name = douban.getNamegeOfSiglePage()
        year = douban.getYearOfSiglePage()
        country = douban.getCountryOfSiglePage()
        director = douban.getDirectorOfSiglePage()
        score = douban.getScoreOfSiglePage()
        jugesNum = douban.getNumOfJudgeOfSiglePage()
        kind = douban.getTypeOfSiglePage()
        for i in range(len(name)):
            print(str(name[i]), str(year[i]), str(country[i]), str(kind[i]),
                  str(director[i]), float(score[i]), int(jugesNum[i]))
            # BUG FIX: MySQL identifiers must be quoted with backticks,
            # not single quotes -- `insert into '%s'` is a syntax error.
            # NOTE(review): values are still string-formatted into the
            # SQL; switch to parameterized queries if any field may
            # contain quote characters.
            mysql.execute("insert into `%s` "
                          "(name, year, country, kind, drictor, score, judges) "
                          "values('%s', '%s', '%s', '%s', '%s', %f, %d);"
                          % (table, str(name[i]), str(year[i]),
                             str(country[i]), str(kind[i]),
                             str(director[i]), float(score[i]), int(jugesNum[i])))
        # commit once per scraped page
        mysql.conn.commit()
    mysql.close()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。