- Choosing a candidate data site
- Selection principle: the stock information should sit statically in the HTML page, not be generated by JavaScript, and not be disallowed by the site's Robots protocol (a quick check is sketched right after this list)
- How to verify: the browser's F12 developer tools, viewing the page source, etc.
-
- Program structure:
- Step 1: fetch the stock list from Eastmoney (东方财富网)
- Step 2: using that list, fetch each stock's detail page one by one (Baidu Gupiao in the referenced course; the code below uses Eastmoney and quote.cfi.cn)
- Step 3: write the results to a file
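A quick way to apply the selection principle, sketched here against the banban.cn list page used later in this post (the robots.txt URL and the sample code 000001 are assumptions for illustration): robots.txt is queried through the standard library's robotparser, and the raw HTML is searched for a known stock code to confirm the data is server-rendered rather than built by JavaScript.

import requests
from urllib.robotparser import RobotFileParser

list_url = "https://www.banban.cn/gupiao/list_sz.html"

# 1. Robots protocol: ask the site's robots.txt whether a crawler may fetch this page
rp = RobotFileParser()
rp.set_url("https://www.banban.cn/robots.txt")
rp.read()
print("robots.txt allows fetching:", rp.can_fetch("*", list_url))

# 2. Static HTML: if a known stock code already appears in the raw response text,
#    the data is present in the HTML and requests + BeautifulSoup are enough
r = requests.get(list_url, timeout=30)
r.raise_for_status()
print("000001 found in raw HTML:", "000001" in r.text)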
Reference video: https://www.bilibili.com/video/av9784617?p=47
Code:
import requests
import time
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code  # force the encoding; to auto-detect it instead: r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find_all('a')  # each i below is a bs4.element.Tag; a substring test like "连板" in str(i) could filter links
    for i in a:
        try:
            href = i.attrs['href']
            # Extract the six-digit stock code from the link target.
            # Note: re.findall returns a list such as ['002656'], so take [0]
            # before appending; otherwise the whole list would be appended.
            lst.append(re.findall(r"\d{6}", href)[0])
        except Exception:
            continue  # links without an href or without a code are skipped

def getStockInfo(lst, stockURL, fpath):
    count = 0  # counter for the progress display
    for stock in lst:
        url = stockURL + stock + ".html"  # build the per-stock URL
        html = getHTMLText(url)           # fetch the stock page
        try:
            if html == "":  # skip empty pages
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')  # parse the page
            text = soup.text
            # regex for the stock name (first run of word/Chinese characters)
            stockName = re.search(r'[\w\u4e00-\u9fcc]+', text).group(0)
            # regex for the stock code (six digits)
            stockNumber = re.search(r'[0-9]\d{5}', text).group(0)
            # regex for the "个股日历" (per-stock calendar) block: the label plus everything after it
            p = re.compile(r"个股日[\u4e00-\u9fa5]+[\s\S]+")
            stockHistory = p.findall(text)
            print(stockHistory)
            stockHistory1 = re.split(r" +", stockHistory[0])  # split on runs of spaces
            print(stockHistory1[0])
            # drop the "个股日历" label itself
            stockHistory2 = stockHistory1[0].replace("个股日历", "")
            print(stockHistory2)
            # replace line breaks with ';' so each record stays on one line in the output file
            stockHistory3 = stockHistory2.replace("\n", ";", 100).replace("\r", ";")
            print(str(stockHistory3))

            infoDict.update({'股票名称': stockName})      # stock name
            infoDict.update({'股票代码': stockNumber})    # stock code
            infoDict.update({'股票日历': stockHistory3})  # stock calendar (the cleaned, single-line version)
            # For pages that expose the fields as <dt>/<dd> pairs, the data could
            # instead be rebuilt into key/value pairs:
            # keyList = stockInfo.find_all('dt')    # keys
            # valueList = stockInfo.find_all('dd')  # values
            # for i in range(len(keyList)):
            #     key = keyList[i].text
            #     val = valueList[i].text
            #     infoDict[key] = val  # assigning key = value adds the entry to the dict
            # append this stock's record to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1  # advance the progress counter
            # '\r' moves the cursor back to the start of the line, so each print
            # overwrites the previous one: a single-line, dynamically updating progress display
            print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            # pause 10 seconds between requests
            time.sleep(10)
        except Exception:
            count = count + 1  # keep the progress counter accurate even on failure
            print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue


if __name__ == '__main__':
    # stock_list_url = "https://www.banban.cn/gupiao/list_sh.html"
    stock_list_url = "https://www.banban.cn/gupiao/list_sz.html"
    # stock_info_url = "http://quote.eastmoney.com/sh"  # sh = Shanghai, sz = Shenzhen
    stock_info_url = "http://quote.eastmoney.com/sz"
    output_file = "D://pythontest/files/gupiao/gupiao20191212.txt"
    # slist = ['002656', '002702', '000001', '000002']
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
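Each run appends one str(dict) per line to the output file, so the records can be parsed back with ast.literal_eval. A minimal sketch, assuming the output path above and at least one record written:

import ast

records = []
with open("D://pythontest/files/gupiao/gupiao20191212.txt", encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(ast.literal_eval(line))  # parse the dict literal back into a dict

print(len(records), "records loaded")
print(records[0]['股票名称'], records[0]['股票代码'])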
Example: scraping multiple fields and writing them to MySQL:
import requests
from bs4 import BeautifulSoup
import traceback
import re
# import the MySQL helper module (shown further below) under the alias ToMySql,
# so ToMySql.writeDb(...) resolves to the module-level writeDb function
from 基础用法 import toMysql as ToMySql

def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code  # force the encoding; to auto-detect it instead: r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            # re.findall returns a list such as ['002656'], so take [0] before
            # appending; otherwise the whole list would be appended
            lst.append(re.findall(r"\d{6}", href)[0])
        except Exception:
            continue

def fieldValue(s):
    # the quote pages use either a full-width or an ASCII colon, matching the
    # [::] class in the patterns below, so split on both and keep the value part
    return re.split(r"[::]", s, maxsplit=1)[1]

def getStockInfo(lst, stockURL, fpath):
    count = 0  # counter for the progress display
    for stock in lst:
        url = stockURL + stock + ".html"  # build the per-stock URL
        html = getHTMLText(url)           # fetch the stock page
        print('==================')
        print(url)
        try:
            if html == "":  # skip empty pages
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')  # parse the page
            text = soup.text
            # regex for the stock name
            stockName = re.search(r'[\w\u4e00-\u9fcc]+', text).group(0)
            # regex for the stock code (six digits)
            stockNumber = re.search(r'[0-9]\d{5}', text).group(0)
            # one pattern per quote field: the label, a colon (full- or half-width), then a decimal number
            jk = re.compile(r"今开[::][0-9]*[.][0-9]*")     # today's open
            zs = re.compile(r"昨收[::][0-9]*[.][0-9]*")     # previous close
            zg = re.compile(r"最高[::][0-9]*[.][0-9]*")     # day high
            zd = re.compile(r"最低[::][0-9]*[.][0-9]*")     # day low
            hsl = re.compile(r"换手率[::][0-9]*[.][0-9]*")  # turnover rate
            syl = re.compile(r"市盈率[::][0-9]*[.][0-9]*")  # P/E ratio

            jkV = jk.findall(text)
            zsV = zs.findall(text)
            zgV = zg.findall(text)
            zdV = zd.findall(text)
            hslV = hsl.findall(text)
            sylV = syl.findall(text)

            hslV = str(hslV[0]) + "%"  # the turnover rate is a percentage
            print(jkV[0])
            print(fieldValue(jkV[0]))
            print(zsV[0])
            print(zgV[0])
            print(zdV[0])
            print(str(hslV))
            print(sylV[0])

            # write the record to MySQL
            sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
            data = (stockName, stockNumber, fieldValue(jkV[0]), fieldValue(zsV[0]),
                    fieldValue(zgV[0]), fieldValue(zdV[0]), fieldValue(hslV),
                    fieldValue(sylV[0]))  # numeric values would also be accepted
            result = ToMySql.writeDb(sql, data)

            # also write the record to the output file
            infoDict.update({'名称': stockName})             # name
            infoDict.update({'代码': stockNumber})           # code
            infoDict.update({'今开': fieldValue(jkV[0])})    # today's open
            infoDict.update({'昨收': fieldValue(zsV[0])})    # previous close
            infoDict.update({'最高': fieldValue(zgV[0])})    # day high
            infoDict.update({'最低': fieldValue(zdV[0])})    # day low
            infoDict.update({'换手率': fieldValue(hslV)})    # turnover rate
            infoDict.update({'市盈率': fieldValue(sylV[0])}) # P/E ratio

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1  # advance the progress counter
            # '\r' returns the cursor to the start of the line so each print
            # overwrites the previous one: a single-line progress display
            print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            # optionally pause 10 seconds between requests
            # time.sleep(10)
        except Exception:
            count = count + 1  # keep the progress counter accurate even on failure
            print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue


if __name__ == '__main__':
    # stock_list_url = "https://www.banban.cn/gupiao/list_sh.html"
    stock_list_url = "https://www.banban.cn/gupiao/list_sz.html"
    # stock_info_url = "http://quote.eastmoney.com/sh"  # sh = Shanghai, sz = Shenzhen
    stock_info_url = "http://quote.cfi.cn/quote_"
    output_file = "D://pythontest/files/gupiao/我的股票信息.csv"
    # slist = ['002656', '002702', '000001', '000002']
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
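Note that the output file is named .csv but, as written, holds one Python dict repr per line rather than comma-separated rows. If a real CSV is wanted, a minimal sketch with the standard csv module (the column names come from the code above; the destination path is a hypothetical example):

import ast
import csv

src = "D://pythontest/files/gupiao/我的股票信息.csv"       # dict-repr lines written above
dst = "D://pythontest/files/gupiao/stock_infos_real.csv"   # hypothetical output path
fields = ['名称', '代码', '今开', '昨收', '最高', '最低', '换手率', '市盈率']

with open(src, encoding='utf-8') as fin, open(dst, 'w', newline='', encoding='utf-8') as fout:
    writer = csv.DictWriter(fout, fieldnames=fields)
    writer.writeheader()
    for line in fin:
        if line.strip():
            writer.writerow(ast.literal_eval(line))  # one dict repr per line -> one CSV row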
The wrapped MySQL write helper (the toMysql module imported above):
import pymysql
import logging

db_name = 'python'
db_user = 'root'
db_pass = 'root'
db_ip = '127.0.0.1'
db_port = 3306


# write data to the database
def writeDb(sql, db_data=()):
    """
    Open a MySQL connection and execute a write operation.
    """
    try:
        conn = pymysql.connect(db=db_name, user=db_user, passwd=db_pass,
                               host=db_ip, port=int(db_port), charset="utf8")
        cursor = conn.cursor()
    except Exception as e:
        print(e)
        logging.error('database connection failed: %s' % e)
        return False

    try:
        cursor.execute(sql, db_data)
        conn.commit()
    except Exception as e:
        conn.rollback()  # undo the partial write on failure
        logging.error('database write failed: %s' % e)
        return False
    finally:
        cursor.close()
        conn.close()
    return True


if __name__ == '__main__':
    # demo inserts; guarded so they don't run when this module is imported by the scraper
    # sql = """ INSERT INTO user(email,last_name) VALUES(%s,%s) """
    # data = ("632443020@qq.com", "男")
    # result = writeDb(sql, data)

    sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
    data = ("1", "2", "3", "4", "5", "6", "7", "8")  # numeric types would also be accepted
    result = writeDb(sql, data)
The stock_infos table:
CREATE TABLE stock_infos
(
    id INT NOT NULL AUTO_INCREMENT COMMENT 'primary key',
    stock_name VARCHAR(30),
    stock_code VARCHAR(30),
    jk VARCHAR(10) COMMENT 'today open (今开)',
    zs VARCHAR(10) COMMENT 'previous close (昨收)',
    zg VARCHAR(10) COMMENT 'day high (最高)',
    zd VARCHAR(10) COMMENT 'day low (最低)',
    hsl VARCHAR(10) COMMENT 'turnover rate (换手率)',
    syl VARCHAR(10) COMMENT 'P/E ratio (市盈率)',
    PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
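After a run, a quick sanity check that rows actually landed in the table; a minimal sketch in Python, reusing the connection settings from the helper above:

import pymysql

conn = pymysql.connect(db='python', user='root', passwd='root',
                       host='127.0.0.1', port=3306, charset="utf8")
try:
    cursor = conn.cursor()
    cursor.execute("SELECT stock_name, stock_code, jk, zs FROM stock_infos ORDER BY id DESC LIMIT 5")
    for row in cursor.fetchall():  # the five most recently inserted rows
        print(row)
finally:
    conn.close()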