基于Request+bs4-re技术路线实现股票数据定向爬虫_股票数据定向爬虫python技术路线:requests‐bs4‐re

作者：IT小白 | 2024-07-13 23:16:28

踩

股票数据定向爬虫python技术路线:requests‐bs4‐re

基于requests+bs4+re技术路线实现股票数据定向爬虫

一、功能描述
二、程序结构设计
三、程序整体框架
四、程序功能的实现
五、程序的优化
六、代码的进一步优化
七、心得体会
八、参考资料

一、功能描述

1. 目标

获取上交所和深交所所有股票的名称和交易信息，并保存到txt文件中。

2. 可行性分析

查看网页源代码，确认所需要的信息是否直接写在HTML源代码中（而不是由js代码动态生成），从而选择不同的技术路线。
查看相应的根目录下的/robots.txt协议，查看允许爬取的权限，以及是否需要伪装成浏览器。
不要纠结于某个网站，多找信息源尝试
经过分析，新浪股票网是使用的js生成的，而百度股票网是静态的，所以选择爬取百度股票网上股票数据。
然后股票的列表我们从东方财富网获得。
从而确定了数据源：
获取股票列表：
东方财富网：http://quote.eastmoney.com/stocklist.html
获取个股信息：
百度股票：https://gupiao.baidu.com/stock/
单个股票：https://gupiao.baidu.com/stock/sz002439.html

3. 技术路线选择

根据前面的分析，以及目前掌握的技术，所以采用requests+bs4+re的技术路线进行实现。

4. 项目的技术难点

分析确定数据源
提取股票列表的数据
提取单个股票信息的数据
数据的存储方式

二、程序结构设计

Step1：从东方财富网获取股票列表——getHTMLText() getStockList(lst,stockURL)
Step2：根据股票列表逐个到百度股票获取个股信息——getStockInfo(lst,stockURL,fpath)
Step3：将结果存储到文件

三、程序整体框架

写出程序的框架

import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url): # skeleton stub: will fetch the page at url and return its text
    return ""

def getStockList(lst, stockURL): # skeleton stub: will collect the stock codes found at stockURL into lst
    return ""

def getStockInfo(lst, stockURL, fpath): # skeleton stub: will fetch each stock in lst and save its data to fpath
    return ""

def main(): # skeleton stub: will wire the steps above together
    pass

main() # entry point; runs as soon as the file is executed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
'运行

四、程序功能的实现

实现每一个函数的功能，使得程序能够运行得到我们想要的结果。

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import traceback # 追踪，便于调试

def getHTMLText(url): # Fetch the page at url and return its text ("" on any failure)
    try:
        r = requests.get(url)
        r.raise_for_status() # turn HTTP error statuses into exceptions so they reach the except branch
        r.encoding = r.apparent_encoding # decode with the encoding requests guesses from the page content
        return r.text
    except: # bare except: every failure is reported to the caller as ""
        return ""
    
def getStockList(lst,stockURL): # Append the stock codes found on the stockURL page to lst (in place)
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html,"html.parser")
    a = soup.find_all('a') # the codes sit in the href attribute of <a> tags, so collect every <a> tag first
    for i in a:
        try:
            href = i.attrs['href'] # value of the href attribute
            lst.append(re.findall(r"s[hz]\d{6}",href)[0]) # first "sh"/"sz" + six digits match is the stock code
        except: # <a> tags without an href, or whose href holds no code, raise here and are skipped
            continue

def getStockInfo(lst,stockURL,fpath): # Fetch every stock's page and append its data to the file fpath
    for stock in lst:
        url = stockURL + stock + ".html" # URL of this stock's detail page
        html = getHTMLText(url) # page text, or "" if the request failed
        try:
            if html == "": # page could not be fetched or is empty: skip this stock
                continue
            infoDict = {} # the stock's data is kept in a dict, convenient for later analysis
            soup = BeautifulSoup(html,"html.parser")
            stockInfo = soup.find('div',attrs = {'class':'stock-bets'}) # tag holding the stock data; NOTE: no None check yet, so a page without this div raises AttributeError on the next statement

            name = stockInfo.find_all(attrs={'class':'bets-name'})[0] # tag holding the stock name; find_all() returns a bs4.element.ResultSet, hence the [0]
            # name.string cannot be used: the tag has two string children, so .string would be None (see the bs4 help text)
            infoDict.update({'股票名称': name.text.split()[0]}) # .text joins all strings inside the tag; split on whitespace and keep the first piece as the name

            # Collect the remaining key/value pairs
            keyList = stockInfo.find_all('dt') # keys are in <dt> tags
            valueList = stockInfo.find_all('dd') # values are in <dd> tags
            for i in range(len(keyList)): # pair each <dt> with the <dd> at the same index
                key = keyList[i].text # keyList[i].string would also work here
                val = valueList[i].text # matching value
                infoDict[key] = val # add the pair to the dict
            
            with open(fpath, 'a', encoding='utf-8') as f:# append mode; utf-8 because the data contains Chinese text
                f.write( str(infoDict) + '\n' ) # one dict per line
        except:
            traceback.print_exc() # print the traceback to help debugging
            continue
                
def main(): # Collect the stock codes, then fetch and save each stock's data
    stock_list_url = "http://quote.eastmoney.com/stocklist.html" # page the stock codes are read from
    stock_info_url = "https://gupiao.baidu.com/stock/" # prefix of the per-stock pages
    output_file = "D://BaiduStockInfo.txt" # file the stock data is appended to
    slist = [] # filled in place by getStockList
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file)

main() # entry point; runs as soon as the file is executed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'运行

程序报错：
在这里插入图片描述
分析错误代码，提示属性错误（AttributeError）：None类型没有find_all属性。解决方案：用Alt+G定位到第40行，做一个条件判断，排除None类型的情况。
更正后代码如下：

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import traceback # 追踪，便于调试

def getHTMLText(url): # Fetch the page at url and return its text ("" on any failure)
    try:
        r = requests.get(url)
        r.raise_for_status() # turn HTTP error statuses into exceptions so they reach the except branch
        r.encoding = r.apparent_encoding # decode with the encoding requests guesses from the page content
        return r.text
    except: # bare except: every failure is reported to the caller as ""
        return ""
    
def getStockList(lst,stockURL): # Append the stock codes found on the stockURL page to lst (in place)
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html,"html.parser")
    a = soup.find_all('a') # the codes sit in the href attribute of <a> tags, so collect every <a> tag first
    for i in a:
        try:
            href = i.attrs['href'] # value of the href attribute
            lst.append(re.findall(r"s[hz]\d{6}",href)[0]) # first "sh"/"sz" + six digits match is the stock code
        except: # <a> tags without an href, or whose href holds no code, raise here and are skipped
            continue

def getStockInfo(lst,stockURL,fpath): # Fetch every stock's page and append its data to the file fpath
    for stock in lst:
        url = stockURL + stock + ".html" # URL of this stock's detail page
        html = getHTMLText(url) # page text, or "" if the request failed
        try:
            if html == "": # page could not be fetched or is empty: skip this stock
                continue
            infoDict = {} # the stock's data is kept in a dict, convenient for later analysis
            soup = BeautifulSoup(html,"html.parser")
            stockInfo = soup.find('div',attrs = {'class':'stock-bets'}) # tag holding the stock data; some pages lack it, so the result is checked below

            if stockInfo: # only continue when the data div was found (not None)
                name = stockInfo.find_all(attrs={'class':'bets-name'})[0] # tag holding the stock name; find_all() returns a bs4.element.ResultSet, hence the [0]
                # name.string cannot be used: the tag has two string children, so .string would be None (see the bs4 help text)
                infoDict.update({'股票名称': name.text.split()[0]}) # .text joins all strings inside the tag; split on whitespace and keep the first piece as the name

                # Collect the remaining key/value pairs
                keyList = stockInfo.find_all('dt') # keys are in <dt> tags
                valueList = stockInfo.find_all('dd') # values are in <dd> tags
                for i in range(len(keyList)): # pair each <dt> with the <dd> at the same index
                    key = keyList[i].text # keyList[i].string would also work here
                    val = valueList[i].text # matching value
                    infoDict[key] = val # add the pair to the dict
            
                with open(fpath, 'a', encoding='utf-8') as f:# append mode; utf-8 because the data contains Chinese text
                    f.write( str(infoDict) + '\n' ) # one dict per line
        except:
            traceback.print_exc() # print the traceback to help debugging
            continue
                
def main(): # Collect the stock codes, then fetch and save each stock's data
    stock_list_url = "http://quote.eastmoney.com/stocklist.html" # page the stock codes are read from
    stock_info_url = "https://gupiao.baidu.com/stock/" # prefix of the per-stock pages
    output_file = "D://BaiduStockInfo.txt" # file the stock data is appended to
    slist = [] # filled in place by getStockList
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file)

main() # entry point; runs as soon as the file is executed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
'运行

程序正常执行，但是不显示进度，很慢，打开结果文件部分数据如下：
在这里插入图片描述

五、程序的优化

上面的代码初步实现了功能，但是很多代码还有待完善。依旧从性能和用户体验两个方面优化代码。以下是嵩天老师的优化。
从性能上：

r.encoding = r.apparent_encoding，每次获取一次页面，都需要分析整个网页的内容，重复分析，造成了浪费。因为只有两个网址，可以直接在控制台分析其编码方式，然后代码中直接赋值。
从用户体验上：
实现能够直观的看到下载的进度。

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import traceback # 追踪，便于调试

def getHTMLText(url,code="utf-8"): # Fetch the page at url and return its text decoded as code ("" on any failure)
    try:
        r = requests.get(url)
        r.raise_for_status() # turn HTTP error statuses into exceptions so they reach the except branch
        r.encoding = code # caller-supplied encoding replaces the per-page r.apparent_encoding detection
        return r.text
    except: # bare except: every failure is reported to the caller as ""
        return ""
    
def getStockList(lst,stockURL): # Append the stock codes found on the stockURL page to lst (in place)
    html = getHTMLText(stockURL,"GB2312") # the list page is decoded as GB2312
    soup = BeautifulSoup(html,"html.parser")
    a = soup.find_all('a') # the codes sit in the href attribute of <a> tags, so collect every <a> tag first
    for i in a:
        try:
            href = i.attrs['href'] # value of the href attribute
            lst.append(re.findall(r"s[hz]\d{6}",href)[0]) # first "sh"/"sz" + six digits match is the stock code
        except: # <a> tags without an href, or whose href holds no code, raise here and are skipped
            continue

def getStockInfo(lst,stockURL,fpath): # Fetch every stock's page, append its data to fpath and print the progress
    count = 0 # stocks handled so far; NOTE: only written or failed stocks are counted, skipped ones are not
    for stock in lst:
        url = stockURL + stock + ".html" # URL of this stock's detail page
        html = getHTMLText(url) # page text (decoded with the default utf-8), or "" if the request failed
        try:
            if html == "": # page could not be fetched or is empty: skip this stock (count is not incremented)
                continue
            infoDict = {} # the stock's data is kept in a dict, convenient for later analysis
            soup = BeautifulSoup(html,"html.parser")
            stockInfo = soup.find('div',attrs = {'class':'stock-bets'}) # tag holding the stock data; some pages lack it, so the result is checked below

            if stockInfo: # only continue when the data div was found (not None)
                name = stockInfo.find_all(attrs={'class':'bets-name'})[0] # tag holding the stock name; find_all() returns a bs4.element.ResultSet, hence the [0]
                # name.string cannot be used: the tag has two string children, so .string would be None (see the bs4 help text)
                infoDict.update({'股票名称': name.text.split()[0]}) # .text joins all strings inside the tag; split on whitespace and keep the first piece as the name

                # Collect the remaining key/value pairs
                keyList = stockInfo.find_all('dt') # keys are in <dt> tags
                valueList = stockInfo.find_all('dd') # values are in <dd> tags
                for i in range(len(keyList)): # pair each <dt> with the <dd> at the same index
                    key = keyList[i].text # keyList[i].string would also work here
                    val = valueList[i].text # matching value
                    infoDict[key] = val # add the pair to the dict
            
                with open(fpath, 'a', encoding='utf-8') as f:# append mode; utf-8 because the data contains Chinese text
                    f.write( str(infoDict) + '\n' ) # one dict per line
                    count += 1
                    print("\r当前股票信息下载进度：{:.2f}%".format(count*100/len(lst)),end="") # \r moves the cursor back to the line start so the new text overwrites the old; end="" suppresses print's newline
        except:
            count += 1 # a failed stock still counts as handled
            print("\r当前股票信息下载进度：{:.2f}%".format(count*100/len(lst)),end="") # same in-place progress line as on the success path
            traceback.print_exc() # print the traceback to help debugging
            continue
                
def main(): # Collect the stock codes, then fetch and save each stock's data
    stock_list_url = "http://quote.eastmoney.com/stocklist.html" # page the stock codes are read from
    stock_info_url = "https://gupiao.baidu.com/stock/" # prefix of the per-stock pages
    output_file = "D://BaiduStockInfo.txt" # file the stock data is appended to
    slist = [] # filled in place by getStockList
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file)

main() # entry point; runs as soon as the file is executed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
'运行

需要在cmd命令行下执行才能看到\r效果，在IDLE下\r的功能被禁止了。
执行过程如下：
在这里插入图片描述

六、代码的进一步优化

性能上：

getStockList()函数中其实可以不用bs4库就匹配到股票代码，直接通过正则reg = re.compile(r'(sh\d{6}|sz\d{6})')实现匹配。
在遍历keyList时，可以先调用一次len()，把结果赋值给一个变量（如：length）再用于循环。需要注意的是，range(len(keyList))本身只会在循环开始前求值一次，并不会每次循环都调用len()，所以这一改动主要是让代码更清晰，对性能几乎没有影响。
在打印进度条时len()函数的应用，每次执行print，会调用一次len()，处理同2。
用户体验上：
我还想感受到我下载股票代码要花多久，多久后才是开始下载个股信息。
改进后代码如下：

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import traceback # 追踪，便于调试

def getHTMLText(url,code="utf-8",timeout=30): # Fetch a page and return its text
    """Return the text of the page at url, or "" if the request fails.

    url: address of the page to download.
    code: encoding used to decode the response body; passing it explicitly
        avoids running r.apparent_encoding detection on every page.
    timeout: seconds to wait for the server before giving up, so that one
        unresponsive host cannot stall the whole crawl.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status() # HTTP error statuses are treated as failures
        r.encoding = code
        return r.text
    except requests.RequestException: # network, timeout and HTTP-status errors only; Ctrl-C and programming errors still surface
        return ""
    
def getStockList(lst,stockURL,fpath): # Collect the stock codes and save them
    """Append every stock code found on the stockURL page to lst and to fpath.

    lst: list that receives the codes; it is extended in place so the caller
        sees the result (rebinding the name would leave the caller's list empty).
    stockURL: page the codes are extracted from (decoded as GB2312).
    fpath: text file the codes are appended to, one per line.

    Prints an in-place percentage while the codes are being written.
    """
    html = getHTMLText(stockURL,"GB2312")
    sl = re.findall(r'(sh\d{6}|sz\d{6})', html)
    length = len(sl)
    if not length: # nothing matched (e.g. the download failed): leave lst alone and create no file
        return
    with open(fpath,'a',encoding='utf-8') as f: # opened once for the whole list instead of once per code
        for count, code in enumerate(sl, 1):
            lst.append(code) # in-place mutation of the caller's list
            f.write(code + '\n')
            print('\r当前下载股票列表速度：{:.2f}%'.format(count*100/length),end='')

def getStockInfo(lst,stockURL,fpath): # Fetch each stock's details and save them
    """Download the page of every stock in lst and append its data to fpath.

    lst: stock codes such as "sh600000".
    stockURL: URL prefix; the page of a stock is stockURL + code + ".html".
    fpath: text file receiving one dict (as text) per stock, utf-8 encoded.

    Stocks whose page cannot be fetched, or whose page has no data block,
    are skipped. Every stock, written or not, advances the printed progress,
    so the percentage reaches 100% at the end of the run.
    """
    count = 0
    lstLength = len(lst)
    for stock in lst:
        url = stockURL + stock + ".html" # URL of this stock's detail page
        html = getHTMLText(url) # "" when the request failed
        try:
            if html:
                soup = BeautifulSoup(html,"html.parser")
                stockInfo = soup.find('div',attrs={'class':'stock-bets'}) # None when the page has no data block
                if stockInfo is not None:
                    name = stockInfo.find_all(attrs={'class':'bets-name'})[0]
                    # name.string would be None here because the tag holds two strings;
                    # .text joins them and the first whitespace-separated piece is the name
                    infoDict = {'股票名称': name.text.split()[0]}

                    # keys are in <dt> tags, values in <dd> tags; zip pairs them and
                    # simply stops at the shorter list instead of raising IndexError
                    keyList = stockInfo.find_all('dt')
                    valueList = stockInfo.find_all('dd')
                    for key, val in zip(keyList, valueList):
                        infoDict[key.text] = val.text

                    with open(fpath, 'a', encoding='utf-8') as f: # append mode; utf-8 because the data contains Chinese text
                        f.write(str(infoDict) + '\n') # one dict per line
        except Exception: # keep crawling after a bad page, but let Ctrl-C through
            traceback.print_exc() # print the traceback to help debugging
        count += 1 # counted for every stock, including skipped and failed ones
        print("\r当前股票信息下载进度：{:.2f}%".format(count*100/lstLength),end="") # \r rewrites the same console line; end="" suppresses the newline
                
def main():
    """Download the stock code list, then the details of every stock, into two text files."""
    stock_list_url = "http://quote.eastmoney.com/stocklist.html" # page the stock codes are read from
    stock_info_url = "https://gupiao.baidu.com/stock/" # prefix of the per-stock pages
    output_file1 = "D://BaiduStockList.txt" # receives the stock codes
    output_file2 = "D://BaiduStockInfo.txt" # receives the per-stock data
    slist = [] # filled in place by getStockList
    getStockList(slist,stock_list_url,output_file1)
    print("\n") # line break so the two progress lines do not overwrite each other
    getStockInfo(slist,stock_info_url,output_file2)

if __name__ == "__main__": # run only as a script, so importing this file does not start a crawl
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
'运行

运行过程图：
在这里插入图片描述
输出结果：

七、心得体会

通过写这篇blog，很多语句都是一句一句在控制台下测试，理解每一步机器在做什么，虽然很耗时间，不过学到的东西比老师视频讲的要多，对于知识的理解也更深刻了。总结一下之前老师上课没有怎么提到的内容吧。

Tag.text：以字符串的形式返回所有子孙标签中的字符串。
Tag.string：有且只有一个字符串时，可以跨越多层，通过递归方式返回一个字符串子项。如果没有字符串或者超过1个字符串子项都是返回None
充分利用控制台来测试代码，知道代码每一步在做什么，然后再写到脚本里面。
时刻关注性能和用户体验问题，写完代码记得优化。
基础记不清了，及时回去翻看课本和笔记，还有多利用help()查看官方文档。
在最后自己优化时，遇到了一个bug，是python中函数传参数的问题。

一般情况下，关于python中函数的传参，很多人都在争论到底传的是值还是引用，其实最准确的说法是：传的是对象（的引用）。这是什么意思呢？例如getStockList(lst,stockURL)函数的第一个参数是一个list。如果在函数内部做的是原地操作，比如list.append()、list.extend()、list.insert()、list.reverse()、list.sort()等，这些操作都是在原有的list对象上进行的，修改的就是调用者传入的那个list，它的内存地址不变，调用者可以看到修改后的结果。但如果在函数内部写的是“lst = 另外一个新的list”，那么只是让局部变量lst指向了新的内存地址，原来那个list对象的内容并没有变化，于是之后调用getStockInfo(lst,stockURL,fpath)时，传入的仍然是空列表[]。
重点看lst在getStockList(lst,stockURL)和getStockInfo(lst,stockURL,fpath)中参数传递的问题。
有兴趣的朋友可以看看试试下面这个代码，对比我最后优化的那个版本，有意思的是你会发现它很快就执行结束了，最后什么都没有输出。原因就在上面那段话。（作为初学者的我踩的又一个坑…)

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import traceback # 追踪，便于调试

def getHTMLText(url,code="utf-8"): # Fetch the page at url and return its text decoded as code ("" on any failure)
    try:
        r = requests.get(url)
        r.raise_for_status() # turn HTTP error statuses into exceptions so they reach the except branch
        r.encoding = code # caller-supplied encoding
        return r.text
    except: # bare except: every failure is reported to the caller as ""
        return ""
    
def getStockList(lst,stockURL): # Broken on purpose: the codes never reach the caller's list
    html = getHTMLText(stockURL,"GB2312") # the list page is decoded as GB2312
    pat = r'(sh\d{6}|sz\d{6})'
    reg = re.compile(pat)
    lst = reg.findall(html) # rebinds the local name lst to a new list instead of filling the list that was passed in
    # The assignment only moves the local label lst onto a new object, so the caller's list stays empty when getStockInfo() later receives it


def getStockInfo(lst,stockURL,fpath): # Fetch every stock's page, append its data to fpath and print the progress
    count = 0 # stocks handled so far; NOTE: only written or failed stocks are counted, skipped ones are not
    lstLength = len(lst) # computed once, reused by every progress print
    for stock in lst:
        url = stockURL + stock + ".html" # URL of this stock's detail page
        html = getHTMLText(url) # page text (decoded with the default utf-8), or "" if the request failed
        try:
            if html == "": # page could not be fetched or is empty: skip this stock (count is not incremented)
                continue
            infoDict = {} # the stock's data is kept in a dict, convenient for later analysis
            soup = BeautifulSoup(html,"html.parser")
            stockInfo = soup.find('div',attrs = {'class':'stock-bets'}) # tag holding the stock data; some pages lack it, so the result is checked below

            if stockInfo: # only continue when the data div was found (not None)
                name = stockInfo.find_all(attrs={'class':'bets-name'})[0] # tag holding the stock name; find_all() returns a bs4.element.ResultSet, hence the [0]
                # name.string cannot be used: the tag has two string children, so .string would be None (see the bs4 help text)
                infoDict.update({'股票名称': name.text.split()[0]}) # .text joins all strings inside the tag; split on whitespace and keep the first piece as the name

                # Collect the remaining key/value pairs
                keyList = stockInfo.find_all('dt') # keys are in <dt> tags
                valueList = stockInfo.find_all('dd') # values are in <dd> tags
                keyListLength = len(keyList)
                for i in range(keyListLength): # pair each <dt> with the <dd> at the same index
                    key = keyList[i].text # keyList[i].string would also work here
                    val = valueList[i].text # matching value
                    infoDict[key] = val # add the pair to the dict
            
                with open(fpath, 'a', encoding='utf-8') as f:# append mode; utf-8 because the data contains Chinese text
                    f.write( str(infoDict) + '\n' ) # one dict per line
                    count += 1
                    print("\r当前股票信息下载进度：{:.2f}%".format(count*100/lstLength),end="") # \r moves the cursor back to the line start so the new text overwrites the old; end="" suppresses print's newline
        except:
            count += 1 # a failed stock still counts as handled
            print("\r当前股票信息下载进度：{:.2f}%".format(count*100/lstLength),end="") # same in-place progress line as on the success path
            traceback.print_exc() # print the traceback to help debugging
            continue
                
def main(): # Collect the stock codes, then fetch and save each stock's data
    stock_list_url = "http://quote.eastmoney.com/stocklist.html" # page the stock codes are read from
    stock_info_url = "https://gupiao.baidu.com/stock/" # prefix of the per-stock pages
    output_file = "D://BaiduStockInfo.txt" # file the stock data is appended to
    slist = [] # meant to be filled by getStockList; in this listing it stays empty because getStockList rebinds its parameter
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file) # receives the still-empty list, so its loop body never runs

main() # entry point; runs as soon as the file is executed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
'运行

八、参考资料

北京理工大学嵩天老师的《Python网络爬虫与信息提取》

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/IT小白/article/detail/822043