赞
踩
本文参考链接:https://www.cnblogs.com/xuchunlin/p/9415620.html
首先收集原始的验证码图片:
这种验证码的特点是统一都是红色字体的四个字符组成,其中包括大写字母 A-Z,小写字母 a-z 和数字 0-9。我这儿需要的验证码识别对大小写不加以区分,所以大小写识别错误不算作识别错误,比如第一个验证码:DwP6,如果识别成:Dwp6,我们也把他算作识别正确。通过观察,我们可以提前想到,这种验证码识别的难度在于部分字符的难以区分,比如 S 和 5、I 和 1、 i 和 j、2 和 Z、0 和 o 、y 和 v 、9 和 g 等等之类。字符间的间距也会有一定的干扰,降低了单个字符的区分度,比如会把紧邻的两个字符 vv识别成一个字符 w。验证码中的部分字符连人本身都难以区分,对机器而言难度也就可想而知了。
首先进行灰度处理,将原来的彩色图片变成灰度图片,同时去除部分图片中的暗色条纹。这里选择合适的阈值参数比较重要,阈值太大过滤不掉太多杂质信息,阈值太小则剩下的有效信息又太少。下面是阈值 threshold 分别为 210, 180, 150 对应的灰度图片,我程序里选择的 threshold=180。
- import os
-
- import pytesseract
- from PIL import Image
-
- before = r"D:\MyProject\Python\ReturnVisit\auth_code\before\\" # 原始验证码图片目录
- after = r"D:\MyProject\Python\ReturnVisit\auth_code\after\\" # 处理后验证码图片目录
-
-
- def grey_processing(): # 灰度处理
- threshold = 180
- file_list = os.listdir(before)
- for file in file_list:
- path = before + file
- img = Image.open(path)
- w, h = img.size
- for x in range(w):
- for y in range(h):
- r, g, b = img.getpixel((x, y))
- if 190 <= r <= 255 and 170 <= g <= 255 and 0 <= b <= 140:
- img.putpixel((x, y), (0, 0, 0))
- if 0 <= r <= 90 and 210 <= g <= 255 and 0 <= b <= 90:
- img.putpixel((x, y), (0, 0, 0))
- img = img.convert('L').point([0] * threshold + [1] * (256 - threshold), '1')
- # 边缘噪点处理
- for x in range(0, img.size[0]):
- for y in range(0, img.size[1]):
- if x in [0, img.size[0] - 1] or y in [0, img.size[1] - 1]:
- img.putpixel((x, y), 255) # 将边缘全变为非黑的点(简单粗暴)
- path = path.replace('PNG', 'jpeg')
- path = path.replace('before', 'after') # 更换新图片保存路径
- img.save(path)
灰度处理后的图片:
然后进行二值化处理和内部噪点处理:
- t2val = {} # 使用字典来保存二值化像素矩阵
-
- def two_value(image, G): # 二值化处理, G是图像二值化阀值
- for y in range(0, image.size[1]):
- for x in range(0, image.size[0]):
- g = image.getpixel((x, y))
- if g < G: # 灰度处理以后,点的区分度已经比较明显了。
- t2val[(x, y)] = 1
- else:
- t2val[(x, y)] = 0
-
-
- # 自定义孤立点为噪点
- def clear_noise(image): # 清噪
- # 边缘噪点处理(本来想放到这块,但是这儿对像素的修改不能即使生效,对内部噪点处理会有影响,这不是我想要的结果!)
- # for x in range(0, image.size[0]):
- # for y in range(0, image.size[1]):
- # if x in [0, image.size[0] - 1] or y in [0, image.size[1] - 1]:
- # image.putpixel((x, y), 255) # 将边缘全变为非黑的点(简单粗暴)
- # 内部噪点处理
- for x in range(1, image.size[0] - 1):
- for y in range(1, image.size[1] - 1):
- nearDots = 0
- L = 1 # 黑点用1表示
- if L == t2val[(x - 1, y - 1)]:
- nearDots += 1
- if L == t2val[(x - 1, y)]:
- nearDots += 1
- if L == t2val[(x - 1, y + 1)]:
- nearDots += 1
- if L == t2val[(x, y - 1)]:
- nearDots += 1
- if L == t2val[(x, y + 1)]:
- nearDots += 1
- if L == t2val[(x + 1, y - 1)]:
- nearDots += 1
- if L == t2val[(x + 1, y)]:
- nearDots += 1
- if L == t2val[(x + 1, y + 1)]:
- nearDots += 1
-
- if nearDots == 0 and t2val[(x, y)] == 1: # 如果当前是黑点且周围领域没有其他黑点
- image.putpixel((x, y), 255) # 将当前点变为非黑的点
- def denoise(): # 降噪
- file_list = os.listdir(after)
- for file in file_list:
- path = after + file
- image = Image.open(path)
- two_value(image, 100)
- clear_noise(image)
- path = path.replace('before', 'after')
- image.save(path)
降噪处理后的图片:
然后就是关于对降噪后的验证码图片进行识别了,其中会用到 pytesseract,这个包的安装和使用请自行参考链接https://blog.csdn.net/EB_NUM/article/details/77060009。需要下载 tesseract-ocr-setup-3.02.02.exe 进行安装,并对 pytesseract.py文件进行相应修改。
- def recognize(): # 识别
- file_list = os.listdir(after)
- for file in file_list:
- img_path = after + file
- im = Image.open(img_path)
- text = pytesseract.image_to_string(im, lang='eng')
- exclude_char_list = ' ‘’.:\\|\'\"?![],()~@#$%^&*_+-={};<>/¥©“連'
- # 去除识别结果中的特殊字符
- text = ''.join([x for x in text if x not in exclude_char_list])[:4]
- # 对易混淆字符添加识别结果可选项,提高识别率
- # S 和 5、l 和 1、 i 和 j、2 和 Z、0 和 o 、y 和 v 、9 和 g、h 和 n
- options = [text]
- if len(text) == 4:
- if 'i' in text:
- options.append(text.replace('i', 'j'))
- if 'j' in text:
- options.append(text.replace('j', 'i'))
- if 'l' in text:
- options.append(text.replace('l', '1'))
- if '1' in text:
- options.append(text.replace('1', 'l'))
- if 's' in text:
- options.append(text.replace('s', '5'))
- if '5' in text:
- options.append(text.replace('5', 's'))
- if '2' in text:
- options.append(text.replace('2', 'z'))
- if 'z' in text:
- options.append(text.replace('z', '2'))
- if '0' in text:
- options.append(text.replace('0', 'o'))
- if 'o' in text:
- options.append(text.replace('o', '0'))
- if 'y' in text:
- options.append(text.replace('y', 'v'))
- if 'v' in text:
- options.append(text.replace('v', 'y'))
- if '9' in text:
- options.append(text.replace('9', 'g'))
- if 'g' in text:
- options.append(text.replace('g', '9'))
- if 'h' in text:
- options.append(text.replace('h', 'n'))
- if 'n' in text:
- options.append(text.replace('n', 'h'))
- if '8' in text:
- options.append(text.replace('8', 'B'))
- if 'B' in text:
- options.append(text.replace('B', '8'))
- elif len(text) == 3 and "W" in text.upper():
- options.append(text.upper().replace('W', 'vv'))
- options.append(text.upper().replace('W', 'yy'))
- print(file, options)
识别结果如下:
- "D:\Program Files\Python36\python3.exe" D:/MyProject/Python/ReturnVisit/verification_code_recognize.py
- IMG_0326.jpeg ['DwP6']
- IMG_0327.jpeg ['QXH']
- IMG_0328.jpeg ['hzew', 'h2ew', 'nzew']
- IMG_0329.jpeg ['']
- IMG_0330.jpeg ['iQlg', 'jQlg', 'iQ1g', 'iQl9']
- IMG_0331.jpeg ['apor', 'ap0r']
- IMG_0332.jpeg ['apor', 'ap0r']
- IMG_0333.jpeg ['wsn', 'vvSN', 'yySN']
- IMG_0334.jpeg ['uSZ5', 'uSZs']
- IMG_0335.jpeg ['nz']
- IMG_0336.jpeg ['aNpW']
- IMG_0337.jpeg ['RWU', 'RvvU', 'RyyU']
- IMG_0338.jpeg ['LNX1', 'LNXl']
- IMG_0339.jpeg ['Mzln', 'Mz1n', 'M2ln', 'Mzlh']
- IMG_0340.jpeg ['Xo1v', 'Xolv', 'X01v', 'Xo1y']
- IMG_0341.jpeg ['WBJ4']
- IMG_0342.jpeg ['Qc1S', 'QclS']
- IMG_0343.jpeg ['JTPB']
- IMG_0344.jpeg ['rW7N']
- IMG_0345.jpeg ['ZSLE']
- IMG_0347.jpeg ['b3D']
- IMG_0348.jpeg ['OY7']
- IMG_0349.jpeg ['HQK7']
- IMG_0350.jpeg ['X304', 'X3o4']
- IMG_0351.jpeg ['2PQW', 'zPQW']
- IMG_0352.jpeg ['psks', 'p5k5']
- IMG_0353.jpeg ['NOE']
- IMG_0354.jpeg ['WYOY']
- IMG_0355.jpeg ['M3oA', 'M30A']
- IMG_0356.jpeg ['AMV']
- IMG_0357.jpeg ['VFSR']
- IMG_0358.jpeg ['ZZIH']
- IMG_0359.jpeg ['4RoS', '4R0S']
- IMG_0360.jpeg ['mu']
- IMG_0361.jpeg ['mm']
- IMG_0362.jpeg ['FBV2', 'FBVz']
- IMG_0363.jpeg ['2eEc', 'zeEc']
- IMG_0364.jpeg ['Wm']
- IMG_0365.jpeg ['EoFR', 'E0FR']
- IMG_0366.jpeg ['CQU9', 'CQUg']
-
- Process finished with exit code 0
其中列表第一个元素为原始识别结果,通过统计,识别正确率大概只有25%左右。如果把S 和 5、I 和 1、 i 和 j、2 和 Z、0 和 o 、y 和 v 、9 和 g这些容易混淆字符稍作处理给出多个可选项,识别正确率应该会有一定的提升。
如果你想自己训练模型来进行验证码的识别,请继续往下看:
自己训练模型的话,最重要的是训练集的收集了。为表诚意,小弟先奉上1056张带标记的验证码图片百度云链接(为了便于分享,已经将图片文件夹压缩成了rar文件,其中图片文件名为验证码生成的真实字符串,完全手工标记,因此也有部分出错的地方,不过出错的部分很少,不用过于担心!):链接:https://pan.baidu.com/s/14pJdTXD9OarUSrkgeDGE5g 提取码:xfym,我自己使用了其中 1000 张作为训练集,剩下的 56 张作为验证集使用。
目录结构如图:其中 train 目录是训练集所在目录,before 为带标签的原始验证码图片目录;after 为二值化、降噪处理后的验证码图片目录;cutting 则为验证码处理后的图片再次切割后的单个字符图片目录。validation 目录是验证集所在目录,目录结构与 train 目录类似:
验证集有了以后,先进行灰度化处理(同上),然后对每四个字符的验证码图片进行切割标记:
- import numpy as np
- from sklearn.externals import joblib
- from sklearn.neighbors import KNeighborsClassifier
- before = r"D:\MyProject\Python\ReturnVisit\auth_code\train\before\\" # 原始验证码图片目录
- after = r"D:\MyProject\Python\ReturnVisit\auth_code\train\after\\" # 处理后验证码图片目录
- cutting = r"D:\MyProject\Python\ReturnVisit\auth_code\train\cutting\\" # 切割后的图片目录
-
-
- def smart_slice_image(img, outDir, file, count=4, p_w=3):
- """
- :param img: image对象
- :param outDir: 切割图片输出目录
- :param count: 图片中有多少个图片
- :param p_w: 对切割地方多少像素内进行判断
- """
- w, h = img.size
- pixdata = img.load()
- eachWidth = int(w / count) # 每个切割图片的宽度
- beforeX = 0
- for i in range(count):
- # temp_dir = file[i]
- allBCount = []
- nextXOri = (i + 1) * eachWidth
- for x in range(nextXOri - p_w, nextXOri + p_w):
- if x >= w:
- x = w - 1
- if x < 0:
- x = 0
- b_count = 0
- for y in range(h):
- if pixdata[x, y] == 0:
- b_count += 1
- allBCount.append({'x_pos': x, 'count': b_count})
- sort = sorted(allBCount, key=lambda e: e.get('count'))
- nextX = sort[0]['x_pos']
- box = (beforeX, 0, nextX, h)
- isExists = os.path.exists(outDir + file[i].upper() + "\\")
- # 判断结果
- if not isExists:
- os.makedirs(outDir + file[i].upper() + "\\")
- img.crop(box).save(outDir + file[i].upper() + "\\" + file[i] + "_" + file)
- beforeX = nextX
-
-
- def cutting_image():
- file_list = os.listdir(after)
- for file in file_list:
- path = after + file
- img = Image.open(path)
- out_dir = cutting
- smart_slice_image(img, out_dir, file, count=4, p_w=3)
验证码图片切割好以后,开始进行模型训练,我这儿的原始验证码图片规格为 49×25:
- def load_dataset(): # 将切割后的图片与字符标签对应起来保存成数组
- X = []
- Y = []
- for i in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ":
- target_path = cutting + i # 训练集
- for title in os.listdir(target_path):
- # pix = np.asarray(Image.open(os.path.join(target_path, title)).convert('L'))
- pix = np.asarray(Image.open(os.path.join(target_path, title)).resize((13, 25), Image.ANTIALIAS))
- pix = pix.reshape(13 * 25)
- X.append(pix)
- Y.append(i)
- X = np.asarray(X)
- Y = np.asarray(Y)
- return X, Y
-
-
- def train_model():
- X, Y = load_dataset()
- knn = KNeighborsClassifier()
- knn.fit(X, Y)
- joblib.dump(knn, 'recognize.model')
模型训练完成以后,开始进行模型验证:
- def verify_model(model):
- pre_list = []
- y_list = []
- for i in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ":
- part_path = cutting.replace('train', 'validation') + i # 验证集
- if os.path.exists(part_path):
- for title in os.listdir(part_path):
- pix = np.asarray(Image.open(os.path.join(part_path, title)).resize((13, 25), Image.ANTIALIAS))
- pix = pix.reshape(13 * 25)
- pre_list.append(pix) # 待预测列表
- y_list.append(i)
- pre_list = np.asarray(pre_list)
- # y_list = np.asarray(y_list)
- predict_list = list(model.predict(pre_list)) # 预测结果
- acc = 0
- for i in range(len(y_list)):
- if y_list[i] == predict_list[i]:
- acc += 1
- print("原始字符:", list(y_list))
- print("预测字符:", list(predict_list))
- print("单个字符识别正确率:", "%.2f" % (100 * acc / len(y_list)) + "%")
-
-
- if __name__ == '__main__':
- train_model()
- model = joblib.load('recognize.model')
- verify_model(model)
运行输出结果为(一张验证码图片如果有相同字符,按照我当前对分割图片的命名规则会覆盖掉,所以这儿的原始字符个数并不是 56*4=224张),验证码字符的粘连性导致切割后的图片与字符的对应关系出错:
- 原始字符: ['0', '0', '1', '2', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7',
- '9', '9', '9', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
- 'D', 'D', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'F', 'F',
- 'F', 'G', 'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'I', 'I', 'I', 'I', 'I',
- 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'L', 'M',
- 'M', 'M', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', 'O', 'P', 'P', 'P', 'P',
- 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
- 'R', 'S', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'U', 'U', 'V', 'V', 'V', 'V', 'V', 'V',
- 'V', 'V', 'V', 'V', 'V', 'V', 'W', 'W', 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y',
- 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
- 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z',
- 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z',
- 'Z', 'Z', 'Z', 'Z']
- 预测字符: ['1', '0', '1', '2', '4', '5', '5', '5', '1', '5', '6', '6', '5', '7', '7', '7',
- 'D', '9', '9', 'Q', 'A', 'A', '1', 'B', 'B', 'B', 'C', '1', 'C', 'O', 'C', '1', 'C', 'O',
- 'L', 'D', 'D', 'Q', 'D', 'W', 'E', 'C', '5', 'E', 'E', 'E', 'E', 'E', 'E', 'C', 'F', 'F',
- '4', 'G', 'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', '5', 'J', 'I', 'I', 'I',
- 'I', 'D', '1', 'B', '1', '1', 'K', 'K', 'K', 'K', 'K', 'I', 'L', '1', 'I', '1', 'L', 'M',
- 'H', 'M', 'N', 'N', 'M', 'H', 'N', '1', 'K', 'H', 'O', 'O', 'C', '0', 'B', 'P', '1', 'C',
- 'B', 'O', 'D', 'O', 'Q', 'O', 'O', 'Q', 'I', 'Q', '0', 'S', 'L', 'R', '1', 'R', 'R', 'R',
- 'H', 'H', 'S', 'S', 'S', 'S', 'S', 'H', 'U', 'T', 'U', 'I', 'V', '1', '1', 'V', 'V', '5',
- 'Y', 'V', 'V', 'V', 'V', 'V', 'W', '1', 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y',
- 'H', 'Y', '4', '1', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'W',
- 'Y', 'Y', '1', '1', 'Y', 'Y', 'Y', '1', 'I', 'Z', 'Z', '7', '7', 'Z', '2', 'Z', '7', 'Z',
- '7', 'Z', '7', 'Z', '2', 'Z', '7', '7', '7', 'Z', 'Z', '7', '1', '7', 'Z', '7', 'Z', '7',
- '7', 'Z', 'Z', 'Z']
- 单个字符识别正确率: 61.01%
单个字符识别率在60%左右的话,整体的识别率还是偏低的,肯定还有可以进行改进的地方,比如降噪这块,可以使用 8-连通 区域范围黑点的个数来进行更加合理的降噪。为了便于查看整个识别过程的步骤,我将单张验证码图片的识别单独写了一个函数。
- import os
-
- import pytesseract
- from PIL import Image
- import numpy as np
- from sklearn.externals import joblib
- from sklearn.neighbors import KNeighborsClassifier
-
- before = r"D:\MyProject\Python\ReturnVisit\auth_code\train\before\\" # 原始验证码图片目录
- after = r"D:\MyProject\Python\ReturnVisit\auth_code\train\after\\" # 处理后验证码图片目录
- cutting = r"D:\MyProject\Python\ReturnVisit\auth_code\train\cutting\\" # 切割后的图片目录
-
-
- def single_picture_recognize(model, image_path): # 单张验证码图片识别
- # 灰度处理
- img = Image.open(image_path)
- threshold = 180 # 阈值
- w, h = img.size
- for x in range(w):
- for y in range(h):
- r, g, b = img.getpixel((x, y))
- if 190 <= r <= 255 and 170 <= g <= 255 and 0 <= b <= 140:
- img.putpixel((x, y), (0, 0, 0))
- if 0 <= r <= 90 and 210 <= g <= 255 and 0 <= b <= 90:
- img.putpixel((x, y), (0, 0, 0))
- img = img.convert('L').point([0] * threshold + [1] * (256 - threshold), '1')
-
- # 边缘噪点处理
- for x in range(0, img.size[0]):
- for y in range(0, img.size[1]):
- if x in [0, img.size[0] - 1] or y in [0, img.size[1] - 1]:
- img.putpixel((x, y), 255) # 将边缘全变为非黑的点(简单粗暴)
- img.save(image_path)
-
- # 降噪处理
- image = Image.open(image_path)
- two_value(image, 100)
- clear_noise(image)
- image.save(image_path)
-
- # 图片切割
- """
- :param img: image对象
- :param outDir: 切割图片输出目录
- :param count: 图片中有多少个图片
- :param p_w: 对切割地方多少像素内进行判断
- """
- img = Image.open(image_path)
- cutting_picture = []
- count = 4
- p_w = 3
- w, h = img.size
- pixdata = img.load()
- eachWidth = int(w / count) # 每个切割图片的宽度
- beforeX = 0
- for i in range(count):
- # temp_dir = file[i]
- allBCount = []
- nextXOri = (i + 1) * eachWidth
- for x in range(nextXOri - p_w, nextXOri + p_w):
- if x >= w:
- x = w - 1
- if x < 0:
- x = 0
- b_count = 0
- for y in range(h):
- if pixdata[x, y] == 0:
- b_count += 1
- allBCount.append({'x_pos': x, 'count': b_count})
- sort = sorted(allBCount, key=lambda e: e.get('count'))
- nextX = sort[0]['x_pos']
- box = (beforeX, 0, nextX, h)
- pix = np.asarray(img.crop(box).resize((13, 25), Image.ANTIALIAS))
- pix = pix.reshape(13 * 25)
- cutting_picture.append(pix)
- beforeX = nextX
-
- # 开始识别
- pre_list = np.asarray(cutting_picture)
- predict_list = list(model.predict(pre_list)) # 预测结果
- return predict_list
-
-
- if __name__ == '__main__':
- # train_model()
- model = joblib.load('recognize.model')
- image_path = r"D:\MyProject\Python\ReturnVisit\auth_code\test.png"
- predict_result = single_picture_recognize(model, image_path)
- print(predict_result)
识别输出结果如下:
- "D:\Program Files\Python36\python3.exe" D:/MyProject/Python/ReturnVisit/auth_code/verification_code_recognize.py
- ['Y', 'E', 'U', 'V']
-
- Process finished with exit code 0
原始图片为:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。