一、引言
验证码(CAPTCHA)作为互联网身份验证的第一道防线,广泛用于防止恶意爬虫和自动操作。尽管深度学习方法已成为主流,传统机器学习方法如支持向量机(SVM)在处理结构化、规则清晰的验证码中依然具有较高效率与鲁棒性。本文结合图像处理与SVM,构建了一个轻量化验证码识别系统,具有良好的实用性与解释性。
二、验证码类型说明
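本系统识别的是4位数字验证码,图像尺寸为 100x40,并假设背景为白色、字符为黑色,存在少量噪点和干扰线,无旋转或扭曲变形。注意:下文用于生成数据集的 captcha 库(ImageCaptcha)默认会渲染彩色字符,并带有随机旋转、扭曲以及干扰点和干扰线,字符之间可能粘连;因此并非每张图片都能被切分为恰好 4 个字符,训练脚本会跳过这些样本。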
三、开发环境
Python 3.9+
更多内容访问ttocr.com或联系1436423940
OpenCV
scikit-learn
numpy
pillow、captcha(用于生成验证码)
joblib(用于保存与加载模型,通常随 scikit-learn 一并安装)
pip install opencv-python scikit-learn numpy pillow captcha joblib
四、数据集生成(generate_captcha.py)
from captcha.image import ImageCaptcha
import os, random
from PIL import Image
import string
# Alphabet the captcha text is sampled from: the ASCII digits "0123456789".
CHARS = string.digits
# Shared image generator configured for 100x40 (width x height) captchas.
captcha_gen = ImageCaptcha(width=100, height=40)
def generate_dataset(n=5000, save_dir="dataset"):
    """Render ``n`` random 4-character captchas and save them as PNG files.

    Each file is named ``<text>_<index>.png`` so the ground-truth text can be
    recovered from the part of the file name before the first underscore;
    the running index keeps names unique when the same text is drawn twice.

    Args:
        n: Number of images to generate.
        save_dir: Output directory; created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    for index in range(n):
        # Sample four characters (with replacement) from the alphabet.
        digits = random.choices(CHARS, k=4)
        label = ''.join(digits)
        picture = captcha_gen.generate_image(label)
        filename = f"{label}_{index}.png"
        picture.save(os.path.join(save_dir, filename))
# Generate the dataset only when this file is executed as a script, so that
# importing generate_dataset from another module does not write thousands of
# image files as a side effect.
if __name__ == "__main__":
    generate_dataset()
五、图像预处理(preprocess.py)
import cv2
import numpy as np
def preprocess_image(path, threshold=150):
    """Load an image as grayscale and binarize it with inverted polarity.

    Pixels brighter than ``threshold`` become 0 and all other pixels become
    255 (``cv2.THRESH_BINARY_INV``), so dark glyphs on a light background end
    up as white foreground on black.

    Args:
        path: Path of the image file to read.
        threshold: Gray level used as the binarization cut-off.

    Returns:
        The binarized single-channel image.

    Raises:
        OSError: If the file is missing or cannot be decoded.
    """
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than deep inside cv2.threshold.
    if img is None:
        raise OSError(f"cannot read image: {path!r}")
    _, binary = cv2.threshold(img, threshold, 255, cv2.THRESH_BINARY_INV)
    return binary
def segment_characters(image, min_width=5, min_height=10, size=(20, 20)):
    """Cut a binarized captcha into per-character tiles, ordered left to right.

    The outer contours of ``image`` are located, each one's bounding box is
    cropped, and boxes that are too small (treated as noise) are discarded.
    The number of tiles returned is whatever the contour search yields, so
    callers that expect a fixed character count must check the length.

    Args:
        image: Single-channel binary image (foreground non-zero), e.g. the
            output of ``preprocess_image``.
        min_width: Boxes are kept only if their width is strictly greater.
        min_height: Boxes are kept only if their height is strictly greater.
        size: ``(width, height)`` every kept crop is resized to.

    Returns:
        A list of tiles resized to ``size``, sorted by the x coordinate of
        their bounding box.
    """
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; the contour list is the
    # second-to-last element in both cases.
    contours = cv2.findContours(
        image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]
    # Compute each bounding box once, then order by x only (stable sort), so
    # ties keep the order in which the contours were found.
    boxes = sorted(
        (cv2.boundingRect(cnt) for cnt in contours),
        key=lambda box: box[0],
    )
    return [
        cv2.resize(image[y:y + h, x:x + w], size)
        for x, y, w, h in boxes
        if w > min_width and h > min_height
    ]
六、特征提取与模型训练(train_svm.py)
import os
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from preprocess import preprocess_image, segment_characters
def _load_samples(dataset_dir="dataset"):
    """Build per-character feature vectors and labels from ``dataset_dir``.

    Every ``*.png`` file is expected to be named ``<text>_<index>.png``; the
    text before the first underscore is the ground truth. An image is used
    only when it segments into exactly 4 tiles and its label has 4
    characters, so that tile ``i`` can be paired with label character ``i``.

    Returns:
        A tuple ``(X, y, skipped)``: ``X`` holds one flattened tile per row
        scaled to the 0-1 range, ``y`` the matching label characters, and
        ``skipped`` the number of images that were discarded.
    """
    features, labels = [], []
    skipped = 0
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(dataset_dir)):
        if not file.endswith(".png"):
            continue
        label = file.split("_")[0]
        img = preprocess_image(os.path.join(dataset_dir, file))
        chars = segment_characters(img)
        # Without a one-to-one tile/label match the pairing would be wrong
        # (or label[i] would raise IndexError), so drop the image.
        if len(chars) != 4 or len(label) != 4:
            skipped += 1
            continue
        for ch, digit in zip(chars, label):
            features.append(ch.flatten())
            labels.append(digit)
    return np.array(features) / 255.0, np.array(labels), skipped


def main():
    """Train a linear SVM on the dataset, report accuracy, and save the model."""
    import joblib  # only needed to persist the trained model

    X, y, skipped = _load_samples()
    if skipped:
        print("已跳过无法与4位标签一一对应的样本数:", skipped)
    if len(X) == 0:
        # train_test_split would otherwise fail with an unrelated-looking error.
        raise SystemExit("未找到可用的训练样本,请先运行 generate_captcha.py 生成数据集")
    # Fixed seed so the reported accuracy is comparable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = svm.SVC(kernel='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("测试集准确率:", accuracy_score(y_test, y_pred))
    joblib.dump(model, 'svm_model.pkl')


if __name__ == "__main__":
    main()
七、验证码识别测试(predict.py)
import joblib
from preprocess import preprocess_image, segment_characters
def recognize(path, model=None):
    """Predict the text of the captcha image stored at ``path``.

    Args:
        path: Path of the captcha image.
        model: Optional already-loaded classifier exposing ``predict``. When
            omitted, ``svm_model.pkl`` is loaded from the working directory;
            pass a model explicitly to avoid reloading it on every call.

    Returns:
        The predicted characters concatenated left to right, or an empty
        string when no character could be segmented.
    """
    if model is None:
        # NOTE: joblib files are pickle-based; only load model files you trust.
        model = joblib.load("svm_model.pkl")
    chars = segment_characters(preprocess_image(path))
    if not chars:
        return ''
    # Same 0-1 scaling as used for training; one row per segmented character
    # so the classifier is called once for the whole captcha.
    vectors = [ch.flatten() / 255.0 for ch in chars]
    return ''.join(model.predict(vectors))
if __name__ == "__main__":
    import sys

    # Dataset file names contain random text, so the default sample below
    # exists only by chance; pass a real image path as the first command-line
    # argument to recognize a specific file.
    sample_path = sys.argv[1] if len(sys.argv) > 1 else "dataset/1234_10.png"
    print(recognize(sample_path))