一、背景简介
传统验证码识别方法依赖图像预处理和字符切割,对抗干扰能力差。为此,近年来更先进的端到端识别方法被广泛采用,结合卷积神经网络(CNN)与循环神经网络(RNN),无需切割即可识别整个验证码序列。本文将介绍一个基于 PyTorch 框架构建的端到端验证码识别模型。
二、验证码特性说明
验证码图像尺寸统一为 160x60,内容为 4 位或 5 位的数字与大写英文字母混合字符串(本文代码默认生成 4 位),字体扭曲、存在背景噪声和干扰线,适合使用 CNN + RNN + CTC 的结构识别。
三、环境与依赖
更多内容访问ttocr.com或联系1436423940
pip install torch torchvision matplotlib numpy opencv-python captcha pillow
四、数据准备与生成(captcha_dataset.py)
我们使用 captcha 库生成数据:
import os
import random
import string

import numpy as np
import torch
from captcha.image import ImageCaptcha
from PIL import Image
from torch.utils.data import Dataset
# Alphabet the captcha text is drawn from: the ten digits followed by A-Z.
ALL_CHARS = string.digits + string.ascii_uppercase
# Lookup tables between a character and its class index (its position in ALL_CHARS).
IDX2CHAR = dict(enumerate(ALL_CHARS))
CHAR2IDX = {char: idx for idx, char in IDX2CHAR.items()}
def generate_code(length: int = 4) -> str:
    """Return a random captcha string of ``length`` characters.

    Characters are drawn independently, with replacement, from ``ALL_CHARS``.
    """
    return ''.join(random.choices(ALL_CHARS, k=length))
class CaptchaDataset(Dataset):
    """In-memory dataset of synthetic grayscale captcha images.

    Every sample is rendered once in the constructor and kept in memory.
    Each item is an ``(image, label)`` pair: ``image`` is a float tensor of
    shape ``[1, height, width]`` with values scaled to ``[0, 1]`` and
    ``label`` is a tensor holding the ``CHAR2IDX`` index of each character.
    """

    def __init__(self, size=10000, length=4, width=160, height=60):
        """Render ``size`` random captchas of ``length`` characters each.

        Args:
            size: Number of samples to generate.
            length: Number of characters per captcha.
            width: Image width in pixels.
            height: Image height in pixels.
        """
        self.data = []
        self.labels = []
        image_gen = ImageCaptcha(width, height)
        for _ in range(size):
            text = generate_code(length)
            # 'L' converts the rendered image to single-channel grayscale.
            image = image_gen.generate_image(text).convert('L')
            image = image.resize((width, height))
            # [H, W] pixel array scaled to [0, 1]; unsqueeze adds the channel axis.
            pixels = np.array(image) / 255.0
            self.data.append(torch.tensor(pixels).unsqueeze(0).float())
            self.labels.append([CHAR2IDX[c] for c in text])

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        return self.data[idx], torch.tensor(self.labels[idx])

    def __len__(self):
        """Return the number of generated samples."""
        return len(self.data)
五、模型设计(captcha_model.py)
import torch.nn as nn
class CaptchaModel(nn.Module):
    """CNN + bidirectional LSTM recogniser producing per-column CTC log-probabilities.

    The CNN downsamples the image by 4 in both directions; each remaining
    image column becomes one time step of the LSTM, and a linear layer maps
    every time step to ``num_classes + 1`` scores (the extra one is the CTC
    blank).
    """

    def __init__(self, num_classes, img_height=60):
        """Build the network.

        Args:
            num_classes: Number of real character classes (blank excluded).
            img_height: Height of the input images in pixels. It fixes the
                LSTM input size, so it must match the images fed to
                ``forward``.
        """
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU()
        )
        # Two 2x2 max-pools each halve the height (floor division); 60 -> 15.
        feature_height = img_height // 2 // 2
        self.rnn = nn.LSTM(128 * feature_height, 128, num_layers=2,
                           bidirectional=True, batch_first=True)
        # 256 = 2 directions x 128 hidden units.
        self.classifier = nn.Linear(256, num_classes + 1)  # +1 for CTC blank

    def forward(self, x):
        """Map images ``[B, 1, H, W]`` to log-probabilities ``[B, W/4, num_classes + 1]``."""
        out = self.cnn(x)  # [B, 128, H/4, W/4]; [B, 128, 15, 40] for 60x160 input
        b, _, _, w = out.size()
        # Put width first so every image column is one sequence step: [B, W, C*H].
        out = out.permute(0, 3, 1, 2).contiguous().view(b, w, -1)
        out, _ = self.rnn(out)  # [B, T, 256]
        out = self.classifier(out)  # [B, T, num_classes + 1]
        return out.log_softmax(2)
六、训练流程(train.py)
from captcha_dataset import CaptchaDataset, CHAR2IDX, IDX2CHAR
from captcha_model import CaptchaModel
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
def train():
    """Train ``CaptchaModel`` with CTC loss and save the resulting weights.

    Runs 10 epochs of Adam (lr=0.001) over a default ``CaptchaDataset`` in
    shuffled batches of 64, prints the loss of the last batch of every epoch,
    and writes the final ``state_dict`` to ``captcha_model.pth``.
    """
    dataset = CaptchaDataset()
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    model = CaptchaModel(num_classes=len(CHAR2IDX))
    # The blank label is the extra class appended after the real characters.
    criterion = nn.CTCLoss(blank=len(CHAR2IDX))
    optimizer = Adam(model.parameters(), lr=0.001)

    for epoch in range(10):
        for images, labels in dataloader:
            preds = model(images)  # [B, T, C]
            preds = preds.permute(1, 0, 2)  # CTC expects [T, B, C]
            batch_size = images.size(0)
            # Every sample uses all T time steps and has the same label length.
            input_lengths = torch.full((batch_size,), preds.size(0), dtype=torch.long)
            target_lengths = torch.full((batch_size,), labels.size(1), dtype=torch.long)
            loss = criterion(preds, labels, input_lengths, target_lengths)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    torch.save(model.state_dict(), 'captcha_model.pth')
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train()
七、预测与解码
def decode(preds, idx2char=None, blank=None):
    """Greedy CTC decoding of a single prediction into text.

    Takes the most likely class at every time step, collapses consecutive
    repeats, and drops blanks.

    Args:
        preds: Score tensor of shape ``[1, T, C]`` (a batch of one sample).
        idx2char: Mapping from class index to character. Defaults to the
            module-level ``IDX2CHAR``.
        blank: Index of the CTC blank class. Defaults to ``len(idx2char)``,
            i.e. the class right after the real characters.

    Returns:
        The decoded string.
    """
    if idx2char is None:
        idx2char = IDX2CHAR
    if blank is None:
        blank = len(idx2char)

    best_path = preds.argmax(2).squeeze(0).tolist()
    result = []
    prev = -1
    for p in best_path:
        if p != prev and p != blank:  # Skip repeats and the CTC blank
            result.append(idx2char[p])
        # Update on every step, blanks included, so that "A, blank, A"
        # decodes to "AA" rather than "A".
        prev = p
    return ''.join(result)