!!! The experiment reproduced in this post comes from *Modern Computer Vision with PyTorch (Second Edition)* by V Kishore Ayyadevara. The discussion below reflects only my personal understanding; for the original treatment, I strongly recommend reading the book.
The limitations of a fully connected neural network for object recognition are demonstrated with the following code.
# Import libraries and download data
from torchvision import datasets
from pathlib import Path
import torch
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import SGD, Adam
import seaborn as sns

# Download FashionMNIST dataset
data_folder = Path('./data/fashion_mnist')
data_folder.mkdir(parents=True, exist_ok=True)
fmnist = datasets.FashionMNIST(data_folder, download=True, train=True)

# Load training data
tr_images = fmnist.data
tr_targets = fmnist.targets

# Load validation data
val_fmnist = datasets.FashionMNIST(data_folder, download=True, train=False)
val_images = val_fmnist.data
val_targets = val_fmnist.targets

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define Dataset class
class FMNISTDataset(Dataset):
    def __init__(self, x, y):
        x = x.float() / 255          # scale pixel values to [0, 1]
        x = x.view(-1, 28 * 28)      # flatten each 28x28 image into a 784-dim vector
        self.x, self.y = x, y

    def __getitem__(self, ix):
        x, y = self.x[ix], self.y[ix]
        return x.to(device), y.to(device)

    def __len__(self):
        return len(self.x)

# Define model function
def get_model():
    model = nn.Sequential(
        nn.Linear(28 * 28, 1000),
        nn.ReLU(),
        nn.Linear(1000, 10)
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=1e-3)
    return model, loss_fn, optimizer

# Define training function
def train_batch(x, y, model, opt, loss_fn):
    prediction = model(x)
    batch_loss = loss_fn(prediction, y)
    batch_loss.backward()
    opt.step()        # use the optimizer passed in as `opt`
    opt.zero_grad()
    return batch_loss.item()

# Define accuracy function
def accuracy(x, y, model):
    with torch.no_grad():
        prediction = model(x)
    max_values, argmaxes = prediction.max(-1)
    is_correct = argmaxes == y
    return is_correct.cpu().numpy().tolist()

# Define data loading function
def get_data():
    train = FMNISTDataset(tr_images, tr_targets)
    trn_dl = DataLoader(train, batch_size=32, shuffle=True)
    val = FMNISTDataset(val_images, val_targets)
    val_dl = DataLoader(val, batch_size=len(val_images), shuffle=True)
    return trn_dl, val_dl

# Define validation loss function
def val_loss(x, y, model, loss_fn):
    with torch.no_grad():
        prediction = model(x)
        val_loss = loss_fn(prediction, y)
    return val_loss.item()

# Get data and model
trn_dl, val_dl = get_data()
model, loss_fn, optimizer = get_model()

# Training loop
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []
for epoch in range(5):
    print(epoch)
    train_epoch_losses, train_epoch_accuracies = [], []

    # Train on every batch and record the batch losses
    for ix, batch in enumerate(iter(trn_dl)):
        x, y = batch
        batch_loss = train_batch(x, y, model, optimizer, loss_fn)
        train_epoch_losses.append(batch_loss)
    train_epoch_loss = np.array(train_epoch_losses).mean()

    # Measure accuracy on the training set
    for ix, batch in enumerate(iter(trn_dl)):
        x, y = batch
        is_correct = accuracy(x, y, model)
        train_epoch_accuracies.extend(is_correct)
    train_epoch_accuracy = np.mean(train_epoch_accuracies)

    # Measure accuracy and loss on the validation set (a single full-size batch)
    for ix, batch in enumerate(iter(val_dl)):
        x, y = batch
        val_is_correct = accuracy(x, y, model)
        validation_loss = val_loss(x, y, model, loss_fn)
    val_epoch_accuracy = np.mean(val_is_correct)

    train_losses.append(train_epoch_loss)
    train_accuracies.append(train_epoch_accuracy)
    val_losses.append(validation_loss)
    val_accuracies.append(val_epoch_accuracy)

# Display sample image
ix = 24300
plt.imshow(tr_images[ix], cmap='gray')
plt.title(fmnist.classes[tr_targets[ix]])
plt.show()

# Prepare image for prediction (same preprocessing as in FMNISTDataset)
img = tr_images[ix]/255.
img = img.view(28*28)
img = img.to(device)

# Get prediction
np_output = model(img).cpu().detach().numpy()
print(np.exp(np_output) / np.sum(np.exp(np_output)))   # softmax over the 10 class logits

# Show target label
print(f"Target: {tr_targets[ix]}")

# Test translation invariance
preds = []
for px in range(-5, 6):
    img = tr_images[ix] / 255.
    img = img.view(28, 28)
    img2 = np.roll(img, px, axis=1)      # shift the image horizontally by px pixels
    plt.imshow(img2)
    plt.show()
    img3 = torch.Tensor(img2).view(28 * 28).to(device)
    np_output = model(img3).cpu().detach().numpy()
    preds.append(np.exp(np_output) / np.sum(np.exp(np_output)))

# Visualize translation results
fig, ax = plt.subplots(1,1, figsize=(12,10))
plt.title('Probability of each class for various translations')
sns.heatmap(np.array(preds), annot=True, ax=ax, fmt='.2f',
            xticklabels=fmnist.classes,
            yticklabels=[f'{i} pixels' for i in range(-5, 6)],
            cmap='gray')

import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Assume the trained model and the data are already loaded
# ix is the index of the image you want to inspect
ix = 24300
img = tr_images[ix] / 255.

preds = []  # probability distribution after each translation
classes = fmnist.classes  # ['T-shirt/top', 'Trouser', 'Pullover', ...]
print(f"Original class label: {classes[tr_targets[ix]]}\n")

# Shift the image horizontally by -5 to +5 pixels
for px in range(-5, 6):
    img2 = np.roll(img, px, axis=1)                      # translate the image horizontally
    img3 = torch.Tensor(img2).view(28 * 28).to(device)
    np_output = model(img3).cpu().detach().numpy()       # raw model outputs (logits)
    softmax_output = np.exp(np_output) / np.sum(np.exp(np_output))
    preds.append(softmax_output)
    # Predicted class and its confidence for this translation
    predicted_class = classes[np.argmax(softmax_output)]
    confidence = np.max(softmax_output)
    print(f"Shift {px:>2} pixels -> predicted: {predicted_class:15s} probability: {confidence:.2f}")

# Plot the heatmap of class probabilities versus translation
fig, ax = plt.subplots(1, 1, figsize=(12, 10))
plt.title('Probability of each class for various translations')
sns.heatmap(np.array(preds), annot=True, ax=ax, fmt='.2f',
            xticklabels=classes,
            yticklabels=[f"{i} pixels" for i in range(-5, 6)],
            cmap='gray')
plt.show()
Original class label: Trouser

Shift -5 pixels -> predicted: Shirt           probability: 0.99
Shift -4 pixels -> predicted: Shirt           probability: 0.89
Shift -3 pixels -> predicted: Dress           probability: 0.58
Shift -2 pixels -> predicted: Trouser         probability: 1.00
Shift -1 pixels -> predicted: Trouser         probability: 1.00
Shift  0 pixels -> predicted: Trouser         probability: 1.00
Shift  1 pixels -> predicted: Trouser         probability: 0.71
Shift  2 pixels -> predicted: Trouser         probability: 0.54
Shift  3 pixels -> predicted: Ankle boot      probability: 0.91
Shift  4 pixels -> predicted: Ankle boot      probability: 0.99
Shift  5 pixels -> predicted: Ankle boot      probability: 1.00

Experimental conclusions
- Position sensitivity
  - The MLP flattens the image into a one-dimensional vector (28×28 → 784).
  - Each pixel position is tied to its own fixed weight.
  - What the model learns is "a particular pixel value at a particular position".
  - For example, it may effectively learn "the pixel at position 100 being white means trousers".
- Prediction collapse after translation
  The experiment in the code shifts the same image horizontally by -5 to +5 pixels:
      for px in range(-5, 6):
          img2 = np.roll(img, px, axis=1)   # horizontal shift
  Results:
  - Original image: the model correctly identifies it as "Trouser" (99.9% confidence).
  - After a shift of only a few pixels: the prediction can change completely.
  - Even though it is obviously the same object to the human eye, the model gives a different answer.
- Why does this happen? A minimal numeric sketch of the effect follows this illustration.
  Original image (recognized correctly):
      positions  0-10: black background
      positions 11-20: trouser edge   <- the MLP has learned to expect an edge here
      positions 21-28: trouser body
  After translation (recognized incorrectly):
      positions  0-10: trouser edge   <- but the MLP expects background here!
      positions 11-20: trouser body   <- mismatch!
      positions 21-28: black background
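To make the fixed-position-weight argument concrete, here is a minimal numeric sketch. It is my own illustration, not code from the book: the 1-D "image", the stripe position, and the hand-picked weights are purely hypothetical. It shows a single fully connected unit that responds strongly to a pattern at one position and not at all once the same pattern is shifted.

```python
import numpy as np

# A hypothetical 1-D "image": a bright stripe at positions 2-3
img = np.array([0., 0., 1., 1., 0., 0.])
shifted = np.roll(img, 2)          # the same stripe, shifted to positions 4-5

# A fully connected unit whose weights were "learned" for the original position
w = np.array([0., 0., 5., 5., 0., 0.])

print(img @ w)       # 10.0 -> strong response: the stripe lines up with the weights
print(shifted @ w)   # 0.0  -> same object, but the unit no longer fires at all
```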
- Practical consequences
  - Poor generalization: if an object appears at a slightly different position than in the training data, the model may fail to recognize it.
  - Data hungry: the model must have seen samples of the object at many different positions before it can handle them all.
  - Unlike human vision: people recognize an object no matter where it appears in the field of view.
  - Limited real-world applicability: in real images, object positions are essentially arbitrary.
- What the heatmap shows
  The heatmap generated at the end of the code displays:
  - Y axis: translations from -5 to +5 pixels
  - X axis: predicted probability for each of the 10 classes
  - Result: the probability distribution differs dramatically from row to row
Solution → CNN
This is exactly why CNNs are needed (see the sketch after this list):
- Convolutional layers: a sliding window makes the response far less sensitive to position
- Weight sharing: the same feature detector is reused across the whole image
- Pooling layers: further increase tolerance to translation
- Local connectivity: the network attends to local features rather than absolute positions
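As an illustration only, here is a minimal sketch of what a convolutional counterpart of get_model() could look like. This is my own sketch, not necessarily the architecture the book develops later; the layer sizes are chosen arbitrarily, and `device` is assumed to be defined as in the code above.

```python
import torch
import torch.nn as nn
from torch.optim import Adam

def get_cnn_model():
    # Hypothetical minimal CNN for 1x28x28 FashionMNIST images
    model = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=3),   # 64 shared 3x3 feature detectors
        nn.MaxPool2d(2),                   # pooling adds some translation tolerance
        nn.ReLU(),
        nn.Conv2d(64, 128, kernel_size=3),
        nn.MaxPool2d(2),
        nn.ReLU(),
        nn.Flatten(),                      # flatten only after local features are extracted
        nn.Linear(128 * 5 * 5, 256),
        nn.ReLU(),
        nn.Linear(256, 10),
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=1e-3)
    return model, loss_fn, optimizer
```

With a model like this, the Dataset would keep the 2-D image shape and return tensors of shape (1, 28, 28), e.g. `x = x.float()/255` followed by `x = x.view(-1, 1, 28, 28)`, instead of flattening to 784-dimensional vectors up front.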
Conclusion: an MLP treats the image as a vector of fixed positions, completely ignoring the spatial structure of the image and offering no translation invariance; this is its fatal flaw for computer vision tasks.