文章目录 仓库地址 练习:图像自动识别模型 练习:图像自动分割模型
仓库地址
图像识别(image_classification_and_recognition) 图像分割(image_segmentation)
练习:图像自动识别模型
图像识别是指利用计算机对图像进行处理、分析和理解,以识别各种不同模式的目标和对象的技术,是深度学习算法的一种实践应用。
数据集说明
使用MNIST数据集,MNIST数据集来自美国国家标准与技术研究所(National Institute of Standards and Technology,NIST)。该数据集分成训练集(Training Set)和测试集(Test Set)两个部分,其中训练集由来自250个不同人手写的数字构成,其中50%是高中学生,50%来自人口普查局(the Census Bureau)的工作人员,测试集也是同样比例的手写数字数据。
模型训练和保存
导入数据集
import torch
import torchvision
from torch import nn
import torch. utils. data as data
from tqdm import tqdm

# Keyword arguments shared by both MNIST splits: the same on-disk root, the
# same ToTensor transform, and download enabled.
_mnist_common = {
    'root': './data',
    'transform': torchvision.transforms.ToTensor(),
    'download': True,
}

# The `train` flag is the only difference between the two dataset objects.
train_dataset = torchvision.datasets.MNIST(train=True, **_mnist_common)
test_dataset = torchvision.datasets.MNIST(train=False, **_mnist_common)
搭建神经网络
class CNN(nn.Module):
    """Small convolutional classifier with two conv stages and a linear head.

    Each stage is a 3x3 convolution (stride 1, padding 1), a ReLU and a 2x2
    max-pool.  The linear head maps 32 * 7 * 7 flattened features to 10
    outputs, so a 1x28x28 input (halved twice by the pools) fits exactly.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 1 -> 16 channels.
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        # Stage 2: 16 -> 32 channels.
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.output = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        """Return the raw 10-way scores for a batch of images."""
        features = self.conv2(self.conv1(x))
        # Flatten everything except the batch dimension for the linear head.
        flat = features.view(features.size(0), -1)
        return self.output(flat)
训练和保存实现
if __name__ == '__main__':
    # Train the CNN on MNIST, report test accuracy every 50 steps, then save
    # the trained model to ./data in two formats.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LR = 0.001
    EPOCHS = 3
    BATCH_SIZE = 50

    cnn = CNN().to(device)
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)
    loss_func = nn.CrossEntropyLoss()
    train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Build the evaluation tensors directly from the dataset's raw image
    # tensor: add a channel axis at dim 1, cast to float32 and divide by 255.
    test_x = torch.unsqueeze(test_dataset.data, dim=1).type(torch.float32) / 255
    test_y = test_dataset.targets
    test_x, test_y = test_x.to(device), test_y.to(device)

    for epoch in range(EPOCHS):
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader))
        for step, (x, y) in progress_bar:
            x, y = x.to(device), y.to(device)
            output = cnn(x)
            loss = loss_func(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % 50 == 0:
                # Evaluation needs no gradients; no_grad avoids building an
                # autograd graph for the whole test set on every check.
                with torch.no_grad():
                    test_output = cnn(test_x)
                # test_output has one row per sample and one column per
                # class score; the index of the row-wise maximum is the
                # predicted class.
                pred_y = test_output.argmax(dim=1)
                # Fraction of test samples whose prediction equals the label.
                accuracy = (pred_y == test_y).sum().item() / test_y.size(0)
                progress_bar.set_description(
                    f"Epoch {epoch} | Step {step} | Loss: {loss.item():.4f} | test accuracy: {accuracy:.2f}"
                )

    # Two ways of persisting the result:
    # 1. torch.save(cnn, ...) stores the whole model object (structure and
    #    parameters).  It can be restored with torch.load alone, but the file
    #    is larger and tightly coupled to the class definition, so loading
    #    breaks if the model structure changes.
    # 2. torch.save(cnn.state_dict(), ...) stores only the parameters.  It is
    #    lighter and more flexible, but the caller must first build the same
    #    model structure (model = CNN()) and then call load_state_dict().
    # The second form is generally the recommended one.
    torch.save(cnn, './data/image_recognition.pkl')
    torch.save(cnn.state_dict(), './data/image_recognition.pth')
模型测试
测试代码
image_recognition_test.py
import torch
import torchvision
from torch import nn
import torch. utils. data as data
class CNN(nn.Module):
    """Convolutional classifier; must match the structure used for training.

    Two stages of 3x3 convolution, ReLU and 2x2 max-pool (1 -> 16 -> 32
    channels) feed a linear layer that maps 32 * 7 * 7 features to 10 scores.
    Attribute names and layer order determine the state_dict keys, so they
    have to stay identical to the training script for the weights to load.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.output = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        """Return the raw class scores for a batch of images."""
        out = self.conv1(x)
        out = self.conv2(out)
        # Flatten everything except the batch dimension.
        out = out.view(out.size(0), -1)
        out = self.output(out)
        return out


def compute_accuracy(model, inputs, targets):
    """Return the fraction of `inputs` that `model` classifies as `targets`.

    The model is switched to evaluation mode and run without gradient
    tracking.  The predicted class of each row is the index of its largest
    score along dim 1.

    :param model: an nn.Module mapping `inputs` to per-class scores
    :param inputs: tensor holding the batch to classify
    :param targets: 1-D tensor of integer class labels, one per input
    :return: accuracy as a Python float in [0, 1]
    """
    model.eval()
    with torch.no_grad():
        predictions = model(inputs).argmax(dim=1)
    return (predictions == targets).sum().item() / targets.size(0)


if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_dataset = torchvision.datasets.MNIST(
        root='./data',
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Add a channel axis at dim 1, cast to float32 and divide by 255, the
    # same preparation the training script applies to its evaluation data.
    test_x = torch.unsqueeze(test_dataset.data, dim=1).type(torch.float32) / 255
    test_y = test_dataset.targets
    test_x, test_y = test_x.to(device), test_y.to(device)

    cnn_test = CNN().to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa).
    cnn_test.load_state_dict(torch.load("./data/image_recognition.pth", map_location=device))

    accuracy = compute_accuracy(cnn_test, test_x, test_y)
    print('准确率', accuracy)
测试结果
练习:图像自动分割模型
图像分割就是指把图像分成若干特定的、具有独特性质的区域并提出感兴趣的目标的技术和过程。为了更好地理解和应用图像自动分割技术,本练习将搭建、训练并测试一个由卷积编码器和反卷积解码器组成的分割模型。
模型训练和保存
加载数据集
import os
import torch
import cv2
import numpy as np
from torch import nn
import torch. optim as optim
from torch. utils. data import Dataset, DataLoader
from torchvision import transforms


class MyDataset(Dataset):
    """Segmentation dataset pairing images in `last/` with masks in `last_msk/`.

    Each item is `(image, label)`: the image resized to 224x224 (and passed
    through `transform` when one is given), and the mask as a float32 array
    of shape (2, 224, 224).
    """

    def __init__(self, train_path, transform=None):
        """Collect the image/mask path pairs.

        :param train_path: directory containing the `last` and `last_msk`
            sub-directories
        :param transform: optional callable applied to each loaded image
        """
        image_dir = os.path.join(train_path, 'last')
        label_dir = os.path.join(train_path, 'last_msk')
        # os.listdir returns entries in arbitrary order, so two independent
        # listings may not line up.  Sorting both makes the index-based
        # pairing deterministic.
        # NOTE(review): this presumes an image and its mask sort to the same
        # position (e.g. share a file name) - confirm against the data.
        self.images = sorted(os.listdir(image_dir))
        self.labels = sorted(os.listdir(label_dir))
        assert len(self.images) == len(self.labels), '图像数量与标签数量不匹配'
        self.transform = transform
        self.images_and_labels = [
            (os.path.join(image_dir, image_name), os.path.join(label_dir, label_name))
            for image_name, label_name in zip(self.images, self.labels)
        ]

    def __getitem__(self, item):
        """Load, resize and encode the `item`-th image/mask pair.

        :raises FileNotFoundError: if either file cannot be read by OpenCV
        """
        image_path, label_path = self.images_and_labels[item]
        image = cv2.imread(image_path)
        # Flag 0 loads the mask as a single-channel grayscale image.
        label = cv2.imread(label_path, 0)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError('cannot read image: ' + image_path)
        if label is None:
            raise FileNotFoundError('cannot read label: ' + label_path)

        image = cv2.resize(image, (224, 224))
        label = cv2.resize(label, (224, 224))
        # Dividing by 255 and truncating to uint8 keeps only pixels equal to
        # 255 as class 1; every other value becomes class 0.
        label = (label / 255).astype('uint8')
        # One-hot encode to (224, 224, 2), then flip 0 <-> 1 so channel 0 is
        # set where the class is 1 and channel 1 where it is 0.
        one_hot = np.eye(2)[label]
        label = np.abs(one_hot - 1).astype('float32')
        # Move the channel axis first: (H, W, C) -> (C, H, W).
        label = label.transpose(2, 0, 1)

        if self.transform is not None:
            image = self.transform(image)
        return image, label

    def __len__(self):
        """Return the number of image/mask pairs."""
        return len(self.images)
搭建神经网络
"""
输入图像大小为3×224×224;卷积部分使用的是VGG11模型;经过第5个最大池化后开始上采样;经过5个反卷积层还原成图像大小。
"""
def _conv_bn_relu(in_channels, out_channels):
    """Return the layers [3x3 conv (stride 1, padding 1), BatchNorm, ReLU]."""
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]


def _encoder_stage(*channels):
    """Chain conv/BN/ReLU groups through `channels`, then a 2x2 max-pool."""
    layers = []
    for c_in, c_out in zip(channels, channels[1:]):
        layers.extend(_conv_bn_relu(c_in, c_out))
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)


def _decoder_stage(in_channels, out_channels):
    """Return a stride-2 transposed conv followed by BatchNorm and ReLU."""
    return nn.Sequential(
        nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            output_padding=1,
        ),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    )


class Net(nn.Module):
    """Encoder-decoder segmentation network with a 2-channel output.

    Five encoder stages each end in a stride-2 max-pool; five decoder stages
    each use a stride-2 transposed convolution, and a final 1x1 convolution
    produces the two output channels.
    """

    def __init__(self):
        super().__init__()
        # Encoder: 3 -> 64 -> 128 -> 256 -> 512 -> 512 channels.
        self.encoder_one = _encoder_stage(3, 64)
        self.encoder_two = _encoder_stage(64, 128)
        self.encoder_three = _encoder_stage(128, 256, 256)
        self.encoder_four = _encoder_stage(256, 512, 512)
        self.encoder_five = _encoder_stage(512, 512, 512)
        # Decoder: 512 -> 256 -> 128 -> 64 -> 32 -> 16 channels.
        self.decoder_one = _decoder_stage(512, 256)
        self.decoder_two = _decoder_stage(256, 128)
        self.decoder_three = _decoder_stage(128, 64)
        self.decoder_four = _decoder_stage(64, 32)
        self.decoder_five = _decoder_stage(32, 16)
        self.classifier = nn.Conv2d(in_channels=16, out_channels=2, kernel_size=1)

    def forward(self, x):
        """Run `x` through all encoder stages, decoder stages and the classifier."""
        pipeline = (
            self.encoder_one,
            self.encoder_two,
            self.encoder_three,
            self.encoder_four,
            self.encoder_five,
            self.decoder_one,
            self.decoder_two,
            self.decoder_three,
            self.decoder_four,
            self.decoder_five,
            self.classifier,
        )
        out = x
        for stage in pipeline:
            out = stage(out)
        return out
训练和保存
if __name__ == '__main__':
    # Train the segmentation network and save a checkpoint every 200 epochs.
    #
    # Manual sanity checks kept from development (not executed):
    #
    #   Label encoding:
    #     image = cv2.imread('./data/train/last_msk/50.jpg', 0)
    #     image = cv2.resize(image, (16, 16))
    #     image_2 = image / 255
    #     image_3 = image_2.astype('uint8')
    #     hot_1 = np.eye(2)[image_3]
    #     hot_2 = np.array(list(map(lambda x: abs(x - 1), hot_1)))
    #     print(hot_2.shape)  # (16, 16, 2)
    #
    #   Network output shape:
    #     img = torch.randn(2, 3, 224, 224)  # random image batch
    #     net = Net()
    #     sample = net(img)                  # forward pass
    #     print(sample.shape)                # torch.Size([2, 2, 224, 224])
    batch_size = 8
    epochs = 2000
    train_data_path = './data/train'

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    bag = MyDataset(train_data_path, transform)
    data_loader = DataLoader(bag, batch_size=batch_size, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = Net().to(device)
    # BCEWithLogitsLoss applies the sigmoid internally and is numerically
    # more stable than torch.sigmoid followed by BCELoss, so the network's
    # raw outputs are passed to it directly.
    loss_func = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(net.parameters(), lr=1e-2, momentum=0.7)

    os.makedirs('./model', exist_ok=True)

    net.train()
    for epoch in range(1, epochs + 1):
        # Running count of samples processed this epoch.  Accumulating real
        # batch sizes keeps the progress figure right for a short last batch.
        seen = 0
        for i, (img, label) in enumerate(data_loader):
            img, label = img.to(device), label.to(device)
            loss = loss_func(net(img), label)
            seen += len(img)
            if i % 20 == 0:
                print('Epoch:[{}/{}]\t Step:[{}/{}]\t Loss:{:.6f}'.format(
                    epoch, epochs, seen, len(data_loader.dataset), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 200 == 0:
            torch.save(net.state_dict(), './model/model_epoch_{}.pth'.format(epoch))
            print('./model/model_epoch_{}.pth saved!'.format(epoch))
模型测试
测试代码
image_segmentation_test.py
import os
import torch
import cv2
import numpy as np
from torch import nn
import torch. optim as optim
from torch. utils. data import Dataset, DataLoader
from torchvision import transforms


def _conv_bn_relu(in_channels, out_channels):
    """Return the layers [3x3 conv (stride 1, padding 1), BatchNorm, ReLU]."""
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]


def _encoder_stage(*channels):
    """Chain conv/BN/ReLU groups through `channels`, then a 2x2 max-pool."""
    layers = []
    for c_in, c_out in zip(channels, channels[1:]):
        layers.extend(_conv_bn_relu(c_in, c_out))
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)


def _decoder_stage(in_channels, out_channels):
    """Return a stride-2 transposed conv followed by BatchNorm and ReLU."""
    return nn.Sequential(
        nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            output_padding=1,
        ),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    )


class Net(nn.Module):
    """Encoder-decoder segmentation network with a 2-channel output.

    Attribute names and the layer order inside every Sequential determine
    the state_dict keys, so they are kept exactly as in the original layout
    and previously saved checkpoints still load.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.encoder_one = _encoder_stage(3, 64)
        self.encoder_two = _encoder_stage(64, 128)
        self.encoder_three = _encoder_stage(128, 256, 256)
        self.encoder_four = _encoder_stage(256, 512, 512)
        self.encoder_five = _encoder_stage(512, 512, 512)
        self.decoder_one = _decoder_stage(512, 256)
        self.decoder_two = _decoder_stage(256, 128)
        self.decoder_three = _decoder_stage(128, 64)
        self.decoder_four = _decoder_stage(64, 32)
        self.decoder_five = _decoder_stage(32, 16)
        self.classifier = nn.Conv2d(in_channels=16, out_channels=2, kernel_size=1)

    def forward(self, x):
        """Run `x` through the encoder, the decoder and the 1x1 classifier."""
        out = self.encoder_one(x)
        out = self.encoder_two(out)
        out = self.encoder_three(out)
        out = self.encoder_four(out)
        out = self.encoder_five(out)
        out = self.decoder_one(out)
        out = self.decoder_two(out)
        out = self.decoder_three(out)
        out = self.decoder_four(out)
        out = self.decoder_five(out)
        out = self.classifier(out)
        return out


class TestDataset(Dataset):
    """Dataset yielding every image in a directory, resized to 224x224."""

    def __init__(self, test_img_path, transform=None):
        """Collect the image paths.

        :param test_img_path: directory holding the test images
        :param transform: optional callable applied to each loaded image
        """
        # os.listdir order is arbitrary; sorting makes the numbering of the
        # saved results reproducible between runs.
        self.test_img = sorted(os.listdir(test_img_path))
        self.transform = transform
        self.images = [os.path.join(test_img_path, name) for name in self.test_img]

    def __getitem__(self, index):
        """Load and resize the `index`-th image.

        :raises FileNotFoundError: if the file cannot be read by OpenCV
        """
        img_path = self.images[index]
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise FileNotFoundError('cannot read image: ' + img_path)
        img = cv2.resize(img, (224, 224))
        if self.transform is not None:
            img = self.transform(img)
        return img

    def __len__(self):
        """Return the number of test images."""
        return len(self.images)


if __name__ == '__main__':
    test_img_path = './data/test/last'
    check_point_path = './model/model_epoch_200.pth'
    save_dir = './data/test/result'
    os.makedirs(save_dir, exist_ok=True)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    bag = TestDataset(test_img_path, transform)
    test_loader = DataLoader(bag, batch_size=1, shuffle=False)

    model = Net()
    # The model stays on the CPU here, so map the checkpoint to the CPU as
    # well; otherwise a checkpoint saved from a GPU run cannot be loaded on
    # a machine without CUDA.
    model.load_state_dict(torch.load(check_point_path, map_location='cpu'))
    # Evaluation mode makes BatchNorm use its stored running statistics
    # instead of the statistics of the single image in each batch.
    model.eval()

    with torch.no_grad():
        for i, img in enumerate(test_loader):
            output = torch.sigmoid(model(img))
            output_np = output.cpu().numpy()
            # Pick, per pixel, the channel with the lower score.  The index
            # is 1 where channel 0 scores higher, and those pixels become
            # 255 in the saved mask.
            output_np = np.argmin(output_np, axis=1)
            img_arr = np.squeeze(output_np)
            img_arr = (img_arr * 255).astype(np.uint8)
            out_path = '%s/%03d.png' % (save_dir, i)
            cv2.imwrite(out_path, img_arr)
            print(out_path)
测试效果