基于PyTorch的YOLOv5目标检测 - 教程

news/2025/11/28 17:54:17/文章来源:https://www.cnblogs.com/yangykaifa/p/19283662
import argparse
import logging
import os
import random
import shutil
import time
from pathlib import Path
import math
import numpy as np
import torch.distributed as dist
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.utils.data
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import test  # import test.py to get mAP after each epoch
from models.yolo import Model
from utils.datasets import create_dataloader
from utils.general import (torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors, labels_to_image_weights,compute_loss, plot_images, fitness, strip_optimizer, plot_results, get_latest_run, check_dataset, check_file,check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution, set_logging, init_seeds)
from utils.google_utils import attempt_download
from utils.torch_utils import ModelEMA, select_device, intersect_dicts
logger = logging.getLogger(__name__)
def train(hyp, opt, device, tb_writer=None):
    """Run one full YOLOv5 training session.

    Args:
        hyp: dict of hyperparameters loaded from a hyp.*.yaml file.
        opt: argparse.Namespace with all command-line options.
        device: torch.device to train on.
        tb_writer: optional TensorBoard SummaryWriter; also determines log_dir.

    Returns:
        results tuple (P, R, mAP@.5, mAP@.5:.95, val box/obj/cls losses) of the
        final epoch.
    """
    logger.info(f'Hyperparameters {hyp}')
    # Everything (hyperparameters, per-epoch metrics, losses, test results) is saved under log_dir
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = str(log_dir / 'results.txt')  # per-epoch metrics log
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings so the run is reproducible / resumable
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)  # per-rank random seed
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):  # all ranks wait for rank 0
        check_dataset(data_dict)  # check
    train_path = data_dict['train']  # dataset paths and class names
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        # Pretrained weights are auto-downloaded if missing (best to pre-download from GitHub)
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create from cfg yaml

    # Freeze: optionally freeze named layers for transfer learning
    freeze = ['', ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size: gradients are accumulated nbs/total_batch_size steps (e.g. 64/16=4)
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups: other / weights / biases
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:  # optimizer selection
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume. best_fitness is sum([0.0, 0.0, 0.1, 0.9] * [P, R, mAP@0.5, mAP@0.5:0.95]),
    # a single weighted score used to rank checkpoints.
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']
        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt
        # Epochs already trained
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
            shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}')  # save previous weights
        if epochs < start_epoch:
            # e.g. requested 100 epochs but model already trained 150 -> train 100 more
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs
        del ckpt, state_dict

    # Image sizes: gs is the max stride; input sizes must be multiples of it
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode (multiple GPUs, single process): https://github.com/ultralytics/yolov5/issues/475
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm: synchronize BN statistics across GPUs in DDP mode
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average: smooths parameter updates
    # reference: https://www.jianshu.com/p/f99f982ad370
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode: avoids the load imbalance DP can show (GPU 0 saturated, others idle)
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect,
                                            rank=rank, world_size=opt.world_size, workers=opt.workers)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class; sanity-check vs nc
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Process 0 (or single-GPU) only: test loader, label plots, autoanchor
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,
                                       hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True,
                                       rank=-1, world_size=opt.world_size, workers=opt.workers)[0]  # testloader
        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    # Per-class weights derived from the label distribution
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    # Mixed precision (fp16/fp32), torch>=1.6: https://pytorch.org/docs/stable/amp.html
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\n'
                'Using %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:  # DDP: different shuffle seed each epoch
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup: ramp lr/momentum linearly for the first nw iterations
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0; lf is the cosine decay
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale: random size in [imgsz * 0.5, imgsz * 1.5 + gs], gs-multiple
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward (mixed precision)
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                # total loss + (box, obj, cls) components
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize: step only every `accumulate` backward passes
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot first few batches
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler: per-epoch lr decay
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir,
                                                 plots=epoch == 0 or final_epoch)  # plot first and last

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:  # optional upload to GCS
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                        'x/lr0', 'x/lr1', 'x/lr2']  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers from saved checkpoints to shrink them for deployment
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')  # network config
    # parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
    parser.add_argument('--data', type=str, default='C:/Users/27623/Downloads/sitting_pose/sitting_pose/data.yaml',
                        help='data.yaml path')  # dataset config
    parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path')
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--notest', action='store_true', help='only test final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
    parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache-images', action='store_true', help='cache images for faster training')
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    parser.add_argument('--name', default='', help='renames experiment folder exp{N} to exp{N}_{name} if supplied')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
    parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
    parser.add_argument('--workers', type=int, default=0, help='maximum number of dataloader workers')  # keep 0 on Windows
    opt = parser.parse_args()

    # Set DDP variables: WORLD_SIZE = number of processes, RANK = this process's id
    opt.total_batch_size = opt.batch_size
    opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
    opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
    set_logging(opt.global_rank)
    if opt.global_rank in [-1, 0]:
        check_git_status()

    # Resume an interrupted run
    if opt.resume:
        # Use the given checkpoint path, or the most recent run's last.pt
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
        log_dir = Path(ckpt).parent.parent  # runs/exp0
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        with open(log_dir / 'opt.yaml') as f:
            opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader))  # replace opt with the saved one
        opt.cfg, opt.weights, opt.resume = '', ckpt, True
        logger.info('Resuming training from %s' % ckpt)
    else:
        # Fresh run: validate config files and set up a new experiment directory
        # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
        opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
        log_dir = increment_dir(Path(opt.logdir) / 'exp', opt.name)  # runs/exp1

    device = select_device(opt.device, batch_size=opt.batch_size)

    # DDP mode (distributed training; skip if single GPU)
    if opt.local_rank != -1:
        assert torch.cuda.device_count() > opt.local_rank
        torch.cuda.set_device(opt.local_rank)  # pin this process to its GPU
        device = torch.device('cuda', opt.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        opt.batch_size = opt.total_batch_size // opt.world_size

    logger.info(opt)
    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps

    # Train
    if not opt.evolve:
        tb_writer = None
        if opt.global_rank in [-1, 0]:
            logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
            tb_writer = SummaryWriter(log_dir=log_dir)  # runs/exp0
        train(hyp, opt, device, tb_writer)

    # Evolve hyperparameters (optional); see https://github.com/ultralytics/yolov3/issues/392
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                'mixup': (1, 0.0, 1.0)}  # image mixup (probability)

        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
        opt.notest, opt.nosave = True, True  # only test/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        yaml_file = Path(opt.logdir) / 'evolve' / 'hyp_evolved.yaml'  # save best result here
        if opt.bucket:
            os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket)  # download evolve.txt if exists

        for _ in range(300):  # generations to evolve
            if os.path.exists('evolve.txt'):  # if evolve.txt exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt('evolve.txt', ndmin=2)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min()  # weights
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([x[0] for x in meta.values()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate

            # Constrain to limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device)

            # Write mutation results
            print_mutation(hyp.copy(), results, yaml_file, opt.bucket)

        # Plot results
        plot_evolution(yaml_file)
        print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n'
              f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}')

基于 PyTorch 实现的 YOLOv5 目标检测模型训练框架,支持单 / 多 GPU 训练、超参数进化、迁移学习等核心能力,可直接用于自定义目标检测数据集的模型训练与优化。本文解析的就是这份训练代码。

一、核心功能与定位​

该代码的核心目标是提供一套完整、可配置的 YOLOv5 训练流程,降低目标检测模型的开发门槛。核心能力覆盖从数据准备到模型部署前的全流程,具体包括:​

  1. 灵活的训练配置:支持自定义数据集路径、训练轮次、batch size、输入图像尺寸等关键参数。
  1. 多硬件与分布式支持:兼容 CPU/GPU 训练,支持单 GPU、多 GPU(DP 模式)、多机多卡(DDP 模式),并可通过 SyncBatchNorm 实现跨 GPU 的 BatchNorm 同步。
  1. 训练优化策略:内置混合精度训练(AMP)、学习率余弦衰减、模型指数移动平均(EMA)、多尺度训练等提升训练效率与模型性能的策略。​
  1. 超参数进化:通过 --evolve 参数启动超参数搜索,基于历史训练结果迭代优化超参数(如学习率、损失权重、数据增强强度等)。
  1. 完整的日志与可视化:自动保存训练日志(损失、精度、mAP 等)、生成 TensorBoard 可视化文件、绘制训练结果图(标签分布、训练曲线等)。​
  1. 模型管理:自动保存last.pt(最新模型)和best.pt(最优 mAP 模型),支持训练中断后从断点恢复(--resume)。​

二、核心模块与工作流程​

代码通过模块化设计实现训练流程解耦,核心模块及执行顺序如下:​

1. 参数解析(__main__函数)​

  • 通过 argparse 定义所有可配置参数(如权重路径、数据集路径、训练轮次等),默认参数适配 YOLOv5 基础训练场景,用户可通过命令行修改。
  • 特殊处理:自动检测分布式训练环境(WORLD_SIZE、RANK)、校验数据集 / 配置文件路径、初始化日志系统。​

2. 数据准备(utils.datasets.create_dataloader)​

  • 加载自定义数据集(需提前按 YOLO 格式准备,通过data.yaml指定训练 / 测试集路径与类别)。​
  • 数据增强:默认支持 Mosaic 拼接、HSV 颜色抖动、旋转 / 平移 / 缩放 / 剪切、上下翻转等,提升模型泛化能力。​
  • 输入适配:自动校验图像尺寸是否为模型步长(gs,默认 32)的整数倍,多尺度训练时随机调整输入尺寸(--multi-scale)。​

3. 模型构建与初始化(models.yolo.Model)​

  • 支持两种初始化方式:​
  • 加载预训练权重(如yolov5s.pt):自动下载缺失权重,仅加载与当前模型匹配的参数(排除锚点等无需继承的参数)。​
  • 从零构建:通过--cfg指定模型配置文件(如yolov5s.yaml),按设置定义网络结构(Backbone、Neck、Head)。​
  • 模型适配:自动设置类别数(nc)、类别名称(names)、类别权重(基于数据集标签分布),支持单类别训练(--single-cls)。

4. 训练核心逻辑(train函数)​

训练流程按 “epoch → batch” 两层循环执行,关键步骤包括:​

  1. 热身训练(Warmup):训练初期(前nw次迭代)线性提升学习率与动量,避免模型初期震荡。​
  1. 前向传播:通过混合精度训练(amp.autocast)加速计算,输出模型预测结果。​
  1. 损失计算:调用compute_loss计算总损失,涵盖边界框回归损失(box loss)、目标置信度损失(obj loss)、类别分类损失(cls loss)。​
  1. 反向传播与优化:按累计梯度(accumulate)更新参数(默认累计 4 次 batch 更新 1 次,模拟大 batch 效果),同步更新 EMA 模型。​
  1. 验证与日志:每个 epoch 结束后,调用test.test计算测试集 mAP(均值平均精度),将训练指标(损失、精度、学习率)写入日志与 TensorBoard。​

5. 模型保存与收尾​

  • 训练结束后,自动剥离模型优化器参数(strip_optimizer),减小模型文件体积,便于部署。​
  • 生成训练结果可视化图(plot_results),展示训练 / 验证损失曲线、mAP 变化曲线。​

三、关键配置与使用场景​​

核心参数​

作用​

适用场景​

--weights​

预训练权重路径(如yolov5s.pt)​

迁移学习(快速收敛,提升小数据集性能)​

--data​

数据集配置文件(data.yaml)

自定义数据集训练(指定数据路径与类别)​

--epochs​

训练轮次​

小数据集设 50-100,大数据集设 200-300​

--multi-scale​

多尺度训练(图像尺寸 ±50% 波动)​

提升模型对不同尺寸目标的检测能力​

--evolve​

超参数进化(300 代迭代优化)​

追求最优模型性能,需大量计算资源​

--resume​

断点续训​

训练中断后恢复(无需重新训练)​

--sync-bn​

跨 GPU BatchNorm 同步​

多机多卡训练(消除 GPU 负载不均衡问题)​

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/979968.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

2025年工业冷风机车间降温技术全解析,铁皮棚车间降温/焊装车间通风降温/压铸车间通风降温/机械厂车间降温工业冷风机厂商推荐排行

技术背景:工业降温的绿色转型之路 随着全球制造业的快速发展,工业车间环境优化已成为企业提升生产效率、保障员工健康的重要课题。传统空调系统在工业场景中面临能耗高、覆盖范围有限等挑战,而基于蒸发冷却原理的蓝…

2025年长沙公务员面试机构排行榜:前五强揭晓,湖南长沙公务员面试排行优质品牌选购指南

专业视角下的长沙公考面试机构深度解析 随着2025年长沙公务员招录工作的推进,面试环节的备考需求持续升温。本文基于市场调研数据与机构服务表现,从师资实力、服务体系、教学成果等维度,对长沙地区公务员面试培训机…

车间降温新方案:工业冷风机2025年趋势,五金车间通风降温/焊装车间通风降温/钢构车间通风降温/制造业车间通风降温工业冷风机企业口碑推荐

技术背景:工业降温的绿色转型 随着全球制造业对节能环保要求的不断提升,传统空调系统在工业车间应用中的局限性日益凸显。高能耗、运行成本昂贵、难以适应大面积开放空间等问题,促使行业寻求更可持续的降温解决方案…

详细介绍:【k8s】Deployment、StatefulSet、DaemonSet

详细介绍:【k8s】Deployment、StatefulSet、DaemonSetpre { white-space: pre !important; word-wrap: normal !important; overflow-x: auto !important; display: block !important; font-family: "Consolas&q…

2025非遗新中式品牌排行TOP榜,这些品牌正走红,非遗新中式推荐排行色麦新中式层层把关品质优

非遗新中式行业格局深度解析 近年来,非遗新中式赛道呈现蓬勃发展态势,众多品牌凭借独特的产品定位与文化表达崭露头角。据行业数据显示,2024年非遗新中式市场规模同比增长42%,其中女装品类表现尤为突出。本报告基于…

2025年高定服装加盟品牌市场占有率排行,高定服装加盟采购色麦新中式市场认可度高

行业洞察:高定服装加盟市场格局分析 随着消费升级与文化自信的崛起,高定服装加盟市场正迎来新一轮发展机遇。据行业数据显示,2025年高定服装加盟市场规模预计将达到百亿级别,其中新中式风格成为增长最快的细分赛道…

2025国风源头排行榜发布,这些品牌上榜!,国风源头推荐排行技术领航者深度解析

国风产业迎来新格局,非遗供应链成核心竞争力 随着国风消费市场持续升温,2025年国风源头企业竞争格局已初步显现。据最新行业调研数据显示,具备完整非遗供应链体系的企业正以年均30%的增速领跑市场,其中深耕宋式美学…

2025年工业冷风机维护保养全攻略,延长设备使用寿命,铁皮房车间降温/高大车间厂房通风降温/炼钢车间通风降温工业冷风机公司口碑推荐

技术背景:蓝美达工业冷风机在车间环境优化中的关键作用 随着制造业转型升级步伐加快,蓝美达工业冷风机作为车间环境优化的关键设备,其技术性能与运行稳定性直接影响生产效率和能耗水平。工业冷风机通过蒸发冷却原理…

2025年新中式高定服装加盟五大推荐品牌,新中式高定服装加盟批发精选优质厂家

行业洞察:新中式高定服装市场迎来发展新机遇 随着国潮文化的持续升温,新中式高定服装市场展现出强劲的增长态势。据行业数据显示,近三年来该细分领域年均复合增长率超过25%,成为服装加盟领域最具潜力的赛道之一。本…

2025年靠谱的四川变形缝_变形缝厂家实力及口碑推荐榜

2025 年靠谱的四川变形缝 / 变形缝厂家实力及口碑推荐榜随着四川城镇化进程加速,超高层建筑、商业综合体与公共设施项目密集落地,变形缝作为保障建筑结构安全与防水性能的核心部件,市场需求持续攀升。据四川省建筑装…

2025年国内正规的AGV货架批发厂家找哪家,悬臂货架/高位货架/精益管料架/贯通货架/牛脚式货架/可调节货架/冷库货架AGV货架产品选哪家

行业背景与评选标准 随着智能仓储需求的持续增长,AGV货架作为自动化仓储系统的核心组成部分,其质量与性能直接影响整体运营效率。本文基于企业规模、技术实力、客户案例及市场口碑等维度,对国内AGV货架领域具备正规…

HTML基础--面向后端

简单的HTML基础知识,帮助后端开发者了解前端的基本语法HTML页面 <!DOCTYPE html> <html><head><meta charset="UTF-8"><title>Hello World</title><body><h1…

2025年长沙公务员面试选哪家?最新排名出炉,湖南长沙公务员面试赋能企业生产效率提升与成本优化

行业背景分析 随着2025年长沙公务员招录工作的全面启动,面试备考已进入关键阶段。作为公务员招录的重要环节,面试成绩往往直接影响最终录取结果。近年来,长沙公务员面试培训市场呈现出专业化、精细化的发展趋势,各…

2025年工业冷风机车间降温技术全解析,电炉车间通风降温/钢结构车间夏季降温/机械厂车间降温/电镀车间通风降温工业冷风机生产厂家哪家好

技术背景:蓝美达工业降温的绿色转型之路 随着全球制造业绿色转型加速,工业车间环境优化已成为企业可持续发展的重要课题。在高温作业环境中,传统空调系统因能耗高、覆盖范围有限等局限性,难以满足大面积工业厂房的…

中考前最后一个假期!选这些数学老师带你冲刺寒假

中考前的最后一个寒假,是数学提分的“黄金逆袭期”——既能集中补全过往知识漏洞,又能提前攻克中考核心难点,为春季冲刺筑牢根基。但不少初三学生陷入“在家复习没方向、刷题越刷越迷茫”的困境,而选对线上老师,就…

2025中国HR SaaS厂商AI进度大盘点

2025年,中国HR SaaS市场正经历一场由人工智能驱动的深刻变革。AI不再是简单的效率工具,而是跃升为驱动企业人力资源战略转型和价值创造的核心引擎。在这场竞速中,各主流厂商纷纷亮出底牌,从AI赋能的广度、深度、底…

SQL Server设置用户查看指定数据库的指定表单

1、创建新登录用户CREATE LOGIN financeUser WITH PASSWORD = Password!; GOALTER LOGIN financeUser WITH DEFAULT_DATABASE = myDB; GO 注意:一定要将目标数据库定义为用户登录后的默认数据库;2、移除用户查看全部…

网页打包EXE/APK/IPA出现乱码快速解决方案

最近有几个朋友在把网页项目打包成 EXE(电脑程序)、APK(安卓应用) 或 IPA(苹果应用) 软件时,会遇到一个恼人的问题: 页面出现乱码, 原本在浏览器中一切正常,文字清晰可读,但一打包成应用,界面上的中文就成了一串奇…

注重交流的成人英语课程怎么选?五大维度深度解析

在新加坡生活或工作的成年人,常常面临一个现实困境: 词汇量不小,语法也懂,但一到真实对话场景就卡壳、紧张、不敢开口。 这不是语言知识的问题,而是缺乏以“交流”为核心的训练环境。市面上许多英语课程仍停留在“…

气体分析仪厂家综合实力榜:2025年度十大气体分析仪厂家排名,权威榜单+技术数据实证

在工业自动化、环保监测、化工安全等领域,气体分析仪作为关键检测设备,其精度、稳定性与可靠性直接影响生产安全与效率。然而,市场上厂家众多、技术参差不齐、认证标准不一,导致用户在选择时常面临“质量虚标、服务…