# 五折交叉验证因其无放回分层抽样和重复验证机制,成为超参数调优的首选;
# 而 Bootstrap 因有放回抽样带来的样本重复以及验证集的不稳定性,主要服务于参数估计(如置信区间的计算)而非调优。
# 实际应用中,可结合两者优势:先用交叉验证调参,再用 Bootstrap 评估模型参数的置信区间。
# 示例:rsmp("bootstrap", repeats = 20);正式运行时将 repeats 设置为 1000。
# Random-forest classifier (ranger backend). Hyperparameters wrapped in
# to_tune() are left open for the tuner; the remaining arguments are fixed.
learner_rf <- lrn(
  "classif.ranger",
  num.threads = 6, # OK
  # Candidate tree counts: 100, 150, ..., 1500.
  # Alternative kept from the original: p_int(lower = 100, upper = 1000)
  num.trees = to_tune(seq(100, 1500, by = 50)),
  mtry = to_tune(p_int(lower = 1, upper = 8)),
  min.node.size = to_tune(p_int(lower = 1, upper = 10)),
  max.depth = to_tune(p_int(3, 10)),
  importance = "impurity",
  # Keep predictions for both the training and the test rows of each split,
  # so that train and test ROC curves can be drawn later.
  predict_sets = c("train", "test"),
  predict_type = "prob"
)
#
# Auto-tuner around learner_rf: random search scored by AUC under 5-fold
# cross-validation, stopped after a budget of 10 evaluations.
rf_at <- auto_tuner(
  tuner = tnr("random_search"),
  learner = learner_rf,
  resampling = rsmp("cv", folds = 5),
  # resampling = rsmp("bootstrap", repeats = 100),
  measure = msr("classif.auc"),
  terminator = trm("evals", n_evals = 10, k = 0)
)
# Search for the best hyperparameters; tuning only sees the rows in train_id.
# future::plan("multisession", workers = 7) # OK
# set_threads(learner_rf, n = 7)
rf_at$train(task, row_ids = train_id)

# Show the tuning result stored on the auto-tuner (best configuration found).
rf_at$tuning_result

# Show the learner parameter values of that best configuration.
rf_at$tuning_result$learner_param_vals[[1]]

# Once tuning has finished, write the best values back into the base learner.
learner_rf$param_set$values <- rf_at$tuning_result$learner_param_vals[[1]]
# Add the tuned learner to the learner collection and give it a short id.
# NOTE(review): mlr3 learners are presumably R6 reference objects, so the
# rename below would also change learner_rf$id - confirm this is intended.
learners[["classif.ranger"]] <- learner_rf
learners[["classif.ranger"]]$id <- "RF" # "RandomForest"

# Next: a single evaluation on one train/test split of the data.
# Fit the tuned learner on the training rows, then draw ROC curves for the
# training rows and for the held-out test rows.
learner_rf$train(task, row_ids = train_id)

# Predictions on the rows the model was fitted on (optimistic by nature).
pre <- learner_rf$predict(task = task, row_ids = train_id)
# The stray `undebug(autoplot)` debugging leftover was removed here: it has
# no effect on the analysis and only raises a warning when autoplot is not
# currently flagged for debugging.
autoplot(pre, type = "roc")

# Predictions on the held-out rows.
pre_test <- learner_rf$predict(task = task, row_ids = test_id)
autoplot(pre_test, type = "roc")

# The confidence-interval computation that follows relies on bootstrap
# resampling.
# Bootstrap resampling of the tuned learner (20 repeats), keeping the fitted
# models and the data backends in the result object.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
rr <- resample(
  task,
  learner_rf,
  resampling = rsmp("bootstrap", repeats = 20),
  store_models = TRUE,
  store_backends = TRUE
)

# Convert the stored predictions to data.tables, one per predict set.
# Note that pre_test is overwritten here: from this point on it holds the
# resampling "test" predictions, not the single hold-out predictions.
# NOTE(review): resample() is given the whole task rather than train_id, so
# these splits presumably include the test_id rows as well - confirm.
pre_test <- as.data.table(rr$prediction("test"))
pre_train <- as.data.table(rr$prediction("train"))

# NOTE(review): `pre2` is not defined anywhere in the visible part of this
# script; this inspection call may have been meant for pre_test or
# pre_train - confirm.
names(pre2)
library(pROC)
# ROC curve built from the resampling "train" predictions.
# NOTE(review): prob.Case is presumably the predicted probability of the
# class labelled "Case" - confirm against the task's class levels.
roc_obj_train <- roc(
  response = pre_train[["truth"]],
  predictor = pre_train[["prob.Case"]]
)
pROC::ggroc(roc_obj_train)

# ROC curve built from the resampling "test" predictions.
roc_obj <- roc(
  response = pre_test[["truth"]],
  predictor = pre_test[["prob.Case"]]
)
pROC::ggroc(roc_obj)
# Confidence interval of the AUC for the training ROC curve.
ci.auc(roc_obj_train)

# Named list of ROC objects; the names become the legend entries.
# NOTE(review): "tf Set" looks like a typo for "Test Set" - left unchanged
# because it is a label that appears in the figure; confirm and fix.
roc_list <- list("tf Set" = roc_obj, "Train Set" = roc_obj_train)

# Combined train/test ROC figure.
# The plot is now stored in p_train_id_roc. Previously it was built but never
# assigned, while the export below referred to p_train_id_roc, so the export
# used a stale or undefined object instead of this figure.
# NOTE(review): with ggroc()'s default axes the x axis presumably shows
# specificity running from 1 down to 0, which does not match the
# "1 - Specificity" label - confirm whether legacy.axes = TRUE was intended.
p_train_id_roc <- ggroc(roc_list, alpha = 0.8, linewidth = 1) +
  # Diagonal reference line.
  geom_segment(
    aes(x = 1, xend = 0, y = 0, yend = 1),
    color = "gray40",
    linetype = "dashed",
    linewidth = 0.3
  ) +
  # Custom curve colours.
  scale_color_manual(values = c("#1A73E8", "#00BFC4")) +
  # Axis and legend titles.
  labs(x = "1 - Specificity", y = "Sensitivity", color = "Dataset") +
  # Theme and styling.
  theme_bw(base_size = 11) +
  theme(
    text = element_text(family = "Arial", face = "bold"),
    legend.position = c(0.8, 0.2), # legend in the lower-right corner
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", linewidth = 0.6)
  )

# Display the figure under the same conditions as the original bare
# expression (top-level auto-printing).
p_train_id_roc

# Export the figure to an editable PowerPoint file.
eoffice::topptx(
  figure = p_train_id_roc,
  filename = "02_ALL_ML_train_ROC.pptx",
  width = 6,
  height = 4,
  units = "cm"
)