Adding New Algorithm Models to Tidy Models parsnip

  The earlier post "Implementing a Melbourne Housing Price Regression Model with Tidy Models" listed the algorithms supported by the Tidy Models parsnip package. Of the algorithms used in the Python version of the Melbourne housing model, Catboost and LightGBM are missing from that list. Definitions for new algorithms can be registered through parsnip's interface functions, bringing them into the Tidy Models tool suite and making fitting and parameter tuning more convenient; see "How to build a parsnip model" for the details. For LightGBM and Catboost, open-source R packages already provide this support and can be used directly.
  Mainstream frameworks like this are generally quite open and have a mature mechanism for registering new algorithms.
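
  As a rough illustration of that mechanism (a minimal sketch following "How to build a parsnip model"; the engine name "mygbm" and the package/function names are hypothetical), registering a new engine for an existing parsnip model boils down to a handful of calls:

library(parsnip)

# Register a hypothetical "mygbm" engine for boost_tree(); bonsai/treesnip
# do essentially this for lightgbm/catboost.
set_model_engine("boost_tree", mode = "regression", eng = "mygbm")
set_dependency("boost_tree", eng = "mygbm", pkg = "mygbmpkg")

# Map a parsnip argument name onto the engine's native argument name.
set_model_arg(
  model = "boost_tree", eng = "mygbm",
  parsnip = "trees", original = "num_rounds",
  func = list(pkg = "dials", fun = "trees"),
  has_submodel = FALSE
)

# Tell parsnip how the engine encodes predictors, fits, and predicts.
set_encoding(
  model = "boost_tree", eng = "mygbm", mode = "regression",
  options = list(predictor_indicators = "none", compute_intercept = FALSE,
                 remove_intercept = FALSE, allow_sparse_x = FALSE)
)
set_fit(
  model = "boost_tree", eng = "mygbm", mode = "regression",
  value = list(
    interface = "data.frame",
    protect = c("x", "y"),
    func = c(pkg = "mygbmpkg", fun = "mygbm_train"),
    defaults = list()
  )
)
set_pred(
  model = "boost_tree", eng = "mygbm", mode = "regression", type = "numeric",
  value = list(
    pre = NULL, post = NULL,
    func = c(fun = "predict"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)
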
I. LightGBM
  The lightgbm package needs to be installed:

>install.packages("lightgbm", repos = "https://cran.r-project.org")

  along with the bonsai (or treesnip) package, which provides the parsnip interface for it:

>install.packages("bonsai")

  Once bonsai is loaded, usage is the same as for the other parsnip models. Refer to the treesnip package for the tunable parameters; bonsai is the official successor to treesnip since RStudio took the project over. Note that neither supports Catboost: treesnip removed its Catboost support on the grounds that Catboost provides no CRAN package, though a fork of treesnip restores it; see the treesnip home page for details.
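
  As a quick sanity check (a minimal sketch; only bonsai and the parsnip it loads are assumed), translate() shows the lightgbm call that parsnip will generate for boost_tree():

library(bonsai)
# Inspect the translated lightgbm call.
boost_tree(trees = 500, learn_rate = 0.05) %>%
  set_engine("lightgbm") %>%
  set_mode("regression") %>%
  translate()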

# -----------------------------------------------------------------------------------------
library(tidymodels)
library(kableExtra)
library(tidyr)
# parsnip interface support for LightGBM
library(bonsai)
# All operating systems: register parallel processing
library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)
# Prefer the tidymodels versions of functions with name clashes.
tidymodels_prefer()

# Outlier threshold: 30%
threshold<- 30

# ----------------------------------------------------------------------------------------
# Load the preprocessed data
melbourne<- read.csv("D:/temp/data/Melbourne_housing/Melbourne_housing_pre.csv")
# Filter out missing values, which otherwise raise:
# Error: Missing data in columns: BuildingArea.
# 47 obs.
missing <- filter(melbourne, BuildingArea==0)
melbourne <- filter(melbourne, BuildingArea!=0)

# Split into training and test sets
set.seed(2023)
melbourne_split <- initial_split(melbourne, prop = 0.80)
melbourne_train <- training(melbourne_split)
melbourne_test  <-  testing(melbourne_split)

# ---------------------------------------------------------------------------------------
# LightGBM model

# Define the recipe: regression formula and preprocessing
# 'Year','YearBuilt','Distance','Lattitude','Longtitude','Propertycount',
# 'Landsize','BuildingArea', 'Rooms','Bathroom', 'Car','Type_h','Type_t','Type_u'
melbourne_rec<-
  recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
         + Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
  #step_log(BuildingArea, base = 10) %>%
  # Standardize the numeric predictors
  step_normalize(all_numeric_predictors())

# Define the model: LightGBM
lgbm_model <-
  boost_tree() %>%
  set_engine('lightgbm') %>%
  set_mode('regression')

# Define the workflow
lgbm_wflow <- 
  workflow() %>% 
  add_model(lgbm_model) %>% 
  add_recipe(melbourne_rec)

# Train the model
t1<-proc.time()
lgbm_fit <- fit(lgbm_wflow, melbourne_train)
t2<-proc.time()
# 0.35 0.05 0.86 NA NA
cat(t2-t1)

# Predict on the test set
# Predictions
melbourne_test_res <- predict(lgbm_fit, new_data = melbourne_test %>% select(-LogPrice))
# Bind the true values
melbourne_test_res <- bind_cols(melbourne_test_res, melbourne_test %>% select(LogPrice))
# Browse
melbourne_test_res

# Plot the fit; this plot reads well.
# Among the outliers, predictions run high in the low-price range and low in the high-price range.
data <- arrange(melbourne_test_res, LogPrice)
data["index"]<-1:nrow(data)
data["upper"]<-data["LogPrice"]+log(1+threshold/100)
data["lower"]<-data["LogPrice"]+log(1-threshold/100)

ggplot(data = data)+
  geom_point(mapping = aes(x = index, y = .pred, color = "blue"), size=0.1)+
  geom_line(mapping = aes(x = index, y = LogPrice, color = "red"))+
  geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
  ggtitle("测试集拟合效果图")+
  xlab("样本")+
  ylab("ln(房价)")+
  scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
  scale_colour_manual(name = '颜色', 
                      values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))

# Compute performance metrics
# rmse(melbourne_test_res, truth = LogPrice, estimate = .pred)
# 0.828, 0.228, 0.165
melbourne_metrics <- metric_set(rsq, rmse,  mae)
melbourne_metrics(melbourne_test_res, truth = LogPrice, estimate = .pred)

# ---------------------------------------------------------------------------------------------
# 5-fold cross-validation

set.seed(1001)
melbourne_folds <- vfold_cv(melbourne_train, v = 5)

# Extract the fitted parsnip model from each resample's workflow.
get_model <- function(x) {
  extract_fit_parsnip(x)
}

# Train in parallel
t1<-proc.time()
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE, extract = get_model)
set.seed(1003)
lgbm_res <- 
  lgbm_wflow %>% 
  fit_resamples(resamples = melbourne_folds, control = keep_pred)
t2<-proc.time()
# 0.28 0.05 6.71 NA NA
cat(t2-t1)

lgbm_res

# View the performance metrics
# 0.855, 0.214
collect_metrics(lgbm_res)

# View the predictions
assess_res <- collect_predictions(lgbm_res)
assess_res

# Plot the cross-validation predictions
data <- arrange(assess_res, LogPrice)
data["index"]<-1:nrow(data)
data["upper"]<-data["LogPrice"]+log(1+threshold/100)
data["lower"]<-data["LogPrice"]+log(1-threshold/100)

ggplot(data = data)+
  geom_point(mapping = aes(x = index, y = .pred, color = "blue"), size=0.1)+
  geom_line(mapping = aes(x = index, y = LogPrice, color = "red"))+
  geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
  ggtitle("测试集拟合效果图")+
  xlab("样本")+
  ylab("ln(房价)")+
  scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
  scale_colour_manual(name = '颜色', 
                      values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))

# -----------------------------------------------------------------------------------------------
# Grid search

# Define the recipe: regression formula and preprocessing
melbourne_rec<-
  recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
         + Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
  # Standardize the numeric predictors
  step_normalize(all_numeric_predictors())

# Define the model: LightGBM, with the parameters to tune
lgbm_spec <-
  boost_tree(tree_depth = tune(), trees = tune(), learn_rate = tune(), min_n = tune(),
             loss_reduction = tune(), sample_size = tune(), mtry=tune()) %>%
  set_engine('lightgbm') %>%
  set_mode('regression')

# Define the workflow
lgbm_wflow <- 
  workflow() %>% 
  add_model(lgbm_spec) %>% 
  add_recipe(melbourne_rec)

# The bounds of mtry are not fully determined up front; finalize() pins them down.
lgbm_param <- lgbm_wflow %>%
  extract_parameter_set_dials() %>%
  update(learn_rate = threshold(c(0.01,0.5))) %>%
  update(trees = trees(c(500,1000))) %>%
  update(tree_depth = tree_depth(c(5,15))) %>%
  update(mtry = mtry(c(3,6))) %>%
  update(sample_size = threshold(c(0.5,1))) %>%
  finalize(melbourne_train)

# Inspect the parameter bounds; all are now determined
lgbm_param %>% extract_parameter_dials("trees")
lgbm_param %>% extract_parameter_dials("min_n")
lgbm_param %>% extract_parameter_dials("tree_depth")
lgbm_param %>% extract_parameter_dials("learn_rate")
lgbm_param %>% extract_parameter_dials("loss_reduction")
lgbm_param %>% extract_parameter_dials("sample_size")
lgbm_param %>% extract_parameter_dials("mtry")

# Run the grid search, two candidate values per parameter.
set.seed(2023)
t1<-proc.time()
lgbm_res_grid <- 
  lgbm_wflow %>% 
  tune_grid(
    resamples = melbourne_folds,
    grid = lgbm_param %>% grid_regular(levels = 2),  # build a grid over the parameter space: 2^7 combinations
    metrics = metric_set(rsq, rmse, mae)
  )
t2<-proc.time()
# 0.36 0.03 338.03 NA NA
cat(t2-t1)

# Plot the grid-search results
# autoplot(lgbm_res_grid)

# Show the models with the highest R-squared
show_best(lgbm_res_grid, metric="rsq")

# Select the best model by R-squared
select_best(lgbm_res_grid, metric="rsq")
# # A tibble: 1 x 8
# mtry trees min_n tree_depth learn_rate loss_reduction sample_size .config               
# <int> <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>                 
#   6  1000     2         15       0.01   0.0000000001         0.5 Preprocessor1_Model082
# Build the best parameter set
# Read the best tuning result directly
lgbm_param_best<- select_best(lgbm_res_grid, metric="rsq")
# Back-fill the best parameters into the workflow
lgbm_wflow_grid <-
  lgbm_wflow %>%
  finalize_workflow(lgbm_param_best)
lgbm_wflow_grid

# Train on the full training set with the best parameters
t1<-proc.time()
lgbm_fit_grid<- lgbm_wflow_grid %>% fit(melbourne_train)
t2<-proc.time()
# 4.84 8.54 6.22 NA NA
cat(t2-t1)

# Predictions
melbourne_test_grid <- predict(lgbm_fit_grid, new_data = melbourne_test %>% select(-LogPrice))
# Bind the true values
melbourne_test_grid <- bind_cols(melbourne_test_grid, melbourne_test %>% select(LogPrice))
# Browse
melbourne_test_grid
# 0.865, 0.193, 0.137
melbourne_metrics(melbourne_test_grid, truth = LogPrice, estimate = .pred)

# ----------------------------------------------------------------------------------------------------
# Bayesian optimization
# Recipe parameters, main model parameters, and engine-specific parameters can all be tuned.

# Define the recipe: regression formula and preprocessing
melbourne_rec<-
  recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
         + Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
  # Standardize the numeric predictors
  step_normalize(all_numeric_predictors())

# Define the model: LightGBM, with the parameters to tune
lgbm_spec <-
  boost_tree(tree_depth = tune(), trees = tune(), learn_rate = tune(), min_n = tune(),
             loss_reduction = tune(), sample_size = tune(), mtry=tune()) %>%
  set_engine('lightgbm') %>%
  set_mode('regression')

# Define the workflow
lgbm_wflow <- 
  workflow() %>% 
  add_model(lgbm_spec) %>% 
  add_recipe(melbourne_rec)

# The bounds of mtry are not fully determined up front; finalize() pins them down.
lgbm_param <- lgbm_wflow %>%
  extract_parameter_set_dials() %>%
  update(learn_rate = threshold(c(0.01,0.5))) %>%
  update(trees = trees(c(500,1000))) %>%
  update(tree_depth = tree_depth(c(5,15))) %>%
  update(mtry = mtry(c(3,6))) %>%
  update(sample_size = threshold(c(0.5,1))) %>%
  finalize(melbourne_train)

# Inspect the parameter bounds; all are now determined
lgbm_param %>% extract_parameter_dials("trees")
lgbm_param %>% extract_parameter_dials("min_n")
lgbm_param %>% extract_parameter_dials("tree_depth")
lgbm_param %>% extract_parameter_dials("learn_rate")
lgbm_param %>% extract_parameter_dials("loss_reduction")
lgbm_param %>% extract_parameter_dials("sample_size")
lgbm_param %>% extract_parameter_dials("mtry")

# Build the folds from the training set (using the full data here would leak the test set).
melbourne_folds <- vfold_cv(melbourne_train, v = 5)

# Run the Bayesian optimization
ctrl <- control_bayes(verbose = TRUE, no_improve = Inf)
# set.seed(2023)
t1<-proc.time()
lgbm_res_bo <-
  lgbm_wflow %>%
  tune_bayes(
    resamples = melbourne_folds,
    metrics = metric_set(rsq, rmse, mae),
    initial = 10,
    param_info = lgbm_param,
    iter = 100,
    control = ctrl
  )
t2<-proc.time()
# 4760.83 2.64 5503.5 NA NA
cat(t2-t1)

# Plot the Bayesian optimization progress
autoplot(lgbm_res_bo, type = "performance", metric="rsq")

# Show the models with the highest R-squared
show_best(lgbm_res_bo, metric="rsq")
# # A tibble: 1 x 8
# mtry trees min_n tree_depth learn_rate loss_reduction sample_size .config
# <int> <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
#  6   908     9         14     0.0337       2.80e-10       0.875 Iter53 

# Select the best model by R-squared
select_best(lgbm_res_bo, metric="rsq")
# Read the best tuning result directly
lgbm_param_best<- select_best(lgbm_res_bo, metric="rsq")

# Back-fill the best parameters into the workflow
lgbm_wflow_bo <-
  lgbm_wflow %>%
  finalize_workflow(lgbm_param_best)
lgbm_wflow_bo

# Train on the full training set with the best parameters
t1<-proc.time()
lgbm_fit_bo<- lgbm_wflow_bo %>% fit(melbourne_train)
t2<-proc.time()
# 3.85 6.98 5.13 NA NA
cat(t2-t1)

# Test set
# Predictions
melbourne_test_bo <- predict(lgbm_fit_bo, new_data = melbourne_test %>% select(-LogPrice))
# Bind the true values
melbourne_test_bo <- bind_cols(melbourne_test_bo, melbourne_test %>% select(LogPrice))
# Browse
melbourne_test_bo
# 0.870, 0.189, 0.136
melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)

# Training set
# Predictions
melbourne_train_bo <- predict(lgbm_fit_bo, new_data = melbourne_train %>% select(-LogPrice))
# Bind the true values
melbourne_train_bo <- bind_cols(melbourne_train_bo, melbourne_train %>% select(LogPrice))
# Browse
melbourne_train_bo
# 0.960, 0.108, 0.0813
melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)

  On R, LightGBM seems to perform slightly better than XGBoost.

> show_best(lgbm_res_bo, metric="rsq")
# A tibble: 5 x 14
   mtry trees min_n tree_depth learn_rate loss_reduction sample_~1 .metric .esti~2  mean     n std_err .config .iter
  <int> <int> <int>      <int>      <dbl>          <dbl>     <dbl> <chr>   <chr>   <dbl> <int>   <dbl> <chr>   <int>
1     6   908     9         14     0.0337       2.80e-10     0.875 rsq     standa~ 0.887     5 0.00255 Iter53     53
2     6   753    29         14     0.0377       1.61e- 9     0.943 rsq     standa~ 0.887     5 0.00253 Iter93     93
3     5   702    39         14     0.0385       2.47e-10     0.930 rsq     standa~ 0.887     5 0.00220 Iter41     41
4     6   725     7         10     0.0324       1.27e- 2     0.665 rsq     standa~ 0.887     5 0.00257 Iter28     28
5     6   803     8          6     0.0316       6.50e- 3     0.755 rsq     standa~ 0.887     5 0.00260 Iter31     31
# ... with abbreviated variable names 1: sample_size, 2: .estimator
> melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)
# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.870
2 rmse    standard       0.189
3 mae     standard       0.136
> melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)
# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.960 
2 rmse    standard      0.108 
3 mae     standard      0.0813
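
  For reference, here is a minimal sketch of how such a comparison could be run (it assumes the xgboost package is installed, and reuses lgbm_wflow, melbourne_train, melbourne_test, and melbourne_metrics from the script above):

# Swap the workflow's model for an xgboost spec and compare test-set metrics
# against the LightGBM results above.
xgb_model <-
  boost_tree() %>%
  set_engine('xgboost') %>%
  set_mode('regression')

xgb_fit <- fit(lgbm_wflow %>% update_model(xgb_model), melbourne_train)

melbourne_test_xgb <- predict(xgb_fit, new_data = melbourne_test %>% select(-LogPrice)) %>%
  bind_cols(melbourne_test %>% select(LogPrice))
melbourne_metrics(melbourne_test_xgb, truth = LogPrice, estimate = .pred)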

II. Catboost
  Catboost here is the latest v1.1.1; for installation see the document "Install the released version".
1. Installation test
  See the document "Usage examples". The one change needed concerns the handling of categorical features: convert them to factor yourself first, after which the cat_features argument of catboost.load_pool() is no longer used; Catboost handles them automatically based on the input data frame. The documentation has not been updated, 8-( , but with that change the basic tests pass.

>devtools::install_url('https://github.com/catboost/catboost/releases/download/v1.1.1/catboost-R-Windows-1.1.1.tgz', INSTALL_opts = c("--no-multiarch", "--no-test-load"))
library(catboost)

# ---------------------------------------------------------------------------------------
# Load a dataset with numerical features, define the training parameters and start the training:
dataset = matrix(c(1900,7,
                   1896,1,
                   1896,41),
                 nrow=3, 
                 ncol=2, 
                 byrow = TRUE)
label_values = c(0,1,1)

fit_params <- list(iterations = 100,
                   loss_function = 'Logloss')

pool = catboost.load_pool(dataset, label = label_values)

model <- catboost.train(pool, params = fit_params)

head(pool, n = 2)

prediction <- catboost.predict(model, pool, prediction_type = 'Class')

# -------------------------------------------------------------------------------------
# Load a dataset with numerical features, define the training parameters and start the training on GPU:
dataset = matrix(c(1900,7,
                   1896,1,
                   1896,41),
                 nrow=3, 
                 ncol=2, 
                 byrow = TRUE)
label_values = c(0,1,1)

fit_params <- list(iterations = 100,
                   loss_function = 'Logloss',
                   task_type = 'GPU')

pool = catboost.load_pool(dataset, label = label_values)

model <- catboost.train(pool, params = fit_params)

head(pool, n = 2)

prediction <- catboost.predict(model, pool, prediction_type = 'Class')

# --------------------------------------------------------------------------------------
# Load a dataset with numerical and categorical features, define the training parameters and start the training:
countries = c('RUS','USA','SUI')
years = c(1900,1896,1896)
phone_codes = c(7,1,41)
domains = c('ru','us','ch')

dataset = data.frame(countries, years, phone_codes, domains)
dataset$countries<-as.factor(dataset$countries)
dataset$domains<-as.factor(dataset$domains)

label_values = c(0,1,1)

fit_params <- list(iterations = 100,
                   loss_function = 'Logloss',
                   ignored_features = c(4,9),
                   border_count = 32,
                   depth = 5,
                   learning_rate = 0.03,
                   l2_leaf_reg = 3.5)

# pool = catboost.load_pool(dataset, label = label_values, cat_features = c(0,3))
pool = catboost.load_pool(dataset, label = label_values)

model <- catboost.train(pool, params = fit_params)

prediction <- catboost.predict(model, pool, prediction_type = 'Class')

# -----------------------------------------------------------------------------------
# Load a dataset with numerical and categorical features, define the training parameters and start the training on GPU:

countries = c('RUS','USA','SUI')
years = c(1900,1896,1896)
phone_codes = c(7,1,41)
domains = c('ru','us','ch')

dataset = data.frame(countries, years, phone_codes, domains)
dataset$countries<-as.factor(dataset$countries)
dataset$domains<-as.factor(dataset$domains)

label_values = c(0,1,1)

fit_params <- list(iterations = 100,
                   loss_function = 'Logloss',
                   ignored_features = c(4,9),
                   border_count = 32,
                   depth = 5,
                   learning_rate = 0.03,
                   l2_leaf_reg = 3.5,
                   task_type = 'GPU')

pool = catboost.load_pool(dataset, label = label_values)

model <- catboost.train(pool, params = fit_params)

prediction <- catboost.predict(model, pool, prediction_type = 'Class')

2. Installing and testing treesnip
  As noted above, bonsai does not support Catboost, and treesnip removed its Catboost support on the grounds that Catboost provides no CRAN package. A fork of treesnip restores Catboost support, but it has a problem when used for classification: Tidy Models requires the classification target to be a factor, while the R version of Catboost does not yet accept factor targets; see the related posts. So a customized treesnip is needed; its developer is Mikhail Rudakov of Innopolis University (Russia).

>remotes::install_github("Glemhel/treesnip", INSTALL_opts = c("--no-multiarch"))

  The tests pass.

library(tidymodels)
library(kableExtra)
library(tidyr)
# remotes::install_github("Glemhel/treesnip", INSTALL_opts = c("--no-multiarch"))
# parsnip interface support for catboost
library(treesnip)
# Prefer the tidymodels versions of functions with name clashes.
tidymodels_prefer()
# After loading treesnip, Catboost support has been registered.
show_model_info("boost_tree")

countries = c('RUS','USA','SUI')
years = c(1900,1896,1896)
phone_codes = c(7,1,41)
domains = c('ru','us','ch')
label_values = c(0,1,1)

dataset = data.frame(label_values, countries, years, phone_codes, domains)
dataset$countries<-as.factor(dataset$countries)
dataset$domains<-as.factor(dataset$domains)
dataset$label_values<-as.factor(dataset$label_values)

cat_rec <- recipe(label_values ~ countries+years+phone_codes+domains, data = dataset) 
cat_model <-
  boost_tree(trees = 100, tree_depth= 5, learn_rate= 0.03) %>%
  set_engine('catboost', loss_function = 'Logloss', ignored_features = c(4,9),
             border_count = 32, l2_leaf_reg = 3.5) %>%
  set_mode('classification')
# Check the final catboost call that treesnip translates to.
translate(cat_model)

cat_wflow <- 
  workflow() %>% 
  add_model(cat_model) %>% 
  add_recipe(cat_rec)

cat_fit <- fit(cat_wflow, dataset)
res <- predict(cat_fit, new_data = dataset %>% select(-label_values))

3. Testing the Melbourne housing price regression model
  Usage is the same as for the other Tidy Models algorithms. The parameter space is narrowed here, which noticeably speeds up the search. The Gaussian process used by R's Bayesian optimization may be implemented differently from the Python version; it has to score several thousand candidate combinations per iteration, so it is somewhat slower.

# -----------------------------------------------------------------------------------------
library(tidymodels)
library(kableExtra)
library(tidyr)
library(treesnip)
library(ggplot2)
library(showtext)
# Add Chinese and other fonts to R plots [showtext]
# https://blog.csdn.net/weixin_46128755/article/details/125825935
showtext_auto()

# # All operating systems: register parallel processing
# library(doParallel)
# cl <- makePSOCKcluster(parallel::detectCores())
# registerDoParallel(cl)
# Unix-like only; XGB has problems with library(doMC), use library(doParallel) for it
library(doMC)
registerDoMC(cores = parallel::detectCores())

# Prefer the tidymodels versions of functions with name clashes.
tidymodels_prefer()

# Outlier threshold: 30%
threshold<- 30

# ----------------------------------------------------------------------------------------
# Load the preprocessed data
melbourne<- read.csv("/home/jean/data/Melbourne_housing_pre.csv")
# Filter out missing values (zero-filled during preprocessing), which otherwise raise:
# Error: Missing data in columns: BuildingArea.
# 47 obs.
missing <- filter(melbourne, BuildingArea==0)
melbourne <- filter(melbourne, BuildingArea!=0)

# Split into training and test sets
set.seed(2023)
melbourne_split <- initial_split(melbourne, prop = 0.80)
melbourne_train <- training(melbourne_split)
melbourne_test  <-  testing(melbourne_split)

# ---------------------------------------------------------------------------------------
# Catboost model

# Define the recipe: regression formula and preprocessing
# 'Year','YearBuilt','Distance','Lattitude','Longtitude','Propertycount',
# 'Landsize','BuildingArea', 'Rooms','Bathroom', 'Car','Type_h','Type_t','Type_u'
melbourne_rec<-
  recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
         + Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
  #step_log(BuildingArea, base = 10) %>%
  # Standardize the numeric predictors
  step_normalize(all_numeric_predictors())

# Define the model: Catboost
cat_model<-
  boost_tree(trees = 1000, learn_rate=0.05) %>%
  set_engine("catboost", 
             loss_function = "RMSE", 
             eval_metric='RMSE'
  )  %>%
  set_mode("regression")

# Define the workflow
cat_wflow <- 
  workflow() %>% 
  add_model(cat_model) %>% 
  add_recipe(melbourne_rec)

# Train the model
t1<-proc.time()
cat_fit <- fit(cat_wflow, melbourne_train)
t2<-proc.time()
# 2.42 0.07 2.68 NA NA
cat(t2-t1)

# Predict on the test set
# Predictions
melbourne_test_res <- predict(cat_fit, new_data = melbourne_test %>% select(-LogPrice))
# Bind the true values
melbourne_test_res <- bind_cols(melbourne_test_res, melbourne_test %>% select(LogPrice))
# Browse
melbourne_test_res

# Plot the fit; this plot reads well.
# Among the outliers, predictions run high in the low-price range and low in the high-price range.
data <- arrange(melbourne_test_res, LogPrice)
data["index"]<-1:nrow(data)
data["upper"]<-data["LogPrice"]+log(1+threshold/100)
data["lower"]<-data["LogPrice"]+log(1-threshold/100)

ggplot(data = data)+
  geom_point(mapping = aes(x = index, y = .pred, color = "blue"), size=0.1)+
  geom_line(mapping = aes(x = index, y = LogPrice, color = "red"))+
  geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
  ggtitle("测试集拟合效果图")+
  xlab("样本")+
  ylab("ln(房价)")+
  scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
  scale_colour_manual(name = '颜色', 
                      values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))

# Compute performance metrics
# rmse(melbourne_test_res, truth = LogPrice, estimate = .pred)
# 0.828, 0.228, 0.165
melbourne_metrics <- metric_set(rsq, rmse,  mae)
melbourne_metrics(melbourne_test_res, truth = LogPrice, estimate = .pred)

# ---------------------------------------------------------------------------------------------
# 5-fold cross-validation

set.seed(1001)
melbourne_folds <- vfold_cv(melbourne_train, v = 5)

# Extract the fitted parsnip model from each resample's workflow.
get_model <- function(x) {
  extract_fit_parsnip(x)
}

# Train in parallel
t1<-proc.time()
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE, 
                               pkgs=c("catboost","treesnip"), extract = get_model)
set.seed(1003)
cat_res <- 
  cat_wflow %>% 
  fit_resamples(resamples = melbourne_folds, control = keep_pred)
t2<-proc.time()
# 0.19 0.01 4.94 NA NA
cat(t2-t1)

cat_res

# View the performance metrics
# 0.855, 0.214
collect_metrics(cat_res)

# View the predictions
assess_res <- collect_predictions(cat_res)
assess_res

# Plot the cross-validation predictions
data <- arrange(assess_res, LogPrice)
data["index"]<-1:nrow(data)
data["upper"]<-data["LogPrice"]+log(1+threshold/100)
data["lower"]<-data["LogPrice"]+log(1-threshold/100)

ggplot(data = data)+
  geom_point(mapping = aes(x = index, y = .pred, color = "blue"), size=0.1)+
  geom_line(mapping = aes(x = index, y = LogPrice, color = "red"))+
  geom_ribbon(aes(index, ymax=upper, ymin=lower, fill="yellow"), alpha=0.25)+
  ggtitle("测试集拟合效果图")+
  xlab("样本")+
  ylab("ln(房价)")+
  scale_fill_identity(name = '异常值区间', guide = 'legend',labels = c('正常值')) +
  scale_colour_manual(name = '颜色', 
                      values =c('blue'='blue','red'='red'), labels = c('预测值','实际值'))

# -----------------------------------------------------------------------------------------------
# Grid search

# Define the recipe: regression formula and preprocessing
melbourne_rec<-
  recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
         + Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
  # Standardize the numeric predictors
  step_normalize(all_numeric_predictors())

# Define the model: Catboost, with the parameters to tune
cat_spec <-
  boost_tree(mtry=tune(), tree_depth = tune(), trees = tune(), learn_rate = tune(), min_n = tune()) %>%
  set_engine('catboost', subsample = tune("subsample")) %>%
  set_mode('regression')

# Define the workflow
cat_wflow <- 
  workflow() %>% 
  add_model(cat_spec) %>% 
  add_recipe(melbourne_rec)

# The bounds of mtry are not fully determined up front; finalize() pins them down.
cat_param <- cat_wflow %>%
  extract_parameter_set_dials() %>%
  update(learn_rate = threshold(c(0.01,0.5))) %>%
  update(trees = trees(c(500,1000))) %>%
  update(tree_depth = tree_depth(c(5,15))) %>%
  update(mtry = mtry(c(3,6))) %>%
  update(subsample = threshold(c(0.5,1))) %>%
  finalize(melbourne_train)

# Inspect the parameter bounds; all are now determined
cat_param %>% extract_parameter_dials("trees")
cat_param %>% extract_parameter_dials("min_n")
cat_param %>% extract_parameter_dials("tree_depth")
cat_param %>% extract_parameter_dials("learn_rate")
cat_param %>% extract_parameter_dials("mtry")
cat_param %>% extract_parameter_dials("subsample")  

# Run the grid search, two candidate values per parameter.
set.seed(2023)
t1<-proc.time()
cat_res_grid <- 
  cat_wflow %>% 
  tune_grid(
    resamples = melbourne_folds,
    grid = cat_param %>% grid_regular(levels = 2),  # build a grid over the parameter space: 2^6 combinations
    metrics = metric_set(rsq, rmse, mae)
  )
t2<-proc.time()
# 0.44 0.06 249.74 NA NA
cat(t2-t1)

# Plot the grid-search results
autoplot(cat_res_grid)

# Show the models with the highest R-squared
show_best(cat_res_grid, metric="rsq")

# Select the best model by R-squared
select_best(cat_res_grid, metric="rsq")

# Build the best parameter set
# Read the best tuning result directly
cat_param_best<- select_best(cat_res_grid, metric="rsq")
# Back-fill the best parameters into the workflow
cat_wflow_grid <-
  cat_wflow %>%
  finalize_workflow(cat_param_best)
cat_wflow_grid

# Train on the full training set with the best parameters
t1<-proc.time()
cat_fit_grid<- cat_wflow_grid %>% fit(melbourne_train)
t2<-proc.time()
# 28.64 0.39 29.02 NA NA
cat(t2-t1)

# Predictions
melbourne_test_grid <- predict(cat_fit_grid, new_data = melbourne_test %>% select(-LogPrice))
# Bind the true values
melbourne_test_grid <- bind_cols(melbourne_test_grid, melbourne_test %>% select(LogPrice))
# Browse
melbourne_test_grid
# 0.853, 0.201, 0.142
melbourne_metrics(melbourne_test_grid, truth = LogPrice, estimate = .pred)

# ----------------------------------------------------------------------------------------------------
# Bayesian optimization
# Recipe parameters, main model parameters, and engine-specific parameters can all be tuned.

# Define the recipe: regression formula and preprocessing
melbourne_rec<-
  recipe(LogPrice ~ Year + YearBuilt + Distance + Lattitude + Longtitude + BuildingArea
         + Rooms + Bathroom + Car + Type_h + Type_t + Type_u, data = melbourne_train) %>%
  # Standardize the numeric predictors
  step_normalize(all_numeric_predictors())

# Define the model: Catboost, with the parameters to tune
cat_spec <-
  boost_tree(mtry=tune(), tree_depth = tune(), trees = tune(), learn_rate = tune(), min_n = tune()) %>%
  set_engine('catboost', subsample = tune("subsample")) %>%
  set_mode('regression')

# Define the workflow
cat_wflow <- 
  workflow() %>% 
  add_model(cat_spec) %>% 
  add_recipe(melbourne_rec)

# The bounds of mtry are not fully determined up front; finalize() pins them down.
cat_param <- cat_wflow %>%
  extract_parameter_set_dials() %>%
  update(learn_rate = threshold(c(0.01,0.5))) %>%
  update(trees = trees(c(500,1000))) %>%
  update(tree_depth = tree_depth(c(5,15))) %>%
  update(mtry = mtry(c(3,6))) %>%
  update(subsample = threshold(c(0.5,1))) %>%
  finalize(melbourne_train)

# Inspect the parameter bounds; all are now determined
cat_param %>% extract_parameter_dials("trees")
cat_param %>% extract_parameter_dials("min_n")
cat_param %>% extract_parameter_dials("tree_depth")
cat_param %>% extract_parameter_dials("learn_rate")
cat_param %>% extract_parameter_dials("mtry")
cat_param %>% extract_parameter_dials("subsample")  

# Build the folds from the training set (using the full data here would leak the test set).
melbourne_folds <- vfold_cv(melbourne_train, v = 5)

# Run the Bayesian optimization
ctrl <- control_bayes(verbose = TRUE, no_improve = Inf)
# set.seed(2023)
t1<-proc.time()
cat_res_bo <-
  cat_wflow %>%
  tune_bayes(
    resamples = melbourne_folds,
    metrics = metric_set(rsq, rmse, mae),
    initial = 6,
    param_info = cat_param,
    iter = 100,
    control = ctrl
  )
t2<-proc.time()
# 4514.99 12.53 10034.09 9278.199 5407.921
cat(t2-t1)

# Plot the Bayesian optimization progress
autoplot(cat_res_bo, type = "performance", metric="rsq")

# Show the models with the highest R-squared
show_best(cat_res_bo, metric="rsq")

# Select the best model by R-squared
select_best(cat_res_bo, metric="rsq")
# # A tibble: 1 × 7
# mtry trees min_n tree_depth learn_rate subsample .config
# <int> <int> <int>      <int>      <dbl>     <dbl> <chr>  
#  5   915    27          7     0.0734     0.676 Iter96 

# Read the best tuning result directly
cat_param_best<- select_best(cat_res_bo, metric="rsq")

# Back-fill the best parameters into the workflow
cat_wflow_bo <-
  cat_wflow %>%
  finalize_workflow(cat_param_best)
cat_wflow_bo

# Train on the full training set with the best parameters
t1<-proc.time()
cat_fit_bo<- cat_wflow_bo %>% fit(melbourne_train)
t2<-proc.time()
# 2.504 0.019 2.561 0 0
cat(t2-t1)

# Test set
# Predictions
melbourne_test_bo <- predict(cat_fit_bo, new_data = melbourne_test %>% select(-LogPrice))
# Bind the true values
melbourne_test_bo <- bind_cols(melbourne_test_bo, melbourne_test %>% select(LogPrice))
# Browse
melbourne_test_bo
# 0.870, 0.190, 0.135
melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)

# Training set
# Predictions
melbourne_train_bo <- predict(cat_fit_bo, new_data = melbourne_train %>% select(-LogPrice))
# Bind the true values
melbourne_train_bo <- bind_cols(melbourne_train_bo, melbourne_train %>% select(LogPrice))
# Browse
melbourne_train_bo
# 0.952, 0.117, 0.0889
melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)

  The Gaussian process behind R's Bayesian optimization scores several thousand candidate points per iteration, and that is where most of the time goes. Narrowing the parameter space noticeably improves the speed and efficiency of the search.
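
  One related knob worth noting (a small sketch; the runs above set no_improve = Inf, which forces all iter iterations to run): a finite no_improve stops the search early once it stalls.

# Stop the Bayesian search after 15 iterations without improvement,
# instead of always running the full `iter` budget.
ctrl <- control_bayes(verbose = TRUE, no_improve = 15L)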

── Iteration 95 ─────────────────────────────────────────

i Current best:     rsq=0.8909 (@iter 70)
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i mtry=5, trees=989, min_n=27, tree_depth=5, learn_rate=0.456, subsample=0.795
i Estimating performance
✓ Estimating performance
ⓧ Newest results:   rsq=0.8764 (+/-0.003)

── Iteration 96 ─────────────────────────────────────────

i Current best:     rsq=0.8909 (@iter 70)
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i mtry=5, trees=915, min_n=27, tree_depth=7, learn_rate=0.0734, subsample=0.676
i Estimating performance
✓ Estimating performance
♥ Newest results:   rsq=0.8909 (+/-0.00372)

── Iteration 97 ─────────────────────────────────────────

i Current best:     rsq=0.8909 (@iter 96)
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i mtry=5, trees=902, min_n=25, tree_depth=7, learn_rate=0.0826, subsample=0.91
i Estimating performance
✓ Estimating performance
ⓧ Newest results:   rsq=0.89 (+/-0.00401)

  The performance figures are about the same as LightGBM's; the two are evenly matched.

> show_best(cat_res_bo, metric="rsq")
# A tibble: 5 × 13
   mtry trees min_n tree_depth learn_rate subsample .metric .estimator  mean     n std_err .config .iter
  <int> <int> <int>      <int>      <dbl>     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>   <int>
1     5   915    27          7     0.0734     0.676 rsq     standard   0.891     5 0.00372 Iter96     96
2     6   976    13          7     0.0526     0.503 rsq     standard   0.891     5 0.00387 Iter70     70
3     6   962    22          7     0.0767     0.996 rsq     standard   0.891     5 0.00369 Iter43     43
4     5   986     8          7     0.0565     0.646 rsq     standard   0.891     5 0.00374 Iter76     76
5     6   925    28          7     0.0769     0.789 rsq     standard   0.891     5 0.00372 Iter63     63
> select_best(cat_res_bo, metric="rsq")
# A tibble: 1 × 7
   mtry trees min_n tree_depth learn_rate subsample .config
  <int> <int> <int>      <int>      <dbl>     <dbl> <chr>  
1     5   915    27          7     0.0734     0.676 Iter96 
> melbourne_metrics(melbourne_test_bo, truth = LogPrice, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.870
2 rmse    standard       0.190
3 mae     standard       0.135
> melbourne_metrics(melbourne_train_bo, truth = LogPrice, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.952 
2 rmse    standard      0.117 
3 mae     standard      0.0889

Catboost Bayesian optimization process

  Another open-source project, catsnip, is also derived from treesnip and provides Catboost support. It has the same problem with factor targets, but its source is available and can be modified as needed; see the related post. To use LightGBM and Catboost at the same time, bonsai must be loaded, and loading treesnip alongside it then raises an error; catsnip can be used instead, as it resolves the conflict with the bonsai package.

library(tidymodels)
# bonsai and treesnip cannot be loaded at the same time.
library(bonsai)
# library(treesnip)
# Error: The combination of engine 'lightgbm' and mode 'regression' already has
# fit data for model 'boost_tree' and the new information being registered is different.
# treesnip registers both LightGBM and Catboost, so it conflicts with bonsai.
# catsnip avoids the conflict with bonsai; for LightGBM itself, bonsai is still the better choice.
library(catsnip)
# Prefer the tidymodels versions of functions with name clashes.
tidymodels_prefer()
show_model_info("boost_tree")

  Note that the catboost argument mapping sample_prop below is wrong; it should be sample_size. A workaround sketch follows the listing.

> show_model_info("boost_tree")
Information for `boost_tree`
 modes: unknown, classification, regression, censored regression 

 engines: 
   classification: C5.0¹, catboost, lightgbm, spark, xgboost¹
   regression:     catboost, lightgbm, spark, xgboost¹

¹The model can use case weights.

 arguments: 
   xgboost:  
      tree_depth     --> max_depth
      trees          --> nrounds
      learn_rate     --> eta
      mtry           --> colsample_bynode
      min_n          --> min_child_weight
      loss_reduction --> gamma
      sample_size    --> subsample
      stop_iter      --> early_stop
   C5.0:     
      trees          --> trials
      min_n          --> minCases
      sample_size    --> sample
   spark:    
      tree_depth     --> max_depth
      trees          --> max_iter
      learn_rate     --> step_size
      mtry           --> feature_subset_strategy
      min_n          --> min_instances_per_node
      loss_reduction --> min_info_gain
      sample_size    --> subsampling_rate
   lightgbm: 
      tree_depth     --> max_depth
      trees          --> num_iterations
      learn_rate     --> learning_rate
      mtry           --> feature_fraction_bynode
      min_n          --> min_data_in_leaf
      loss_reduction --> min_gain_to_split
      sample_size    --> bagging_fraction
      stop_iter      --> early_stopping_round
   catboost: 
      tree_depth     --> depth
      trees          --> iterations
      learn_rate     --> learning_rate
      mtry           --> rsm
      min_n          --> min_data_in_leaf
      sample_prop    --> subsample

 fit modules:
     engine           mode
    xgboost     regression
    xgboost classification
       C5.0 classification
      spark     regression
      spark classification
   lightgbm     regression
   lightgbm classification
   catboost     regression
   catboost classification

 prediction modules:
             mode   engine          methods
   classification     C5.0 class, prob, raw
   classification catboost class, prob, raw
   classification lightgbm class, prob, raw
   classification    spark      class, prob
   classification  xgboost class, prob, raw
       regression catboost     numeric, raw
       regression lightgbm          numeric
       regression    spark          numeric
       regression  xgboost     numeric, raw
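
  Until that mapping is fixed, the subsampling rate can be passed (and tuned) as an engine argument instead, exactly as was done in the grid search and Bayesian optimization sections above:

# Bypass the mis-mapped main argument and hand subsample directly
# to the catboost engine.
cat_spec <-
  boost_tree(trees = 1000) %>%
  set_engine('catboost', subsample = tune("subsample")) %>%
  set_mode('regression')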

  When there are many hyperparameters to tune, Bayesian optimization is far more efficient than grid search. Tuning 7 parameters with just 2 candidate values each already gives 2^7 = 128 combinations, and the cross-validation folds must be fitted for every one of them. Bayesian optimization can usually find a good combination within about 100 iterations, which is much faster.
