I am trying to build a stacked ensemble using H2O in R. It has five base learners - Random Forest, XGBoost, GLM, GBM and Naive Bayes. It is a classification problem with three levels. The base learners ran successfully and returned accuracy values on a test dataset.
When the base learners are passed to h2o.stackedEnsemble, the following error is returned:
Error: water.exceptions.H2OIllegalArgumentException: water.exceptions.H2OIllegalArgumentException: Don't know how to determine the distribution for a multinomial classifier.
Below is the code snippet for the stacked ensemble section:
# Stacked ensemble on top of the five base learners, using "drf" as the
# metalearner algorithm.
ensemble <- h2o.stackedEnsemble(
  # Names of indep vars: every column except the response and the weights
  x = setdiff(colnames(trainPCA), c(depVarsMulti, "weightage")),
  # dep var
  y = depVarsMulti,
  training_frame = trainPCA,
  model_id = "123",
  base_models = c(
    ModelOneRF@model_id,
    ModelTwoXGBoost@model_id,
    ModelThreeGLM@model_id,
    ModelFourGBM@model_id,
    ModelFiveBayes@model_id
  ),
  metalearner_algorithm = "drf",
  metalearner_nfolds = nfolds
)
Additional details:
I am able to build stacked ensemble models similar to the grid example given here
H2O Version: "3.21.0.4359" | R Version: "3.4.1 (2017-06-30)"
The H2O cluster is a local one
EDIT(Aug 3, 2018):
As suggested by Darren, I am adding a script that reproduces the problem using an open dataset Cars93 (from package CARS)
#######################################################################
# Minimum reproducible example for Stackoverflow
#######################################################################
# R version: 3.4.4 (2018-03-15)
# H2O cluster version: 3.21.0.4376
#OS: Linux (Azure Data Science VM)
# Installing and loading necessary libraries
cat("\n Installing and loading necessary libraries \n")
libsNeeded <- c(
  "dplyr", "data.table", "randomForest", "stringr", "doParallel", "parallel",
  "doSNOW", "rlang", "nlme", "MASS", "survival", "stringi", "dummies",
  "missRanger", "cluster", "e1071", "xgboost", "ranger", "caret"
)

# Work out once which packages are not installed yet and install only those.
libsMissing <- setdiff(libsNeeded, rownames(installed.packages()))
if (length(libsMissing) > 0) {
  install.packages(libsMissing)
}

# library() stops with an error when a package cannot be attached, whereas
# require() only returns FALSE and lets the script fail much later.
# invisible() keeps the list returned by lapply() out of the console.
invisible(lapply(libsNeeded, library, character.only = TRUE))
# Installing latest H2O if not done already:
# install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")))
library(h2o)

# Starting an H2O cluster
h2o.init(max_mem_size = "23g")

# NOTE(review): Cars93 normally ships with MASS (already in libsNeeded) --
# confirm that a separate CARS package is really needed here.
library(CARS)
dataFrame <- Cars93

# Removing rows where Passengers = 8 or 7 or 2: their occurrence frequency is
# low and, for demonstration purposes, errors caused by rare classes should be
# avoided.
dataFrame <- dataFrame[!is.element(dataFrame$Passengers, c("2", "7", "8")), ]

# Making the dependent variable a factor
dataFrame[["Passengers"]] <- as.factor(dataFrame[["Passengers"]])
# Variables used in modeling: the response, the numeric predictors and the
# factor predictors.
depVars <- "Passengers"
indepNumVars <- c("Price", "MPG.highway", "EngineSize", "Horsepower")
indepFactVars <- c("AirBags", "Type")

# Keeping only columns of interest
dataFrame <- dataFrame[, c(indepFactVars, indepNumVars, depVars)]

# Converting the independent factor variables into dummy variables
dataFrame <- dummy.data.frame(
  dataFrame,
  names = names(dataFrame[indepFactVars]),
  sep = "_"
)

# Factor levels may contain spaces; replace them in the generated column names
colnames(dataFrame) <- chartr(" ", "_", colnames(dataFrame))
# Creating the train and test datasets
# createDataPartition() draws from R's random number generator, so the RNG is
# seeded first: without it every run of this reproducible example produces a
# different 75/25 split, even though all models below use seed = 1.
set.seed(1)
trainIndex <- createDataPartition(dataFrame[, depVars], times = 1, p = 0.75)
trainingData <- dataFrame[trainIndex$Resample1, ]
testingData <- dataFrame[-trainIndex$Resample1, ]

# H2O Frames
train <- as.h2o(trainingData)
test <- as.h2o(testingData)
# Perform PCA
# The response is set aside first so that the PCA only sees the predictors.
depData <- train[, depVars]
train <- train[, setdiff(names(train), c(depVars))]

pca_model <- h2o.prcomp(
  training_frame = train,
  model_id = NULL,
  k = 10,
  transform = "STANDARDIZE",
  pca_method = "GramSVD",
  use_all_factor_levels = FALSE,
  ignore_const_cols = TRUE,
  impute_missing = TRUE,
  compute_metrics = TRUE,
  score_each_iteration = TRUE,
  max_iterations = 5000,
  max_runtime_secs = 0,
  seed = -1
)

# "Cumulative Proportion" row of the model summary, one value per component
cum_prop <- pca_model@model$model_summary["Cumulative Proportion", ]
# print(cum_prop)

# Count the components that stay below 95% and add the one that crosses it
cum_prop_to_consider <- length(cum_prop[cum_prop < 0.95]) + 1
cat(
  "\n\n Number of principal components that explain 95% variance = ",
  cum_prop_to_consider,
  "\n\n"
)
# Project the training predictors onto the principal components
trainPCA <- h2o.predict(pca_model, train)

# Keep the selected number of components; when that number is larger than the
# number of available columns, fall back to one component fewer.
if (cum_prop_to_consider <= ncol(trainPCA)) {
  trainPCA <- trainPCA[, 1:cum_prop_to_consider]
} else {
  trainPCA <- trainPCA[, 1:(cum_prop_to_consider - 1)]
}
# pca_data <- as.data.table(pca_data)

# Re-attach the response that was set aside before the PCA
trainPCA[, depVars] <- depData[, depVars]
# Preparing the test data: same projection and component selection as for the
# training data.
testPCA <- h2o.predict(pca_model, test)

if (cum_prop_to_consider <= ncol(testPCA)) {
  testPCA <- testPCA[, 1:cum_prop_to_consider]
} else {
  testPCA <- testPCA[, 1:(cum_prop_to_consider - 1)]
}

# Attach the response column taken from the untransformed test frame
testPCA[, depVars] <- test[, depVars]
# For classification, the response should be a factor
trainPCA[, depVars] <- as.factor(trainPCA[, depVars])
testPCA[, depVars] <- as.factor(test[, depVars])

# Weights of the training data: rows whose response is "4" or "6" get weight 2,
# every other row (including "5") gets weight 1.
trainPCA$weightage <- ifelse(
  trainPCA[, depVars] == "5",
  1,
  ifelse(
    trainPCA[, depVars] == "4",
    2,
    ifelse(trainPCA[, depVars] == "6", 2, 1)
  )
)

# Number of CV folds (to generate level-one data for stacking)
nfolds <- 5
####################################################################################################
# Stacked Ensemble modeling
####################################################################################################
# Today's date with underscores (YYYY_MM_DD); used as a suffix in model ids
modelIteration <- format(Sys.Date(), "%Y_%m_%d")
# Prefix shared by all model ids
i <- "withInsp"
# Train & cross-validate a random forest
# NOTE(review): "weightage" stays in x here (the Naive Bayes model excludes it)
# while also being passed as weights_column -- confirm H2O does not use it as a
# predictor.
ModelOneRF <- h2o.randomForest(
  x = setdiff(colnames(trainPCA), depVars),
  y = depVars,
  training_frame = trainPCA,
  model_id = paste0(i, "_ModelOneRF_", modelIteration),
  weights_column = "weightage",
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  ntrees = 15,
  max_depth = 30,
  min_rows = 1,
  mtries = 3,
  # verbose = T,
  seed = 1
)

# Entry [1, 1] of the cross-validation summary is printed as the mean accuracy
cat(
  "\n\n Mean accuracy of Random Forest Model (on cross validation):",
  ModelOneRF@model$cross_validation_metrics_summary[1, 1],
  "\n\n"
)

# The test-set figure is computed as 1 - mean per-class error
perf_RF <- h2o.performance(model = ModelOneRF, newdata = testPCA)
cat(
  "\n\n Accuracy of Random Forest Model (on test data):",
  1 - perf_RF@metrics$mean_per_class_error,
  "\n\n"
)
# Train & cross-validate an XGBoost model
# NOTE(review): learn_rate and eta are both set, to different values; H2O
# documents eta as an alias of learn_rate -- confirm which one takes effect.
ModelTwoXGBoost <- h2o.xgboost(
  x = setdiff(colnames(trainPCA), depVars),
  y = depVars,
  training_frame = trainPCA,
  model_id = paste0(i, "_ModelTwoXGBoost_", modelIteration),
  weights_column = "weightage",
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  ntrees = 15,
  max_depth = 20,
  min_rows = 1,
  learn_rate = 0.1,
  eta = 0.3,
  # verbose = T,
  seed = 1
)

# Entry [1, 1] of the cross-validation summary is printed as the mean accuracy
cat(
  "\n\n Mean accuracy of XGBoost Model (on cross validation):",
  ModelTwoXGBoost@model$cross_validation_metrics_summary[1, 1],
  "\n\n"
)

# The test-set figure is computed as 1 - mean per-class error
perf_XGBoost <- h2o.performance(model = ModelTwoXGBoost, newdata = testPCA)
cat(
  "\n\n Accuracy of XGBoost Model (on test data):",
  1 - perf_XGBoost@metrics$mean_per_class_error,
  "\n\n"
)
# Train and cross-validate a Generalized Linear Model (GLM)
ModelThreeGLM <- h2o.glm(
  x = setdiff(colnames(trainPCA), depVars),
  y = depVars,
  training_frame = trainPCA,
  model_id = paste0(i, "_ModelThreeGLM_", modelIteration),
  family = "multinomial",
  weights_column = "weightage",
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  alpha = 0.0,
  lambda_search = TRUE,
  standardize = TRUE,
  # verbose = T,
  seed = 1
)

# Entry [1, 1] of the cross-validation summary is printed as the mean accuracy
cat(
  "\n\n Mean accuracy of GLM Model (on cross validation):",
  ModelThreeGLM@model$cross_validation_metrics_summary[1, 1],
  "\n\n"
)

# The test-set figure is computed as 1 - mean per-class error
perf_GLM <- h2o.performance(model = ModelThreeGLM, newdata = testPCA)
cat(
  "\n\n Accuracy of GLM Model (on test data):",
  1 - perf_GLM@metrics$mean_per_class_error,
  "\n\n"
)
# Train and cross-validate a Gradient Boosting Machine (GBM)
ModelFourGBM <- h2o.gbm(
  x = setdiff(colnames(trainPCA), depVars),
  y = depVars,
  training_frame = trainPCA,
  model_id = paste0(i, "_ModelFourGBM_", modelIteration),
  weights_column = "weightage",
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  ntrees = 10,
  max_depth = 20,
  learn_rate = 0.05,
  learn_rate_annealing = 0.99,
  # verbose = T,
  seed = 1
)

# Entry [1, 1] of the cross-validation summary is printed as the mean accuracy
cat(
  "\n\n Mean accuracy of GBM Model (on cross validation):",
  ModelFourGBM@model$cross_validation_metrics_summary[1, 1],
  "\n\n"
)

# The test-set figure is computed as 1 - mean per-class error
perf_GBM <- h2o.performance(model = ModelFourGBM, newdata = testPCA)
cat(
  "\n\n Accuracy of GBM Model (on test data):",
  1 - perf_GBM@metrics$mean_per_class_error,
  "\n\n"
)
# Train and cross-validate a Naive Bayes model
# Unlike the other base learners, this one is trained without a weights column,
# so "weightage" is removed from the predictors explicitly.
ModelFiveBayes <- h2o.naiveBayes(
  x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
  y = depVars,
  training_frame = trainPCA,
  model_id = paste0(i, "_ModelFiveBayes_", modelIteration),
  # weights_column = "weightage",
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  # verbose = T,
  seed = 1
)

# Entry [1, 1] of the cross-validation summary is printed as the mean accuracy
cat(
  "\n\n Mean accuracy of Naive Bayes Model (on cross validation):",
  ModelFiveBayes@model$cross_validation_metrics_summary[1, 1],
  "\n\n"
)

# The test-set figure is computed as 1 - mean per-class error
perf_Bayes <- h2o.performance(model = ModelFiveBayes, newdata = testPCA)
cat(
  "\n\n Accuracy of Naive Bayes Model (on test data):",
  1 - perf_Bayes@metrics$mean_per_class_error,
  "\n\n"
)
# Train a stacked ensemble using the five base models above
# (RF, XGBoost, GLM, GBM and Naive Bayes), with "drf" as the metalearner.
ensemble <- h2o.stackedEnsemble(
  x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
  y = depVars,
  training_frame = trainPCA,
  # model_id = paste0(i,"_ModelEnsemble_",modelIteration),
  model_id = paste0(i, "_ModelEnsemble_2_", modelIteration),
  base_models = c(
    ModelOneRF@model_id,
    ModelTwoXGBoost@model_id,
    ModelThreeGLM@model_id,
    ModelFourGBM@model_id,
    ModelFiveBayes@model_id
  ),
  metalearner_algorithm = "drf",
  metalearner_nfolds = nfolds
)
`metalearner_algorithm` of "drf" then. By the way, `metalearner_nfolds` is not mentioned in the docs. But you might want to try experimenting with `metalearner_params`: docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/… – Darren Cook