I am trying to understand how to build predictive models. I recently came across the xgboost package in R and tried it out on the Titanic dataset. I built a model, and now I am wondering how to detect whether it is overfitting, how many rounds to choose, and whether that choice should be based on the train error or the test error.
This is the code:
# Load dataset ----
# Read the raw training and test CSV files from the local data directory.
titanic.train <- read.csv(file = "D:/Data/titanic/train.csv")
titanic.test <- read.csv(file = "D:/Data/titanic/test.csv")
# Keep the test-set passenger ids in a standalone vector.
PassengerId <- titanic.test[["PassengerId"]]
# Preview the first rows of the training data.
head(titanic.train)
# Flag the origin of every row so the two sets can be told apart once stacked
titanic.train[["IsTrainSet"]] <- TRUE
titanic.test[["IsTrainSet"]] <- FALSE
# Add an all-NA Survived column to the test frame so that both frames
# have the same columns when they are row-bound
titanic.test[["Survived"]] <- NA
# Stack the training rows on top of the test rows
titanic.full <- rbind(titanic.train, titanic.test)
# Preview the last rows of the combined frame
tail(titanic.full)
# Title feature ----
# Extract a title from each passenger name and collapse rare or equivalent
# titles into a small set of groups.
titanic.full$Name <- as.character(titanic.full$Name)
# Each name is split on "," and "."; the second piece is taken as the title
# (assumes names look like "Surname, Title. Given names" -- TODO confirm).
# vapply() is used instead of sapply() so the result is always an unnamed
# character vector with exactly one entry per row.
titanic.full$Title <- vapply(
  titanic.full$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)
# Strip surrounding whitespace (the piece after the comma starts with a
# space); unlike removing "the first space", this never touches the space
# inside a multi-word title such as "the Countess".
titanic.full$Title <- trimws(titanic.full$Title)
# Collapse the listed titles into a single "Noble" group.
# NOTE(review): "Master" is grouped here as well -- confirm this is intended.
noble_titles <- c(
  "Capt", "Col", "Dr", "Don", "Major", "Sir", "Rev",
  "Dona", "Lady", "the Countess", "Jonkheer", "Master"
)
titanic.full$Title[titanic.full$Title %in% noble_titles] <- "Noble"
# Merge spelling variants into "Miss" and "Mrs"
titanic.full$Title[titanic.full$Title %in% c("Ms", "Miss", "Mlle")] <- "Miss"
titanic.full$Title[titanic.full$Title %in% c("Mrs", "Mme")] <- "Mrs"
# Show how many passengers fall into each title group
table(titanic.full$Title)
# Family is SibSp + Parch + 1 (the + 1 presumably counts the passenger)
titanic.full$Family <- titanic.full[["SibSp"]] + titanic.full[["Parch"]] + 1
table(titanic.full$Family)
# Disabled alternative: recode Family as a 0/1 factor that is 1 for a
# family size of 3 or more
#titanic.full$Family <- titanic.full$Family >= 3
#titanic.full$Family <- as.factor(titanic.full$Family)
#levels(titanic.full$Family) <- c(0,1)
#titanic.full$Family
# Keep only the listed predictor columns plus the Survived outcome;
# every other column is dropped from titanic.full here
keep_cols <- c(
  "Pclass", "Title", "Sex", "Age", "Family", "Fare",
  "SibSp", "Parch", "Embarked", "Survived"
)
titanic.full <- titanic.full[keep_cols]
head(titanic.full)
# Categorical casting ----
# Convert the categorical predictors to factors so that they are expanded
# into indicator columns below.
titanic.full$Title <- as.factor(titanic.full$Title)
titanic.full$Sex <- as.factor(titanic.full$Sex)
titanic.full$Embarked <- as.factor(titanic.full$Embarked)
# Build the dummy-variable encoder over all columns and apply it to the frame.
# - dummyVars() is namespace-qualified because no library(caret) call is
#   visible in this script.
# - fullRank = TRUE is spelled out (T is an ordinary variable that can be
#   reassigned); it requests a full-rank encoding, i.e. one level of each
#   factor is left out as the reference level.
titanicDummy <- caret::dummyVars("~.", data = titanic.full, fullRank = TRUE)
titanic.full <- as.data.frame(predict(titanicDummy, newdata = titanic.full))
# Show the column names produced by the encoding
print(names(titanic.full))
# Split the encoded frame back into training and test parts by row position
# NOTE(review): the row ranges are hard-coded; this assumes the training rows
# come first in titanic.full and that the row counts match -- confirm.
train_rows <- 1:891
test_rows <- 892:1309
titanic.train <- titanic.full[train_rows, ]
titanic.test <- titanic.full[test_rows, ]
# XGBoost cross-validation ----
# Fix the RNG state before the cross-validation call below.
set.seed(35)
# Outcome vector taken from the training rows
labs <- titanic.train$Survived
names(titanic.full)
# Predictor columns used for the model (everything except Survived)
dat <- titanic.train[c(
  "Pclass", "Title.Mr", "Title.Mrs", "Title.Noble", "Sex.male", "Age",
  "Family", "Fare", "SibSp", "Parch",
  "Embarked.C", "Embarked.Q", "Embarked.S"
)]
# Pack predictors and labels into an xgb.DMatrix; NA marks missing values.
titdata <- xgboost::xgb.DMatrix(
  data = as.matrix(dat),
  missing = NA,
  label = as.numeric(labs)
)
# 10-fold cross-validation over 200 boosting rounds, reporting AUC.
# - The labels already live inside `titdata`, so no `label` argument is passed
#   here; passing one only triggers "xgboost: label will be ignored".
# - `metrics` is spelled out in full instead of relying on partial matching
#   of `metric`.
# - Booster settings are grouped in `params` rather than passed through `...`.
# NOTE(review): to pick the number of rounds automatically, consider
# `early_stopping_rounds` so that training stops once the test AUC stops
# improving.
res <- xgboost::xgb.cv(
  params = list(objective = "binary:logistic", eta = 0.1, max_depth = 3),
  data = titdata,
  nrounds = 200,
  nfold = 10,
  metrics = "auc",
  prediction = TRUE
)
This is the result. I need help interpreting it, and some advice on what I should look at when deciding whether to increase or decrease "eta" and "max_depth":
res <- xgb.cv(objective="binary:logistic" , eta=0.1, metric="auc", max_depth = 3, + data = titdata , label=as.numeric(labs) , nrounds = 200 , nfold = 10 , prediction = TRUE) [1] train-auc:0.869192+0.009120 test-auc:0.856596+0.056215 [2] train-auc:0.875578+0.007362 test-auc:0.864648+0.059435 [3] train-auc:0.877447+0.006179 test-auc:0.865721+0.059722 [4] train-auc:0.881547+0.006015 test-auc:0.867861+0.060918 [5] train-auc:0.882322+0.006134 test-auc:0.869224+0.060917 [6] train-auc:0.884335+0.005501 test-auc:0.871305+0.061070 [7] train-auc:0.885292+0.004622 test-auc:0.871798+0.060813 [8] train-auc:0.886477+0.004309 test-auc:0.872152+0.060549 [9] train-auc:0.887559+0.004344 test-auc:0.870026+0.060668 [10] train-auc:0.889544+0.004236 test-auc:0.866687+0.062255 [11] train-auc:0.890486+0.004868 test-auc:0.868404+0.060674 [12] train-auc:0.891533+0.004522 test-auc:0.866567+0.060826 [13] train-auc:0.893234+0.004674 test-auc:0.865659+0.060419 [14] train-auc:0.894624+0.004733 test-auc:0.866490+0.061235 [15] train-auc:0.895527+0.004583 test-auc:0.867123+0.060497 [16] train-auc:0.896230+0.005107 test-auc:0.866917+0.061243 [17] train-auc:0.897409+0.004722 test-auc:0.868524+0.058841 [18] train-auc:0.898608+0.005050 test-auc:0.866894+0.059343 [19] train-auc:0.899822+0.005369 test-auc:0.868109+0.055177 [20] train-auc:0.900747+0.004902 test-auc:0.869309+0.055201 [21] train-auc:0.901191+0.005089 test-auc:0.869443+0.054093 [22] train-auc:0.901958+0.005586 test-auc:0.871578+0.050996 [23] train-auc:0.902394+0.005318 test-auc:0.871743+0.050770 [24] train-auc:0.902980+0.005217 test-auc:0.872139+0.050897 [25] train-auc:0.903442+0.004909 test-auc:0.871261+0.052185 [26] train-auc:0.904110+0.004886 test-auc:0.869618+0.051097 [27] train-auc:0.905216+0.004741 test-auc:0.868398+0.051865 [28] train-auc:0.906057+0.004625 test-auc:0.868029+0.052551 [29] train-auc:0.906668+0.004490 test-auc:0.868944+0.052679 [30] train-auc:0.907469+0.004247 test-auc:0.867783+0.052927 [31] train-auc:0.908050+0.004229 
test-auc:0.866851+0.053204 [32] train-auc:0.909257+0.004118 test-auc:0.866138+0.055716 [33] train-auc:0.909878+0.004200 test-auc:0.866495+0.055579 [34] train-auc:0.910485+0.004335 test-auc:0.867096+0.055105 [35] train-auc:0.911178+0.004286 test-auc:0.866654+0.055277 [36] train-auc:0.912293+0.004265 test-auc:0.866994+0.055417 [37] train-auc:0.913109+0.003980 test-auc:0.866273+0.054600 [38] train-auc:0.913829+0.004026 test-auc:0.866423+0.055104 [39] train-auc:0.914379+0.004208 test-auc:0.866457+0.054575 [40] train-auc:0.914986+0.004381 test-auc:0.865801+0.055015 [41] train-auc:0.915569+0.004222 test-auc:0.866209+0.054269 [42] train-auc:0.916022+0.004439 test-auc:0.864052+0.057692 [43] train-auc:0.916491+0.004346 test-auc:0.864199+0.058291 [44] train-auc:0.916902+0.004508 test-auc:0.864829+0.058214 [45] train-auc:0.917351+0.004464 test-auc:0.863823+0.058295 [46] train-auc:0.917825+0.004461 test-auc:0.863831+0.058384 [47] train-auc:0.918281+0.004501 test-auc:0.863642+0.058678 [48] train-auc:0.918833+0.004514 test-auc:0.863539+0.058620 [49] train-auc:0.919144+0.004550 test-auc:0.863587+0.058575 [50] train-auc:0.919679+0.004454 test-auc:0.864577+0.057084 [51] train-auc:0.920047+0.004557 test-auc:0.864855+0.057393 [52] train-auc:0.920453+0.004770 test-auc:0.865826+0.055897 [53] train-auc:0.920900+0.004533 test-auc:0.865653+0.055794 [54] train-auc:0.921269+0.004572 test-auc:0.865483+0.055574 [55] train-auc:0.921690+0.004696 test-auc:0.866402+0.054969 [56] train-auc:0.922165+0.004787 test-auc:0.867068+0.054520 [57] train-auc:0.922492+0.004574 test-auc:0.866784+0.054629 [58] train-auc:0.922904+0.004615 test-auc:0.866632+0.054462 [59] train-auc:0.923430+0.004563 test-auc:0.866625+0.054274 [60] train-auc:0.923674+0.004492 test-auc:0.866584+0.054400 [61] train-auc:0.923847+0.004484 test-auc:0.866553+0.054280 [62] train-auc:0.924204+0.004409 test-auc:0.867392+0.053863 [63] train-auc:0.924350+0.004435 test-auc:0.867171+0.053805 [64] train-auc:0.924681+0.004547 
test-auc:0.867443+0.053727 [65] train-auc:0.925087+0.004428 test-auc:0.867295+0.053986 [66] train-auc:0.925663+0.004398 test-auc:0.866947+0.054199 [67] train-auc:0.925823+0.004476 test-auc:0.867209+0.053866 [68] train-auc:0.926276+0.004367 test-auc:0.867635+0.054284 [69] train-auc:0.926702+0.004346 test-auc:0.867142+0.054288 [70] train-auc:0.927134+0.004529 test-auc:0.867386+0.054673 [71] train-auc:0.927491+0.004571 test-auc:0.866602+0.054073 [72] train-auc:0.928026+0.004551 test-auc:0.866468+0.054223 [73] train-auc:0.928421+0.004448 test-auc:0.866837+0.054042 [74] train-auc:0.928793+0.004538 test-auc:0.866865+0.053660 [75] train-auc:0.928996+0.004604 test-auc:0.866813+0.053500 [76] train-auc:0.929360+0.004683 test-auc:0.867645+0.053280 [77] train-auc:0.929817+0.004426 test-auc:0.868134+0.054196 [78] train-auc:0.930156+0.004416 test-auc:0.867472+0.054034 [79] train-auc:0.930336+0.004371 test-auc:0.867463+0.053932 [80] train-auc:0.930774+0.004482 test-auc:0.867997+0.054005 [81] train-auc:0.931103+0.004319 test-auc:0.867835+0.054027 [82] train-auc:0.931436+0.004340 test-auc:0.867459+0.053907 [83] train-auc:0.931650+0.004284 test-auc:0.866549+0.053899 [84] train-auc:0.931984+0.004116 test-auc:0.866696+0.054119 [85] train-auc:0.932413+0.004126 test-auc:0.866812+0.053876 [86] train-auc:0.932853+0.004255 test-auc:0.866699+0.053729 [87] train-auc:0.933132+0.004264 test-auc:0.866588+0.053681 [88] train-auc:0.933688+0.004378 test-auc:0.866346+0.053490 [89] train-auc:0.934066+0.004233 test-auc:0.866561+0.053246 [90] train-auc:0.934529+0.004557 test-auc:0.866569+0.053029 [91] train-auc:0.934796+0.004587 test-auc:0.866623+0.053103 [92] train-auc:0.935168+0.004627 test-auc:0.865982+0.053365 [93] train-auc:0.935630+0.004560 test-auc:0.865622+0.053572 [94] train-auc:0.935930+0.004483 test-auc:0.865783+0.053594 [95] train-auc:0.936319+0.004339 test-auc:0.865757+0.053474 [96] train-auc:0.936615+0.004263 test-auc:0.865538+0.053517 [97] train-auc:0.936840+0.004360 
test-auc:0.865439+0.053619 [98] train-auc:0.937240+0.004283 test-auc:0.865529+0.053559 [99] train-auc:0.937612+0.004325 test-auc:0.865924+0.053515 [100] train-auc:0.937934+0.004419 test-auc:0.865869+0.053225 [101] train-auc:0.938170+0.004512 test-auc:0.865770+0.053696 [102] train-auc:0.938491+0.004540 test-auc:0.865449+0.053774 [103] train-auc:0.938744+0.004574 test-auc:0.865712+0.052924 [104] train-auc:0.938961+0.004587 test-auc:0.865698+0.052789 [105] train-auc:0.939429+0.004391 test-auc:0.866574+0.053548 [106] train-auc:0.939608+0.004285 test-auc:0.866846+0.053928 [107] train-auc:0.940049+0.004231 test-auc:0.866967+0.054383 [108] train-auc:0.940273+0.004230 test-auc:0.867415+0.054812 [109] train-auc:0.940599+0.004284 test-auc:0.866757+0.054485 [110] train-auc:0.940855+0.004281 test-auc:0.866719+0.054324 [111] train-auc:0.941248+0.004187 test-auc:0.866606+0.054411 [112] train-auc:0.941609+0.004178 test-auc:0.866776+0.054262 [113] train-auc:0.942016+0.004326 test-auc:0.867379+0.054235 [114] train-auc:0.942123+0.004331 test-auc:0.867263+0.054290 [115] train-auc:0.942362+0.004380 test-auc:0.867058+0.054484 [116] train-auc:0.942642+0.004397 test-auc:0.867212+0.053896 [117] train-auc:0.942910+0.004512 test-auc:0.866902+0.054213 [118] train-auc:0.943095+0.004524 test-auc:0.866685+0.053992 [119] train-auc:0.943411+0.004400 test-auc:0.866844+0.053564 [120] train-auc:0.943627+0.004282 test-auc:0.867069+0.054186 [121] train-auc:0.943793+0.004305 test-auc:0.867372+0.053924 [122] train-auc:0.944034+0.004300 test-auc:0.867482+0.053637 [123] train-auc:0.944404+0.004257 test-auc:0.867957+0.053311 [124] train-auc:0.944712+0.004312 test-auc:0.868072+0.053453 [125] train-auc:0.944997+0.004157 test-auc:0.869068+0.053413 [126] train-auc:0.945215+0.004118 test-auc:0.869093+0.053406 [127] train-auc:0.945473+0.004058 test-auc:0.869137+0.053200 [128] train-auc:0.945871+0.003948 test-auc:0.869462+0.053528 [129] train-auc:0.946026+0.003936 test-auc:0.869688+0.053372 [130] 
train-auc:0.946277+0.003754 test-auc:0.869576+0.053113 [131] train-auc:0.946444+0.003649 test-auc:0.869194+0.053095 [132] train-auc:0.946642+0.003725 test-auc:0.869515+0.052871 [133] train-auc:0.946820+0.003592 test-auc:0.869199+0.052938 [134] train-auc:0.947051+0.003620 test-auc:0.869311+0.052884 [135] train-auc:0.947473+0.003638 test-auc:0.870009+0.052410 [136] train-auc:0.947780+0.003629 test-auc:0.869906+0.052172 [137] train-auc:0.948068+0.003695 test-auc:0.869593+0.052137 [138] train-auc:0.948298+0.003800 test-auc:0.869807+0.051708 [139] train-auc:0.948461+0.003800 test-auc:0.869279+0.052147 [140] train-auc:0.948715+0.003760 test-auc:0.869389+0.052126 [141] train-auc:0.948981+0.003686 test-auc:0.869353+0.052317 [142] train-auc:0.949199+0.003648 test-auc:0.869630+0.052192 [143] train-auc:0.949387+0.003586 test-auc:0.869038+0.052386 [144] train-auc:0.949599+0.003620 test-auc:0.869290+0.052218 [145] train-auc:0.949904+0.003591 test-auc:0.869723+0.051909 [146] train-auc:0.950152+0.003542 test-auc:0.870094+0.052072 [147] train-auc:0.950349+0.003444 test-auc:0.869605+0.051703 [148] train-auc:0.950581+0.003447 test-auc:0.869659+0.051793 [149] train-auc:0.950802+0.003429 test-auc:0.869284+0.051968 [150] train-auc:0.951045+0.003509 test-auc:0.868943+0.052391 [151] train-auc:0.951256+0.003528 test-auc:0.869113+0.052452 [152] train-auc:0.951533+0.003455 test-auc:0.869608+0.051146 [153] train-auc:0.951776+0.003452 test-auc:0.869464+0.051480 [154] train-auc:0.952069+0.003511 test-auc:0.869697+0.050773 [155] train-auc:0.952243+0.003447 test-auc:0.869850+0.050781 [156] train-auc:0.952522+0.003466 test-auc:0.869631+0.051013 [157] train-auc:0.952750+0.003430 test-auc:0.868954+0.051264 [158] train-auc:0.952986+0.003302 test-auc:0.868899+0.050795 [159] train-auc:0.953262+0.003318 test-auc:0.869263+0.050704 [160] train-auc:0.953457+0.003253 test-auc:0.869479+0.050793 [161] train-auc:0.953652+0.003300 test-auc:0.869534+0.050539 [162] train-auc:0.954006+0.003246 
test-auc:0.869209+0.050546 [163] train-auc:0.954325+0.003080 test-auc:0.869472+0.050937 [164] train-auc:0.954566+0.003039 test-auc:0.869787+0.051283 [165] train-auc:0.954751+0.003020 test-auc:0.870019+0.051601 [166] train-auc:0.954947+0.003063 test-auc:0.869687+0.051775 [167] train-auc:0.955228+0.003122 test-auc:0.869682+0.051919 [168] train-auc:0.955369+0.003181 test-auc:0.869351+0.051832 [169] train-auc:0.955590+0.003116 test-auc:0.869254+0.051729 [170] train-auc:0.955800+0.003130 test-auc:0.868659+0.052226 [171] train-auc:0.955973+0.003165 test-auc:0.868595+0.052111 [172] train-auc:0.956239+0.003307 test-auc:0.868854+0.051982 [173] train-auc:0.956409+0.003291 test-auc:0.869171+0.051843 [174] train-auc:0.956584+0.003241 test-auc:0.869083+0.052108 [175] train-auc:0.956735+0.003290 test-auc:0.869097+0.052149 [176] train-auc:0.957047+0.003332 test-auc:0.869146+0.052037 [177] train-auc:0.957270+0.003337 test-auc:0.869041+0.052098 [178] train-auc:0.957416+0.003358 test-auc:0.869358+0.052262 [179] train-auc:0.957657+0.003385 test-auc:0.869679+0.051461 [180] train-auc:0.957852+0.003407 test-auc:0.869796+0.051852 [181] train-auc:0.957961+0.003399 test-auc:0.870012+0.052018 [182] train-auc:0.958129+0.003470 test-auc:0.869848+0.053008 [183] train-auc:0.958341+0.003455 test-auc:0.870424+0.052901 [184] train-auc:0.958481+0.003462 test-auc:0.870379+0.053007 [185] train-auc:0.958621+0.003487 test-auc:0.869798+0.053354 [186] train-auc:0.958754+0.003481 test-auc:0.869790+0.052790 [187] train-auc:0.958929+0.003489 test-auc:0.869948+0.052531 [188] train-auc:0.959051+0.003485 test-auc:0.869826+0.052705 [189] train-auc:0.959154+0.003557 test-auc:0.869825+0.052674 [190] train-auc:0.959389+0.003488 test-auc:0.869134+0.052981 [191] train-auc:0.959640+0.003534 test-auc:0.869139+0.052928 [192] train-auc:0.959802+0.003548 test-auc:0.868721+0.053014 [193] train-auc:0.959943+0.003554 test-auc:0.868621+0.053073 [194] train-auc:0.960091+0.003548 test-auc:0.868896+0.053038 [195] 
train-auc:0.960262+0.003518 test-auc:0.868815+0.053007 [196] train-auc:0.960446+0.003483 test-auc:0.868435+0.053389 [197] train-auc:0.960651+0.003504 test-auc:0.868740+0.052869 [198] train-auc:0.960811+0.003404 test-auc:0.868526+0.052753 [199] train-auc:0.961005+0.003476 test-auc:0.868301+0.052774 [200] train-auc:0.961263+0.003473 test-auc:0.869224+0.052641 Warning message: In xgb.get.DMatrix(data, label, missing) : xgboost: label will be ignored.