2 votes

I'm having issues predicting with lda. I want to cross-validate my data, so I split it into a training set (80%) and a testing set (20%) 5 times. This gives me two data frames of different lengths. I can fit the lda on the training set no problem, but when I predict I don't get predictions for the newdata; it automatically falls back to predictions on the training set. Any help for an R newbie?
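
For reference, this is roughly how I make one of the 80/20 splits (simplified sketch; df stands in for my full data set, and the real script repeats this 5 times):

# one 80/20 split; df is a placeholder name for the full data
set.seed(1)                                   # only to make the example reproducible
idx <- sample(nrow(df), size = floor(0.8 * nrow(df)))
df.train <- df[idx, ]                         # 80% of the rows for training
df.test  <- df[-idx, ]                        # the remaining 20% for testing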

 df.test=structure(list(DEV.rabbit.Bi = c(0L, 1L, 1L, 0L, 0L, 0L, 1L,0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L), cytoP = c(0,0, 0, 0, 0, 0, 0, -2.1260048, 0, 0, 0, 2.83428136, 0, 0, NA,0, -2.33067135, -3.2528685, 0, 0, -3.9118235, 0, -2.12893162,0, -2.135834975, -3.38015, 0, 2.86341288, 0, -2.4050405, 0, -2.38829672,0, -2.24985834, 0, -2.2202064, -2.15253385, -2.2366473, -2.96851445,0, -0.743292433, 0, 0, 0, -2.61448215, 0, 0, 0, 0, -2.9443965,0, 0), GIP = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0.88683115, 0, 0, 0, 4.31335206, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.900614, 0, 1.4537355, 0,6.168443, 3.872625, 3.1133642, 0, 2.3501405), neuroP = c(0, 0,2.0428646, 0, 0, 0, 0, 0, 0, 0, 5.165785, 0, 0, 0, NA, 0, 0,0, 0, 2.5078381, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.317407, 0, 0, 0,0, 0, 0, 0, 1.9766362, 0, 0, 0, 0, 4.6628686, 0, 0, 0, 4.6432279,4.586727, 0, 0, 0, 7.039145), ProlifP = c(0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, NA, 0, 3.562687467, 0, 0, 0, 0, 0, 0, 0,-2.12833253, 4.947180667, -2.04286463, 0, 0, 0, -2.562395, 0,0, 0, -2.346905, 0, 0, 0, 0, 0, 0, 2.005820067, -3.0411488, -1.885536,-3.2384957, 0, 0, 0, 0, 5.6344196, 0, -4.767982), reproP = c(0.018018017,0.418918933, 0.040540533, 0.018018017, 0.454954967, 0, 0, 0.049549533,0, 0, 0, 0.3963964, 0.058558567, 0.040540533, NA, 0.054054067,0.441441433, 0, 0, 0.040540533, 0.063063067, 0, 0.35135135, 0.058558567,0.018018017, 0, 0.027027027, 0.040540533, 0.1036036, 0.4, 0.2,0.018018017, 0.130630633, 0.018018017, 0.1, 0.054054067, 0.031531533,0.081081067, 0.1036036, 0.040540533, 0.0900901, 0.369369367,0.036036033, -1.1009885, -0.673395133, NA, 0.045045033, 0, 0,0.1036036, -0.984343, 0)), .Names = c("DEV.rabbit.Bi", "cytoP","GIP", "neuroP", "ProlifP", "reproP"), row.names = c(12L, 23L,24L, 27L, 38L, 56L, 59L, 61L, 63L, 65L, 71L, 81L, 128L, 131L,141L, 154L, 163L, 168L, 170L, 184L, 186L, 205L, 210L, 217L, 233L,236L, 253L, 268L, 276L, 293L, 302L, 303L, 312L, 314L, 322L, 326L,335L, 339L, 343L, 361L, 377L, 385L, 392L, 394L, 399L, 402L, 418L,419L, 422L, 427L, 438L, 453L), class = "data.frame")
df.train= structure(list(DEV.rabbit.Bi = c(0L, 1L, 0L, 0L, 0L, 1L, 1L,1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L,0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L,1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L,0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L,0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), cytoP = c(0,NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,NA, 0, 0, 0, -2.648429, 0, 0, -2.1260048, 0, 0, 0, 0, 0, 0, 0,2.83428136, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, -3.126005, 0, 0,0, 7.0318728, 0, 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, 3.024976, 0,0, 0, -2.33067135, 0, 0, NA, 0, 0, -3.3048862, 3.2453672, 0,NA, 0, -3.9118235, NA, 0, 0, 0, 0, 0, -3.3074869, 0, 0, 0, 0,0, NA, 0, 0, 0, -3.64705195, 0, 0, -2.6801575, 0, -2.32687549,0, 0, -3.38015, 0, 0, NA, 0, -2.4122793, 0, 0, 0, 0, 0, 0, -2.434712735,2.86341288, 0, 0, 0, 0, 0, 0, 0, 0, -3.73306513, 0, 0, 0, 0,0, -2.38829672, 0, 0, 0, -0.823873667, 0, 0, 0, -2.24985834,0, 0, 0, 0, 0, -2.2202064, 0, -2.34696895, NA, NA, 0, -2.15253385,-2.1856675, -2.2366473, 2.017460955, -2.96851445, 0, 0, 0, -3.0842214,0, -3.50124325, -5.794065, 0, NA, 0, -3.1539793, -2.5736979,0, 0, -2.3865695, 0, -2.710736745, 0, -0.743292433, 0, 2.373366367,0, -2.75693455, NA, NA, -2.61448215, NA, 0, 0, 0, -2.2124975,0, 0, 0, 0, 0, 0, 0, 0, -3.053354, NA, 5.428529647, -2.9443965,-3.8878643, -2.2083998, 0, 0, 0, NA, 0, NA, -2.13583495, 0, 0,0), GIP = c(0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,5.820918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 3.73598124, 0, 0,4.588133, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0566821, 0,0, 0, 0, 0, 4.31335206, 0, 0, 0, 0, 0, 8.6651012, 0, 2.55087375,0, 0, 0, 0, 0, 0, 0, 0, 3.068526045, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 6.3948068, 0, 0,0, 0, 0, 0, 0, 0, 0, 3.3290915, 3.205779325, 0, 0, 0, 0, 0, 0,0, 0, 0, 1.01417725, 0, 0, 1.35015685, 0, 0, NA, 1.290875, 0,NA, 1.4537355, 0, 0, 0, 3.1133642, 0, 0, 0, 6.168443, 0, 6.26968469,3.872625, 0, 3.890076867, 0, 3.1133642, 2.250768067, 0, 0.97301535,4.8966569, 0, 8.487644, 0, 3.798781, 3.253654875, 4.960366, 0,2.3501405), neuroP = c(0, NA, NA, 0, 0, 0, 0, 0, 2.0428646, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11.03703, NA, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 5.165785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA,0, 0, 0, 0, 0, 3.583922, 0, 0, 0, 0, 0, 0, 2.0009107, 0, NA,NA, 0, 0, 0, 0, 0, 2.55936099, 0, 0, 0, NA, 0, 0, 0, 0, 0, NA,2.5078381, 0, NA, 0, 3.872625, 0, 0, 0, 0, 0, 0, 3.97424399,0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.5064081, NA,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.16196, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 1.929947, 0, 2.000911, 0, 0, 0, 0, 0, 0, 0,0, 0, 2.247053, 0, 0, 0, NA, NA, 0, 0, 0, 1.9766362, 2.126448,0, 0, 0, 0, 4.130221, 0, 0, NA, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0,2.599616, 0, 0, 0, 0, 0, NA, NA, 0, NA, 0, 0, 
0, 0, 3.0913634,0, 0, 4.6432279, 4.586727, 0, 1.58651903, 0, 2.6652475, NA, 0,0, 0, 3.5208109, 4.2195317, 0, 0, NA, 10.5157265, NA, 0, 0, 2.8920614,7.039145), ProlifP = c(0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, -1.945246, 0, 0, 0, 0, NA, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, 0, -11.05227, 0, 0, 0,0, 3.562687467, 0, 0, NA, 0, 0, 0, -2.02585, 3.887923007, NA,0, 0, NA, 0, 0, 0, 0, 0, 3.7865502, 0, 0, 0, 0, 0, NA, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 4.947180667, 0, 0, NA, 0, 0, 0, 0,-2.04286463, -2.0343177, 0, 0, 5.591507567, 0, -2.0868461, 0,0, 0, 0, 0, 0, 5.151728643, 4.936735813, 0, 0, 0, 0, -2.562395,0, -2.009148, -7.564251, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2.346905,3.207918667, 0, 0, -2.9254072, NA, NA, 0, 0, -2.5948795, 0, -2.060203,0, -4.14739583, -2.8027302, -4.487039, 0, 0, 0, NA, -2.8964375,5.003374, -2.263317, 0, 3.609647733, -2.6806902, 0, 3.505242133,0, 3.120921753, -3.445611, 0, 0, 5.147579867, 0, 0, NA, NA, -3.2384957,NA, 0, 0, 0, -2.798781, -1.6022584, 0, 0, 0, 0, 0, 0, 0, 4.713909533,4.4782686, -5.831885, 5.6344196, -6.8794451, 4.888960867, -3.1387679,-5.5994579, 0, NA, 0, NA, 0, 0, -4.6923589, -4.767982), reproP = c(0,0.58783785, 0.1486486, 0, 0.018018017, 0, 0.063063067, 0.418918933,0.040540533, 0, 0.4864865, 0.018018017, 0.0855856, 0.018018017,0.3963964, 0, 0, 0.0990991, 0.333333333, 0, 0, 0, 0, 0, 0, 0.076576567,0, 0.081081067, 0.049549533, 0.3873874, 0, 0, 0, 0.15, 0.06756755,0.0617284, 0.3963964, 0.383333333, 0.018018017, 0, 0.15, 0.031531533,0.3918919, 0.058558567, 0.0810811, 0, 0, 0.067567567, 0, 0, 0,0, 0, 0.06756755, 0, 0.516666667, NA, 0.058558567, 0.1621622,0.2567568, NA, NA, 0.1419753, 0, 0, 0.054054067, 0.040540533,0.018018017, 0.441441433, 0.031531533, 0, 0, 0, 0.1126126, 0.072072067,0, 0.35802469, 0.0472973, 0.040540533, 0.063063067, 0.16216215,0.083333333, 0.333333333, 0.018018017, 0.024691357, 0.0945946,0.0945946, 0.045045033, 0, 0.037037035, 0, 0.081081067, 0, 0.135135133,0.058558567, 0.081081067, 0.031531533, 0, 0.013513513, 0.063063067,0.333333333, 0.35802469, 0.1081081, 0.040540533, 0, 0.018018017,0.081081067, 0.075, 0.045045033, 0.067567567, 0.040540533, 0.031531533,0.027027027, 0.031531533, 0.036036033, 0.45, 0.018018017, 0.040540533,-0.7265556, 0.031531533, 0.4144144, 0.10185185, 0.067567567,0, 0.040540533, 0.018018017, 0.027027025, 0.0990991, 0.1036036,0.027027025, 0.054054067, 0.2, 0.018018017, 0, 0, 0.033333333,0, 0.031531533, 0.378378367, 0.130630633, 0.018018017, 0.1, 0,0, 0.1, 0, 0.054054067, 0.459459467, 0.031531533, 0.075, 0.5,0.364864867, 0.031531533, 0.06756755, 0.081081067, 0.6418919,0.1036036, 0.35135135, 0.054054067, -0.931616333, 0.3918919,0, 0.0855856, 0.1081081, 0.373873867, NA, 0.333333333, 0.0990991,-1.345913467, 0.040540533, 0.018018017, 0.081081067, 0.3963964,0.018018017, 0, 0.0900901, 0.2027027, 0.031531533, 0.3963964,0.364864867, 0.0743243, 0, -0.673395133, 0.06756755, NA, -0.316663167,0.031531533, 0, 0.031531533, 0.3873874, 0.0608108, 0.045045033,0, -1.004574, 0.018018017, 0, 0.4144144, 0.55405405, 0, 0.1036036,-1.646125933, -1.5806603, -0.9572768, -0.818359433, -0.984343,0.2, -4.2037963, 0, -1.2499105, 0.4, 0.0608108, 0)), .Names = c("DEV.rabbit.Bi","cytoP", "GIP", "neuroP", "ProlifP", "reproP"), row.names = c(2L,4L, 6L, 11L, 12L, 13L, 15L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,30L, 34L, 35L, 39L, 40L, 43L, 44L, 48L, 55L, 56L, 57L, 58L, 59L,60L, 61L, 62L, 63L, 65L, 71L, 72L, 75L, 79L, 81L, 84L, 85L, 86L,87L, 91L, 92L, 93L, 94L, 97L, 100L, 101L, 102L, 
105L, 112L, 115L,118L, 119L, 120L, 121L, 126L, 128L, 129L, 132L, 136L, 141L, 144L,148L, 151L, 154L, 155L, 156L, 163L, 164L, 166L, 169L, 170L, 178L,179L, 180L, 181L, 183L, 184L, 186L, 188L, 190L, 191L, 193L, 194L,198L, 199L, 200L, 201L, 202L, 205L, 206L, 212L, 215L, 217L, 222L,223L, 224L, 228L, 229L, 230L, 231L, 232L, 235L, 236L, 238L, 239L,244L, 248L, 249L, 250L, 252L, 253L, 257L, 262L, 263L, 265L, 268L,271L, 272L, 275L, 279L, 282L, 285L, 286L, 287L, 289L, 290L, 291L,294L, 301L, 302L, 303L, 304L, 305L, 307L, 309L, 310L, 311L, 312L,314L, 315L, 317L, 319L, 322L, 323L, 326L, 327L, 329L, 331L, 333L,334L, 335L, 338L, 339L, 342L, 343L, 344L, 346L, 349L, 350L, 352L,353L, 354L, 356L, 359L, 360L, 363L, 365L, 366L, 368L, 370L, 371L,374L, 376L, 377L, 380L, 381L, 384L, 387L, 393L, 395L, 399L, 400L,402L, 403L, 408L, 409L, 414L, 415L, 417L, 418L, 419L, 420L, 421L,422L, 424L, 425L, 426L, 427L, 428L, 429L, 434L, 437L, 438L, 441L,442L, 443L, 444L, 448L, 451L, 453L), class = "data.frame")

lda.train= lda(df.train$DEV.rabbitBi~ df.train[,c(2)] +df.train[,c(3)]+df.train[,c(4)]+df.train[,c(5)]+df.train[,c(6)], data=df.train)

lda.pred= predict(lda.train, newdata=df.test)$class
I think this will likely be due to how you are passing the variables to lda. As you are not using the variable names, predict cannot match the variables in the newdata to the fitted model. [ps I think the df.train dput is incomplete] – user20650
Would it be able to predict my data if it were in a different format, or if I attached the names? – Peter Karoway
Try fitting the lda with lda.train <- lda(DEV.rabbitBi ~ ., data=df.train[c(1:6)]) and then predicting. – user20650
It worked! Do you mind explaining the syntax of what that means/does and putting it in an answer (esp. the "~ .", because I don't know what that means)? I'm trying to understand R grammar and also want to give you credit for answering my question! – Peter Karoway
Good stuff, I'll write it up in a wee bit. Cheers. – user20650

1 Answer

2 votes

The call to predict does not work because the variable names stored in the lda model do not match the variable names in the newdata / test data (see the newdata argument description in ?predict.lda: a data frame with columns of the same names as the variables used). As they don't match, the predict method ignores the newdata argument and predicts on the original data. You have used the same variables in the model and in the predict call, but it matters how they are passed to the lda model.

Using the example from ?lda

library(MASS)

Iris <- data.frame(rbind(iris3[,,1], iris3[,,2], iris3[,,3]),
                   Sp = rep(c("s","c","v"), rep(50,3)))
train <- sample(1:150, 75)

names(Iris)
#[1] "Sepal.L." "Sepal.W." "Petal.L." "Petal.W." "Sp"      

z <- lda(Sp ~ . , Iris, prior = c(1,1,1)/3, subset = train)
z
#...
#...
# Coefficients of linear discriminants:
#                 LD1        LD2
# Sepal.L.  0.5863648  0.7580133
# Sepal.W.  2.0444073 -2.5613102
# Petal.L. -1.8827963  0.9356446
# Petal.W. -3.5895106 -3.4927051
#...
#...

The variable names in the model match those in the data and those in the newdata, so predict will work as intended:

pred <- predict(z, Iris[-train, ])
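
If you want to check that the predictions really line up with the held-out rows, you could, for example, tabulate them against the true classes (illustrative only; the exact counts depend on the random split):

table(predicted = pred$class, actual = Iris$Sp[-train])   # confusion matrix for the held-out rows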

When you pass the variables differently, you can run into problems:

z <- lda(Sp ~ Iris[,1] + Iris[,2] + Iris[,3] + Iris[,4]  , Iris, prior = c(1,1,1)/3, subset = train)
z
# Coefficients of linear discriminants:
#                  LD1        LD2
# Iris[, 1]  0.5863648  0.7580133
# Iris[, 2]  2.0444073 -2.5613102
# Iris[, 3] -1.8827963  0.9356446
# Iris[, 4] -3.5895106 -3.4927051

Notice the coefficient names. Now when you use the predict function, it expects variables called Iris[, 1], etc. to be present in newdata, which is not the case, so predict ignores the newdata.
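
A quick way to make the silent fall-back visible (illustrative; the counts assume the 75-row training sample drawn above):

length(pred$class)                         # 75: one prediction per held-out row from the correctly specified model
length(predict(z, Iris[-train, ])$class)   # 150: Iris[, 1] etc. are found in the workspace, not in newdata,
                                           #      so all of Iris is used and the newdata is ignored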

As for the ~ . form: this is just a shorthand that pulls all variables from the data (given by data=), except whatever is on the left-hand side of the formula (lhs ~ rhs), into the model; the . shorthand is documented in ?formula. Note that you can use the data argument to select only certain columns, as sketched below.
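
For example (purely illustrative), restricting the columns passed through data controls which predictors the . expands to:

z.sub <- lda(Sp ~ ., data = Iris[, c("Sp", "Sepal.L.", "Sepal.W.")], prior = c(1,1,1)/3)
colnames(z.sub$means)
# [1] "Sepal.L." "Sepal.W."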

So the first lda call above is equivalent to writing all the names out manually:

z <- lda(Sp ~ Sepal.L. + Sepal.W. + Petal.L. + Petal.W.  , 
                             Iris, prior = c(1,1,1)/3, subset = train)

Note that this behaviour is common to most (if not all) predict methods: the names in any new data need to match the names of the variables used to fit the model.
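
A rough sketch of the same fix applied to the data in the question (untested here; it assumes the response column is spelled DEV.rabbit.Bi, as in the dput above, rather than the DEV.rabbitBi used in the comments, and that rows with NA are handled, e.g. dropped, before predicting):

lda.train <- lda(DEV.rabbit.Bi ~ ., data = df.train[, 1:6])   # variable names are now stored in the model
lda.pred  <- predict(lda.train, newdata = df.test)$class      # one prediction per row of df.test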