0
votes

I want to incorporate the function `confusionMatrix()` from the caret package into the function `shuffle100` to produce confusion matrices from subsets (data frames) of a master list produced from classification tree models. My aim is to produce confusion matrix statistics such as classification accuracy, the kappa metric, etc. (desired output below). I am sorry to ask such a simple question, but I cannot figure this out. If anyone can help, many thanks in advance.

Reproducible dummy data can be found at this address:

Reproducible data

Code to produce a nested list of classification tree model predictions and confusion matrices

        library(caret)
        library(e1071)
        library(rpart)

        set.seed(1235)

        # Build 10 confusion matrices. Each one comes from a classification
        # tree fitted to a freshly drawn 80-row subset of `my_data`.
        shuffle100 <- lapply(seq_len(10), function(n) {
          # Draw 80 rows at random, then shuffle their order once more.
          subset <- my_data[sample(nrow(my_data), 80), ]
          subset_idx <- sample(seq_len(nrow(subset)), replace = FALSE)
          subset <- subset[subset_idx, ]

          # Keep 70 % of the rows as training data.
          # NOTE(review): the partition is computed on the shuffled row
          # indices, not on the outcome; consider passing `subset$Family`
          # so the split is stratified by class - confirm intent.
          subset_resampled_idx <- createDataPartition(
            subset_idx,
            times = 1, p = 0.7, list = FALSE
          )
          subset_resampled <- subset[subset_resampled_idx, ]

          # Fit one classification tree per subset.
          ct_mod <- rpart(
            Family ~ .,
            data = subset_resampled,
            method = "class",
            control = rpart.control(cp = 0.005)
          )

          # Ask for predicted class labels. Without `type = "class"`,
          # predict() on a class tree yields a matrix of class
          # probabilities, which is not what confusionMatrix() tabulates.
          # NOTE(review): predictions cover all 80 rows (training rows
          # included), matching the 80 cases in the desired output.
          ct_pred <- predict(ct_mod, newdata = subset, type = "class")

          # Compare against the observed classes of the SAME rows, with
          # both vectors coded as factors on one shared set of levels.
          class_levels <- union(
            levels(ct_pred),
            as.character(unique(subset$Family))
          )
          confusionMatrix(
            factor(ct_pred, levels = class_levels),
            factor(subset$Family, levels = class_levels)
          )
        })

Error messages

        Error in sort.list(y) : 'x' must be atomic for 'sort.list'
        Have you called 'sort' on a list?
        Called from: sort.list(y)

Desired outcome

                    Confusion Matrix and Statistics

                    Reference
         Prediction G8 V4
                 G8 42 12
                 V4  8 18

                Accuracy : 0.75            
                  95% CI : (0.6406, 0.8401)
     No Information Rate : 0.625           
     P-Value [Acc > NIR] : 0.01244         

                   Kappa : 0.4521          
  Mcnemar's Test P-Value : 0.50233         

             Sensitivity : 0.8400          
             Specificity : 0.6000          
          Pos Pred Value : 0.7778          
          Neg Pred Value : 0.6923          
              Prevalence : 0.6250          
          Detection Rate : 0.5250          
    Detection Prevalence : 0.6750          
       Balanced Accuracy : 0.7200          

        'Positive' Class : G8              
1

1 Answers

0
votes

Here is code to produce confusion matrices from sub-lists (data frames) in a master list produced from classification tree models, using the function `confusionMatrix` in the caret package.

   # Append four placeholder columns to every data frame in `shuffle100`:
   # (1) `Predicted`
   # (2) `Actual`
   # (3) `Binary`
   # (4) `Actual2`
   my_list <- lapply(shuffle100, function(df) {
     # Size the placeholder by nrow(df): a zero-row data frame then gets
     # zero-length columns, whereas mixing zero-length and length-one
     # columns makes cbind() fail with differing row counts.
     blank <- rep("", nrow(df))
     cbind(
       df,
       Predicted = blank,
       Actual = blank,
       Binary = blank,
       Actual2 = blank
     )
   })

  # Reset the four bookkeeping columns of every data frame to NA:
  # `Predicted`, `Actual`, `Binary` and `Actual2`.
  # NOTE(review): mutate() does not come from the packages loaded in the
  # visible code (caret, e1071, rpart); presumably dplyr is attached
  # elsewhere - confirm.
  Final_lists <- lapply(
    my_list,
    function(scores) {
      mutate(scores, Predicted = NA, Actual = NA, Binary = NA, Actual2 = NA)
    }
  )

  # FILL THE PREDICTED COLUMN
  #
  # For every row, `Predicted` receives the name of whichever of the first
  # two columns holds the larger value (e.g. V4 > G8 or G8 > V4). On a tie
  # the first column wins, because the comparison is strict.
  #
  # The whole column is computed at once and written by name, so the code
  # also copes with zero-row data frames and an empty list, and does not
  # depend on `Predicted` sitting in column position 3.
  Final_lists <- lapply(Final_lists, function(scores) {
    class_names <- names(scores)[1:2]
    second_wins <- scores[[2]] > scores[[1]]
    # FALSE + 1L selects the first name, TRUE + 1L the second.
    scores$Predicted <- class_names[second_wins + 1L]
    scores
  })

  Final_lists

 # FILL THE ACTUAL COLUMN
 #
 # The observed classes are taken from normalised_scores$Family and kept in
 # `Actual`, ready to be copied into the `Actual` column of every data
 # frame. (An earlier assignment that extracted column 4 from each data
 # frame was removed: its result was overwritten before being used.)
 # NOTE(review): `normalised_scores` is not defined in the visible code -
 # confirm it is the data set the predictions were made from.
 normalised_Actual <- normalised_scores$Family
 Actual <- normalised_Actual

  # Copy the observed classes into the `Actual` column of each data frame.
  #
  # The list is walked by position and every element is returned with its
  # `Actual` column replaced by the `Actual` vector.
  # NOTE(review): the same vector is written to every data frame, so this
  # assumes its length and row order match each data frame - confirm.
  Actual_list <- lapply(seq_along(Final_lists), function(k) {
    scores <- Final_lists[[k]]
    scores$Actual <- Actual
    scores
  })

  # FILL THE BINARY COLUMN
  #
  # For each data frame in the list, `Binary` is 1 where `Predicted` equals
  # `Actual` and 0 otherwise.
  #
  # The comparison is done on whole columns instead of row by row. Both
  # sides are compared as character so a factor and a character column can
  # be matched; a missing value gives NA rather than stopping the script,
  # and zero-row data frames are handled.
  Actual_list <- lapply(Actual_list, function(scores) {
    is_hit <- as.character(scores$Predicted) == as.character(scores$Actual)
    scores$Binary <- as.numeric(is_hit)
    scores
  })


 # FILL THE ACTUAL2 COLUMN
 #
 # `Actual2` is a numeric coding of `Actual`: 1 where the observed class is
 # "V4" and 0 otherwise. It is computed on the whole column at once, so a
 # missing value gives NA rather than stopping the script and zero-row
 # data frames are handled.
 Actual_list <- lapply(Actual_list, function(scores) {
   scores$Actual2 <- as.numeric(as.character(scores$Actual) == "V4")
   scores
 })

 Actual_list

# Generate one confusion matrix per data frame in `Actual_list`.
confusionMatrices <- lapply(Actual_list, function(scores) {
  # confusionMatrix() takes factors that share the same levels, while
  # `Predicted` is filled with character values. Both vectors are
  # therefore converted to factors on one common set of levels, with the
  # levels of `Actual` listed first.
  actual <- as.factor(scores$Actual)
  predicted <- as.factor(scores$Predicted)
  class_levels <- union(levels(actual), levels(predicted))

  confusionMatrix(
    factor(predicted, levels = class_levels),
    factor(actual, levels = class_levels)
  )
})