3
votes

I am trying to plot a ROC curve in ggplot. I know ggplot needs the "FPR" and "TPR" values, but I can't get to these values from the "performance" function or the "prediction" function.

I need to combine the ROC curves of two classifiers in one plot.

this is the code I want to use:

# Draw one ROC line per classifier. `df` must supply the FPR, TPR and
# classifier columns that aes() refers to.
ggplot(data = df, mapping = aes(x = FPR, y = TPR, colour = classifier)) +
  geom_line(size = 2, alpha = 0.7) +
  labs(
    title = "ROC curve",
    x = "False Positive Rate (1-Specificity)",
    y = "True Positive Rate (Sensitivity)"
  )

this is the data:

> dput(xgboost_prediction[1:100,])
structure(list(no = c(0.990442335605621, 0.534177422523499, 0.991996705532074, 
0.84391313791275, 0.965542376041412, 0.691059589385986, 0.718063056468964, 
0.995023250579834, 0.956590950489044, 0.94465434551239, 0.775800347328186, 
0.725375294685364, 0.8955979347229, 0.519426822662354, 0.262101024389267, 
0.920495390892029, 0.961367130279541, 0.529997289180756, 0.753180921077728, 
0.374252378940582, 0.649519801139832, 0.873721957206726, 0.938673436641693, 
0.98504501581192, 0.990583181381226, 0.991510570049286, 0.981655895709991, 
0.903484642505646, 0.965107679367065, 0.953721821308136, 0.554582417011261, 
0.919193744659424, 0.965399146080017, 0.574111104011536, 0.898433864116669, 
0.901707828044891, 0.951646208763123, 0.637083053588867, 0.947289109230042, 
0.920474767684937, 0.995938301086426, 0.973612129688263, 0.964727997779846, 
0.884541928768158, 0.95257955789566, 0.412161201238632, 0.990020036697388, 
0.976162195205688, 0.939770936965942, 0.968810081481934, 0.983040153980255, 
0.89262580871582, 0.970016121864319, 0.923628032207489, 0.886549472808838, 
0.522226393222809, 0.960890293121338, 0.975551664829254, 0.710578739643097, 
0.973441541194916, 0.388043284416199, 0.944134533405304, 0.962944269180298, 
0.958814978599548, 0.948761761188507, 0.973393976688385, 0.971923232078552, 
0.883057355880737, 0.974324226379395, 0.886299729347229, 0.464900106191635, 
0.746802806854248, 0.735462188720703, 0.663913428783417, 0.965535283088684, 
0.990276217460632, 0.201064333319664, 0.98608660697937, 0.870771527290344, 
0.99166601896286, 0.974869549274445, 0.990644574165344, 0.991726994514465, 
0.827968239784241, 0.988746464252472, 0.783728361129761, 0.221792578697205, 
0.98324054479599, 0.617589056491852, 0.986640393733978, 0.932099580764771, 
0.967139899730682, 0.956782400608063, 0.90983521938324, 0.990782082080841, 
0.995104193687439, 0.809846758842468, 0.698821008205414, 0.996101856231689, 
0.528671264648438), yes = c(0.00955766439437866, 0.465822577476501, 
0.00800329446792603, 0.15608686208725, 0.0344576239585876, 0.308940410614014, 
0.281936943531036, 0.00497674942016602, 0.0434090495109558, 0.0553456544876099, 
0.224199652671814, 0.274624705314636, 0.1044020652771, 0.480573177337646, 
0.737898975610733, 0.0795046091079712, 0.038632869720459, 0.470002710819244, 
0.246819078922272, 0.625747621059418, 0.350480198860168, 0.126278042793274, 
0.0613265633583069, 0.0149549841880798, 0.00941681861877441, 
0.00848942995071411, 0.0183441042900085, 0.0965153574943542, 
0.0348923206329346, 0.046278178691864, 0.445417582988739, 0.0808062553405762, 
0.0346008539199829, 0.425888895988464, 0.101566135883331, 0.0982921719551086, 
0.0483537912368774, 0.362916946411133, 0.0527108907699585, 0.0795252323150635, 
0.00406169891357422, 0.0263878703117371, 0.0352720022201538, 
0.115458071231842, 0.0474204421043396, 0.587838798761368, 0.0099799633026123, 
0.0238378047943115, 0.0602290630340576, 0.0311899185180664, 0.0169598460197449, 
0.10737419128418, 0.0299838781356812, 0.076371967792511, 0.113450527191162, 
0.477773606777191, 0.0391097068786621, 0.0244483351707458, 0.289421260356903, 
0.0265584588050842, 0.611956715583801, 0.055865466594696, 0.0370557308197021, 
0.0411850214004517, 0.0512382388114929, 0.026606023311615, 0.0280767679214478, 
0.116942644119263, 0.0256757736206055, 0.113700270652771, 0.535099893808365, 
0.253197193145752, 0.264537811279297, 0.336086571216583, 0.0344647169113159, 
0.00972378253936768, 0.798935666680336, 0.0139133930206299, 0.129228472709656, 
0.00833398103713989, 0.0251304507255554, 0.00935542583465576, 
0.00827300548553467, 0.172031760215759, 0.0112535357475281, 0.216271638870239, 
0.778207421302795, 0.01675945520401, 0.382410943508148, 0.0133596062660217, 
0.0679004192352295, 0.0328601002693176, 0.0432175993919373, 0.0901647806167603, 
0.00921791791915894, 0.00489580631256104, 0.190153241157532, 
0.301178991794586, 0.00389814376831055, 0.471328735351562)), .Names = c("no", 
"yes"), row.names = c(NA, 100L), class = "data.frame")

> dput(randomforest_prediction[1:100,])
structure(list(no = c(0.694, 0.606, 0.778, 0.498, 0.748, 0.604, 
0.446, 0.586, 0.686, 0.748, 0.708, 0.574, 0.662, 0.614, 0.65, 
0.616, 0.618, 0.664, 0.562, 0.496, 0.628, 0.77, 0.652, 0.738, 
0.674, 0.73, 0.684, 0.7, 0.678, 0.672, 0.616, 0.71, 0.93, 0.774, 
0.668, 0.682, 0.752, 0.926, 0.776, 0.796, 0.85, 0.728, 0.57, 
0.648, 0.804, 0.64, 0.766, 0.722, 0.73, 0.706, 0.748, 0.608, 
0.684, 0.708, 0.746, 0.554, 0.732, 0.816, 0.888, 0.656, 0.808, 
0.908, 0.69, 0.722, 0.764, 0.746, 0.728, 0.57, 0.95, 0.656, 0.066, 
0.692, 0.794, 0.638, 0.63, 0.8, 0.572, 0.776, 0.776, 0.702, 0.848, 
0.77, 0.864, 0.682, 0.784, 0.582, 0.028, 0.694, 0.642, 0.428, 
0.636, 0.654, 0.798, 0.576, 0.674, 0.756, 0.606, 0.648, 0.676, 
0.498), yes = c(0.306, 0.394, 0.222, 0.502, 0.252, 0.396, 0.554, 
0.414, 0.314, 0.252, 0.292, 0.426, 0.338, 0.386, 0.35, 0.384, 
0.382, 0.336, 0.438, 0.504, 0.372, 0.23, 0.348, 0.262, 0.326, 
0.27, 0.316, 0.3, 0.322, 0.328, 0.384, 0.29, 0.07, 0.226, 0.332, 
0.318, 0.248, 0.074, 0.224, 0.204, 0.15, 0.272, 0.43, 0.352, 
0.196, 0.36, 0.234, 0.278, 0.27, 0.294, 0.252, 0.392, 0.316, 
0.292, 0.254, 0.446, 0.268, 0.184, 0.112, 0.344, 0.192, 0.092, 
0.31, 0.278, 0.236, 0.254, 0.272, 0.43, 0.05, 0.344, 0.934, 0.308, 
0.206, 0.362, 0.37, 0.2, 0.428, 0.224, 0.224, 0.298, 0.152, 0.23, 
0.136, 0.318, 0.216, 0.418, 0.972, 0.306, 0.358, 0.572, 0.364, 
0.346, 0.202, 0.424, 0.326, 0.244, 0.394, 0.352, 0.324, 0.502
)), .Names = c("no", "yes"), row.names = c("1", "2", "3", "4", 
"7", "8", "9", "10", "13", "14", "15", "16", "18", "19", "22", 
"23", "24", "27", "28", "29", "30", "31", "32", "33", "34", "35", 
"36", "38", "39", "40", "41", "45", "46", "47", "48", "50", "51", 
"55", "56", "57", "58", "60", "61", "62", "64", "65", "66", "68", 
"70", "71", "73", "75", "76", "77", "78", "79", "80", "82", "83", 
"84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "95", 
"96", "99", "100", "101", "102", "103", "105", "106", "107", 
"108", "109", "110", "112", "114", "115", "116", "118", "120", 
"123", "124", "125", "126", "127", "128", "129", "130", "131", 
"132", "133", "135"), class = "data.frame")

> dput(test_set$y[1:100])
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor")

can someone please help? I have been trying for hours!

1
In your data, what do 'yes' and 'no' refer to? – Chris
These are the predictions for the two classes: the probability of being yes or no. – anat
So you need to do hypothesis testing to find the FPR. – Chris
How should one calculate the false positive rate from just knowing class prediction percentages (and not the true labels)? – lukeA
No, I have the true labels. Sorry, I will add them. The thing is, there is a function that calculates these rates... I just don't seem to be able to get access to the values. – anat

1 Answer

1
votes

The following code could be a solution to your problem.
Hope it can help you.

library(ROCR)
library(ggplot2)

# Column 2 of each prediction data frame is the predicted probability of the
# "yes" class; test_set$y holds the true labels (factor levels "no", "yes").
pred.xgb <- prediction(xgboost_prediction[, 2], test_set$y)
# `measure` is stored in the y.values slot and `x.measure` in the x.values
# slot, so name both arguments to keep the two rates from being swapped.
perf.xgb <- performance(pred.xgb, measure = "tpr", x.measure = "fpr")
fpr.xgb <- perf.xgb@x.values[[1]]
tpr.xgb <- perf.xgb@y.values[[1]]

# NOTE(review): the posted randomforest_prediction sample has row names that
# skip values (1, 2, 3, 4, 7, ...), so its rows may not line up one-to-one
# with test_set$y -- confirm the labels are aligned before trusting this curve.
pred.rf <- prediction(randomforest_prediction[, 2], test_set$y)
perf.rf <- performance(pred.rf, measure = "tpr", x.measure = "fpr")
fpr.rf <- perf.rf@x.values[[1]]
tpr.rf <- perf.rf@y.values[[1]]

# Stack both curves in long format, one row per cutoff, tagged by classifier,
# so ggplot can colour the lines by Method.
df <- data.frame(
  tpr = c(tpr.xgb, tpr.rf),
  fpr = c(fpr.xgb, fpr.rf),
  Method = c(
    rep("XGB", times = length(tpr.xgb)),
    rep("RF", times = length(tpr.rf))
  )
)
ggplot(df, aes(x = fpr, y = tpr, color = Method)) +
  geom_line()

The AUC can be calculated using:

# Area under the ROC curve for the XGBoost predictions; the value is read
# from the first element of the y.values slot of the performance object.
slot(performance(pred.xgb, measure = "auc"), "y.values")[[1]]
# 0.6766304

# Same measure for the random forest predictions.
slot(performance(pred.rf, measure = "auc"), "y.values")[[1]]
# 0.6650815