0
votes

I am trying to roughly reproduce C-classification SVM cross-validation skill in R (e1071 package) using Python (scikit-learn), but am getting nowhere near R's prediction skill. Given the below training and test data (which have been smoothed from datasets much larger in length), R prediction skill is 0.87 (where 1 is perfect), and Python skill is 0.55 which is not much better than guessing. Note that I am not at all trying to get identical results, I am just hopeful that if R can do reasonably well then so could Python on the same dataset. I have split my data 50-50 (training and test), and am trying to predict binomial results from floats. R and Python code are given below. All the default SVM arguments that I checked were the same between both R and Python (gamma, C(cost), shrinking, tol, etc).

R code:

library("e1071")

data <- c(-108.604150711185, -131.880188127745, -18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562, -396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413)
class <- as.factor(c(0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0))
df_training <- data.frame(data, class)

data <- c(133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133, 80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719, 85.5520123243496, 120.582346886614, -214.850084749638, 96.8090523924549)
class <- as.factor(c(1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1))
df_test <- data.frame(data, class)

#train model
best.svm <- best.tune(svm,
                      class~data,
                      data=df_training,kernel = 'radial',cost = 1, gamma = 0.01,
                      type = "C-classification")

#make predictions
TrainingPredictions<-predict(best.svm,df_training,type="class")
TestPredictions <- predict(best.svm,df_test,type="class")

Skill = sum(TestPredictions==df_test[[c('class')]])/length(TestPredictions)
print(Skill) #value is 0.87

Python code:

import numpy as np
from sklearn.svm import SVC

Data = np.array([-108.604150711185, -131.880188127745,-18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562,-396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413])
Class = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])
df_training = np.array([Data, Class])

Data = np.array([133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133,80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719,85.5520123243496, 120.582346886614, -214.850084749638, 96.8090523924549])
Class = np.array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1])
df_test = np.array([Data, Class])

# train model                                                                                                                                                 
clf = SVC(verbose=True, gamma=0.01, kernel='rbf', C=1)                                                     

# make predictions
clf.fit(df_training[0].reshape(88,1), df_training[1].reshape(88,1))
TrainingPredictions = clf.predict(df_training[0].reshape(88,1))
TestPredictions = clf.predict(df_test[0].reshape(89,1))
Skill = np.sum(TestPredictions==df_test[1])/float(len(TestPredictions))
print Skill #value is 0.55
1
As an R-only "speaker/thinker" I find it odd that this complaint about Python would have an R tag. I also find it annoying even when people with questions about howto-in-R post code that is in another computer language (but no clear natural language description) and expect us monolingual R-users to offer advice. It's also likely that SVM's are non-deterministic so just offering output of a single run on a small dataset with binomial predictions is unlikely to be convincing evidence of a "true difference". - IRTFM
Mr Grumpy Face: I made some effort to write this post in and concise and clear way. I hoped it would be about the details of tuning SVMs for maximum skill. SVMs are may not be deterministic, but you can certainly tune them to improve skill (the best.svm function in R does this). The data posted is smoothed from data 20 times larger, so there should not be much noise. - Oliver Angelil
So... Do you really think an R tag is useful for people trying to find information in the future about how to do SVM's with Python in the future? If the "value"'s are comparable then don't you think it would have added something to the question to say that they were constructed with data that was significantly larger than what appears? - IRTFM
Good point, I'll add that bit into the question. Thanks. - Oliver Angelil

1 Answers

2
votes

This observed difference might come from the fact that in R, svm() scales the data by default (see documentation, page 6).

If you use scikit-learn's StandardScaler, you end up with a result pretty close from the one you obtained with R :

import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
Data = np.array([-108.604150711185, -131.880188127745,-18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562,-396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413])
Data = scaler.fit_transform(Data)
Class = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])
df_training = np.array([Data, Class])

Data = np.array([133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133,80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719,85.5520123243496, 120.582346886614, -214.850084749638, 96.8090523924549])
Data = scaler.fit_transform(Data)
Class = np.array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1])
df_test = np.array([Data, Class])

# train model                                                                                                                                                 
clf = SVC(verbose=True, gamma=0.01, kernel='rbf', C=1)                                                     

# make predictions
clf.fit(df_training[0].reshape(88,1), df_training[1].reshape(88,1))
TrainingPredictions = clf.predict(df_training[0].reshape(88,1))
TestPredictions = clf.predict(df_test[0].reshape(89,1))
Skill = np.sum(TestPredictions==df_test[1])/float(len(TestPredictions))
print("Skill: "+str(Skill)) #value is 0.84