0
votes
library(e1071)
# Fit a naive Bayes classifier (naiveBayes from e1071) with Survived as the
# outcome and every other column of tr as a predictor.
m1 <- naiveBayes(Survived ~ ., data =tr) # tr is the training set
# Predict on tst with the fitted model. The "subscript out of bounds" error
# and the data.matrix(newdata) warnings shown after this snippet appear to be
# raised by this call.
# NOTE(review): the dput output further down shows character columns (Name,
# Sex, Ticket, Cabin, Embarked, ...); presumably these need to be factors with
# matching levels in tr and tst -- confirm.
Prediction1<-predict(m1,tst)  #tst is the test set
Error in `[.default`(object$tables[[v]], , nd +    islogical[attribs[v]]) : 
subscript out of bounds
In addition: Warning messages:
1: In data.matrix(newdata) : NAs introduced by coercion
2: In data.matrix(newdata) : NAs introduced by coercion
3: In data.matrix(newdata) : NAs introduced by coercion
4: In data.matrix(newdata) : NAs introduced by coercion
5: In data.matrix(newdata) : NAs introduced by coercion
6: In data.matrix(newdata) : NAs introduced by coercion
7: In data.matrix(newdata) : NAs introduced by coercion
8: In data.matrix(newdata) : NAs introduced by coercion
9: In data.matrix(newdata) : NAs introduced by coercion

tr is the training set, which has 17 columns and 891 rows. One of the columns is Survived, which is filled with zeros and ones depending on whether a passenger on the Titanic survived or not. tst is the test set; it has the same 17 columns and 418 rows, and its Survived column is NA. It is NA because that is the value you want to predict and then compare against kaggle.com. Both tst and tr are data.frames. What is the mistake here? I read the manual of naiveBayes and I tried to convert my data to factors, but nothing happened. Thanks in advance.

This is tr: https://www.dropbox.com/s/riklgjabppqa0om/tr.png?dl=0

This is tst: https://www.dropbox.com/s/9juvs6g630181tg/tst.png?dl=0

dput(head(tr, 20)) structure(list(PassengerId = 1:20, Survived = c(0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L ), Pclass = c(3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L), Name = c("Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Briggs Thayer)", "Heikkinen, Miss. Laina", "Futrelle, Mrs. Jacques Heath (Lily May Peel)", "Allen, Mr. William Henry", "Moran, Mr. James", "McCarthy, Mr. Timothy J", "Palsson, Master. Gosta Leonard", "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)", "Nasser, Mrs. Nicholas (Adele Achem)", "Sandstrom, Miss. Marguerite Rut", "Bonnell, Miss. Elizabeth", "Saundercock, Mr. William Henry", "Andersson, Mr. Anders Johan", "Vestrom, Miss. Hulda Amanda Adolfina", "Hewlett, Mrs. (Mary D Kingcome)", "Rice, Master. Eugene", "Williams, Mr. Charles Eugene", "Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)", "Masselmani, Mrs. Fatima"), Sex = c("male", "female", "female", "female", "male", "male", "male", "male", "female", "female", "female", "female", "male", "male", "female", "female", "male", "male", "female", "female"), Age = c(22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55, 2, NA, 31, NA), SibSp = c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 3L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 4L, 0L, 1L, 0L), Parch = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 1L, 0L, 0L, 5L, 0L, 0L, 1L, 0L, 0L, 0L), Ticket = c("A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "330877", "17463", "349909", "347742", "237736", "PP 9549", "113783", "A/5. 
2151", "347082", "350406", "248706", "382652", "244373", "345763", "2649" ), Fare = c(7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708, 16.7, 26.55, 8.05, 31.275, 7.8542, 16, 29.125, 13, 18, 7.225), Cabin = c(NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C103", NA, NA, NA, NA, NA, NA, NA, NA), Embarked = c("S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S", "S", "S", "S", "S", "Q", "S", "S", "C" )), .Names = c("PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked" ), class = c("data.table", "data.frame"), row.names = c(NA, -20L ), .internal.selfref = )

dput(head(tst, 20)) structure(list(PassengerId = 892:911, Pclass = c(3L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 2L, 2L, 3L, 3L, 3L), Name = c("Kelly, Mr. James", "Wilkes, Mrs. James (Ellen Needs)", "Myles, Mr. Thomas Francis", "Wirz, Mr. Albert", "Hirvonen, Mrs. Alexander (Helga E Lindqvist)", "Svensson, Mr. Johan Cervin", "Connolly, Miss. Kate", "Caldwell, Mr. Albert Francis", "Abrahim, Mrs. Joseph (Sophie Halaut Easu)", "Davies, Mr. John Samuel", "Ilieff, Mr. Ylio", "Jones, Mr. Charles Cresson", "Snyder, Mrs. John Pillsbury (Nelle Stevenson)", "Howard, Mr. Benjamin", "Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)", "del Carlo, Mrs. Sebastiano (Argenia Genovesi)", "Keane, Mr. Daniel", "Assaf, Mr. Gerios", "Ilmakangas, Miss. Ida Livija", "Assaf Khalil, Mrs. Mariana (Miriam\")\"" ), Sex = c("male", "female", "male", "male", "female", "male", "female", "male", "female", "male", "male", "male", "female", "male", "female", "female", "male", "male", "female", "female" ), Age = c(34.5, 47, 62, 27, 22, 14, 30, 26, 18, 21, 32.1505376344086, 46, 23, 63, 47, 24, 35, 21, 27, 45), SibSp = c(0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 2L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L ), Parch = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Ticket = c("330911", "363272", "240276", "315154", "3101298", "7538", "330972", "248738", "2657", "A/4 48871", "349220", "694", "21228", "24065", "W.E.P. 5734", "SC/PARIS 2167", "233734", "2692", "STON/O2. 
3101270", "2696" ), Fare = c(7.8292, 7, 9.6875, 8.6625, 12.2875, 9.225, 7.6292, 29, 7.2292, 24.15, 7.8958, 26, 82.2667, 26, 61.175, 27.7208, 12.35, 7.225, 7.925, 7.225), Cabin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "B45", NA, "E31", NA, NA, NA, NA, NA), Embarked = c("Q", "S", "Q", "S", "S", "S", "Q", "S", "C", "S", "S", "S", "S", "S", "S", "C", "Q", "C", "S", "C"), Survived = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Title = c("Mr", "Mrs", "Mr", "Mr", "Mrs", "Mr", "Miss", "Mr", "Mrs", "Mr", "Mr", "Mr", "Mrs", "Mr", "Mrs", "Mrs", "Mr", "Mr", "Miss", "Mrs"), TotalFamily = c(1, 2, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1), FamSize = c("Small", "Small", "Small", "Small", "Medium", "Small", "Small", "Medium", "Small", "Medium", "Small", "Small", "Small", "Small", "Small", "Small", "Small", "Small", "Small", "Small"), FarePrice = c("Low", "Low", "Low", "Low", "Medium", "Low", "Low", "High", "Low", "High", "Low", "High", "Expensive", "High", "Expensive", "High", "Medium", "Low", "Low", "Low" ), AgeNew = c("Adult", "Old", "Old", "Adult", "Adult", "Child", "Adult", "Adult", "Adult", "Adult", "Adult", "Old", "Adult", "Old", "Old", "Adult", "Adult", "Adult", "Adult", "Old")), .Names = c("PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked", "Survived", "Title", "TotalFamily", "FamSize", "FarePrice", "AgeNew"), class = c("data.table", "data.frame"), row.names = c(NA, -20L), .internal.selfref = )

1
It is not very helpful to provide an image of your data. No one will want to type all of that in, and so no one will be able to run your code. Please use dput(tr) and dput(tst) to make a text-readable version of your data. If the data is too long, you might try something like dput(head(tr, 20)). Also, when you use functions outside of base R, it is helpful to mention the packages that you are using by including a statement like library(e1071). – G5W
I did it :) :) :) @G5W – giorgos kapetanidis

1 Answer

0
votes

I had similar issues when using Naive Bayes and computing a confusion matrix. In my case, the type of my outcome variable was character. I converted it into a factor, and then it seemed to work fine. Character fields are different from factors. Try the following:

# Convert the outcome column of the training set to a factor and store it
# back in tr. A bare as.factor(Survived) cannot work here: Survived is a
# column of tr rather than a standalone object, and the converted value
# would be discarded unless it is assigned.
tr$Survived <- as.factor(tr$Survived)