1
votes

I'm currently using the following code to train an SVM to classify (Iceberg Challenge), I need to display the accuracy of the model but I'm facing problems.

import numpy as np 
import pandas as pd 
import cv2 
from sklearn import svm 
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.metrics import log_loss
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import pdb
from subprocess import check_output
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

print(check_output(["ls", "/content/drive/MyDrive/iceberg"]).decode("utf8"))
trainDataFilePath = "/content/drive/MyDrive/iceberg/train.json"
testDataFilePath = "/content/drive/MyDrive/iceberg/test.json"
trainDataFrame = pd.read_json('/content/drive/MyDrive/iceberg/train.json')
testDataFrame = pd.read_json('/content/drive/MyDrive/iceberg/test.json')

def getHogDescriptor(image,binNumber = 16):
   gx = cv2.Sobel(image, cv2.CV_32F, 1, 0)
   gy = cv2.Sobel(image, cv2.CV_32F, 0, 1)
   mag, ang = cv2.cartToPolar(gx, gy)
   bins = np.int32(binNumber*ang/(2*np.pi))    
   bin_cells = bins[:10,:10], bins[10:,:10], bins[:10,10:], bins[10:,10:]
   mag_cells = mag[:10,:10], mag[10:,:10], mag[:10,10:], mag[10:,10:]
   hists = [np.bincount(b.ravel(), m.ravel(), binNumber) for b, m in zip(bin_cells, mag_cells)]
   hist = np.hstack(hists)    
   hist = np.array(hist,dtype=np.float32)
   return hist

def makeRGBImageFromHnV(bandHH,bandHV):
    b = np.divide(bandHH, bandHV, out=np.zeros_like(bandHH), where=(bandHV!=0))
    rgb = np.dstack((bandHH.astype(np.float32), bandHV.astype(np.float32),b.astype(np.uint16)))
    return rgb

def getMeanImageFromImageDataFrame(trainDataFrame):
    meanImage = np.zeros(shape =(75,75,3),dtype = np.float32)
    for currentImage in trainDataFrame["fullImage"]:
        meanImage = meanImage + currentImage
    meanImage = meanImage/len(trainDataFrame)
    return meanImage.astype(np.float32)

def getStandardDeviationFromImageDataFrame(trainDataFrame,meanImage):
    stdImage = np.zeros(shape =(75,75,3),dtype = np.float32)
    for currentImage in trainDataFrame["fullImage"]:
        stdImage = stdImage + (currentImage - meanImage)
    stdImage = stdImage/len(trainDataFrame)
    return stdImage.astype(np.float32)

def normalizedImageParamFromDataFrame(trainDataFrame): 
    meanImageData = getMeanImageFromImageDataFrame(trainDataFrame)
    stdImageData = getStandardDeviationFromImageDataFrame(trainDataFrame,meanImageData)
    return meanImageData,stdImageData

def normalizeSingleImage(currentImage,meanImageData,stdImageData):
    normalizedImage = (currentImage - meanImageData)/stdImageData
    return normalizedImage

def transformImageDataFrame(inputDataFrame,meanImageData,stdImageData):
    inputDataFrame["fullImageNormalized"] = [normalizeSingleImage(inputDataFrame["fullImage"][i],meanImageData,stdImageData) for i in range(0,len(inputDataFrame["fullImage"]))]
    return inputDataFrame

def normalizeImageUsingOpenCV(currentImage):
    norm_image = cv2.normalize(currentImage,currentImage, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    return norm_image

def getValuesfromDB(bandDB):
    currentband = np.array(bandDB).reshape(75,75)
    actualValue = 10**(currentband/10)
    return actualValue

def getImageFromBandDataFrame(dataFrame):
    dataFrame["valueBand1"] = [getValuesfromDB(dataFrame["band_1"][i]) for i in range(0,len(dataFrame["band_1"]))]
    dataFrame["valueBand2"] = [getValuesfromDB(dataFrame["band_2"][i]) for i in range(0,len(dataFrame["band_2"]))]
    dataFrame["fullImage"] = [makeRGBImageFromHnV(dataFrame["valueBand1"][i],dataFrame["valueBand2"][i]) for i in range(0,len(dataFrame["band_1"]))]
    dataFrame["fullImageNormalized"] = [normalizeImageUsingOpenCV(dataFrame["fullImage"][i]) for i in range(0,len(dataFrame["fullImage"]))]
    return dataFrame

def bootstrapAndEqualizeTheData(inputDataFrame):
    noOfIceBergData = len(inputDataFrame[inputDataFrame.is_iceberg == 1])
    totalData = len(inputDataFrame)
    print("Tornando os dados de iceberg e de barcos como iguais através de bootstrap")
    randomSamplesForIceBerg = inputDataFrame[inputDataFrame.is_iceberg == 1].sample((totalData - (2*noOfIceBergData)))
    inputDataFrame = pd.concat([inputDataFrame, randomSamplesForIceBerg], ignore_index=True)
    return inputDataFrame

def addFeatureDataFrame(dataFrame):
    dataFrame["hogFeature"] = [getHogDescriptor(dataFrame["fullImageNormalized"][i]) for i in range(0,len(dataFrame["fullImageNormalized"]))]
    return dataFrame

def getFeatureFromDataFrame(dataFrame,isTestData=0):
    featureDataVector = []
    responseVector = []
    featureDataVector =  np.array(featureDataVector).reshape(-1,64)
    for i in range(0,len(dataFrame)):
        currentFeature = dataFrame["hogFeature"][i].tolist()
        if(isTestData is 0):
            currentResponse = dataFrame["is_iceberg"][i].tolist()
        else:
            currentResponse = 2 
        currentFeature = np.array(currentFeature[0:64]).reshape(-1,64)
        currentResponse = int(currentResponse)
        if(i == 0):
            featureDataVector = currentFeature
            responseVector.append(currentResponse)
        else:
            featureDataVector = np.vstack((featureDataVector,currentFeature))
            responseVector.append(currentResponse)
    return featureDataVector,responseVector

print("Adicionando novas colunas ao Dataframe")
print("Normalizando o Dataframe")
trainDataFrame = getImageFromBandDataFrame(trainDataFrame)
trainDataFrame = bootstrapAndEqualizeTheData(trainDataFrame)
trainDataFrame = addFeatureDataFrame(trainDataFrame)

print("Calculando os vetores de atributos")
trainFeatureData , trainResponseData = getFeatureFromDataFrame(trainDataFrame)
print("Feito")

print(str(trainFeatureData.shape))
print("Inserindo no modelo")
clf = svm.SVC(gamma=0.001,C=100,kernel='rbf',probability=True)
clf.fit(trainFeatureData,trainResponseData)

print("Feito")

print("getting TestData features...")
print("Normalizando o Dataframe")
testDataFrame = getImageFromBandDataFrame(testDataFrame)
testDataFrame = addFeatureDataFrame(testDataFrame)
testFeatureData , _ = getFeatureFromDataFrame(testDataFrame,1)
print("Feito")

print(str(testFeatureData.shape))
test_predictions = clf.predict_proba(testFeatureData)
trainPredictions = clf.predict_proba(trainFeatureData)
print("Predição concluída")

print("A perda do algoritmo foi de: "+str(log_loss(trainResponseData,trainPredictions[:,1])))
print("A perda do algoritmo foi de: "+str(accuracy_score(trainResponseData,trainPredictions)))


pred_df = testDataFrame[['id']].copy()
pred_df['is_iceberg'] = test_predictions[:,1]
pred_df.to_csv('predictions.csv', index = False)

Here's the error message: ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

Is there any way or code that I can use to be able to display my model's acurracy?