1
votes

I'm generating a arff file with groovy from a xslx, but when i try to open this file in weka i got this error:

File "..." not recognised as an 'Arff data files' file. Reason: nominal value not declared in header, read Token[Ativo], line 16

i can't understand why i'm getting this error can someone helpme to fix this error, and explain why it's happening?

Generated file

@relation kd-itempedido
@attribute tipopedido {Assistencia,Recompra,Venda,Troca}
@attribute aprovado {0.0,1.0}
@attribute fasepedido {Aprovado,Cancelado,EmAprovacao,Liberado,Novo}
@attribute statusinternopedido {NegociarPagamento,PedidosDeTeste,AguardandoOcorrencia,Nada,AguardandoBoletoDeposito,PedidoDuplicado,SuspeitaDeFraude}
@attribute canal {Marketplace,Desktop}
@attribute origem {LojasAmericanas,Optimise,MercadoLivre,Cityads,Zanox,Zoom,Rakuten,Lomadee,Facebook,Viptarget,Submarino,Criteo,Muccashop,Chaordic,Walmart,Googlead,Nada,Extra,Lojaskd,Shopback,Afilio,Shoptime,Nextperformance,CarrinhoAbandonado,Bing}
@attribute mercado {S,N}
@attribute cluster {EntregaImediata,Fiprec,Icconv,Esgotado}
@attribute statusitem {Ativo}
@attribute statusproduto {Inativo,Ativo,AtivoSemEstoque,ForaDeLinha}
@attribute polo {Polo1,Polo3,Polo2}
@data
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Fiprec,Ativo,Ativo,Polo2
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Fiprec,Ativo,Ativo,Polo2
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Ativo,Inativo,Polo2
Venda,0.0,Novo,Nada,Desktop,Muccashop,N,Ativo,Ativo,Polo3

Groovy (VM -Dfile.encoding=ascii utf-8 utf8)

@Grapes([
        @Grab('org.apache.poi:poi:3.10.1'),
        @Grab('org.apache.poi:poi-ooxml:3.10.1')])
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import java.text.Normalizer
import static org.apache.poi.ss.usermodel.Cell.*
import java.nio.file.Paths

def path = "/home/eric/Documents/development/ufpr/Solid Eric/ItemPedido1000.xlsx"
def relation = "kd-itempedido"
def columns = ["tipopedido", "aprovado", "fasepedido", "statusinternopedido", "canal", "origem", "mercado", "cluster", "statusitem","statusproduto", "polo"]
def arff = "ItemPedido.arff"
new XslxToArffParser(path, relation, columns, arff);

class Data{
    def rows = new ArrayList<List>();

    @Override
    String toString() {
        def s = ""
        for (r in rows){
            for(d in r){

                s+=d
                if(r.indexOf(d) < (r.size()-1))
                    s+=","
            }
            s+="\n"
        }
        return s
    }
}



class Atributo {
    def descricao;
    def possibilidades = new HashSet<Object>();
    def index;

    @Override
    String toString() {

        def builder = new StringBuilder()
        builder.append("@attribute ").append(descricao)
        builder.append(" {")
        for(def i = 0; i<possibilidades.size(); i++){
            builder.append(possibilidades[i])
            if((i+1) != possibilidades.size())
                builder.append(",")
        }
        builder.append("}").append("\n")
        return builder.toString();
    }
}

class XslxToArffParser {
    def attributes =[:];
    def data = new Data();
    def sheet = null;

    XslxToArffParser(path, relation, columns, arffPath){
        load(path)
        getAttributes(columns)
        collectData()
        saveArff(relation, arffPath)
    }

    def String parse(String s){
        s = Normalizer.normalize(s, Normalizer.Form.NFD)
        s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}]", "")
        s = s.split(/[^\w]/).collect { it.toLowerCase().capitalize() }.join("")
        s = s.replaceAll(" ", "")
        s = s.replaceAll("[^A-Za-z0-9]", "")
        s = s.isEmpty() ? "Nada" : s
        return s
    }

    def load(path) {
        Paths.get(path).withInputStream { input ->
            def workbook = new XSSFWorkbook(input)
            sheet = workbook.getSheetAt(0)
        }
    }

    def getAttributes(columns){
        for (cell in sheet.getRow(0).cellIterator()) {
            def index = cell.columnIndex
            def description = parse(cell.stringCellValue).toLowerCase()
            if(columns.contains(description)){
                attributes << [(index):new Atributo(descricao: description, index: index)]
            }
        }
    }

    def collectData(){
        def headerFlag = true
        for (row in sheet.rowIterator()) {
            if (headerFlag) {
                headerFlag = false
                continue
            }
            def r = []
            for (cell in row.cellIterator()) {
                def index = cell.columnIndex;
                def value = cell.cellType == CELL_TYPE_STRING ? parse(cell.stringCellValue) : cell.numericCellValue

                def attr = attributes[index]
                if(attr != null){
                    attr.possibilidades.add(value)
                    r << value
                }
            }

            data.rows.add(r)
        }
    }

    def saveArff(relation, path){
        Paths.get(path).withWriter { writer ->

            writer.write "@relation " + relation
            writer.write "\n"
            for(a in attributes.values())
                writer.write a.toString()

            writer.write "@data"
            writer.write "\n"

            writer.write data.toString()
        }
    }
}

Solved. "row.cellIterator()" does not iterate over null/blank cells

1

1 Answers

0
votes

It has been a while since I used Weka, but looking at the file you showed and the error message, I suspect the problem is in the last two rows of the data file. They don't have a value for the attribute "cluster".

After the S or N (for attribute "mercado"), they have "Ativo". That "Ativo" value is not defined as one of the possible values of the nominal attribute cluster. The file did read "Ativo" though (which is why the error message says ''read Token[Ativo]'', but it expected to read a value for the cluster attribute, it did not yet expect a value for the statusitem attribute.