1
votes

Each of my data's columns will be rescaled and put into bins from 0 to 100. The bin columns will be used as features for a model. In order to test each bin separately, I'd like to split each bin column into separate columns for each of it's values. The new column will hold either a 0 or 1, dependent upon whether the value in the cell matched the column's bin. From something like this:

row values
  1     10
  2     20
  3     30
  4     40
  5     10
  6     30
  7     40

to this:

row values_10 values_20 values_30 values_40
  1         1         0         0         0
  2         0         1         0         0
  3         0         0         1         0
  4         0         0         0         1
  5         1         0         0         0
  6         0         0         1         0
  7         0         0         0         1

This brute force approach does the job, but there must be a better (non-loop) way:

values <- c( 10,20,30,40,10,30,40)
dat <- data.frame(values)

columnNames <- unique(dat$values)

for( n in 1:length(columnNames) )
{
    dat[as.character(columnNames[n])]  <- 0
}

columnNames2 <- colnames(dat)

for( c in 2:ncol(dat))
{
    hdr <- columnNames2[c]

    for( r in 1:nrow(dat))
    {
        if( dat$values[r]==as.integer(hdr) )
            dat[r,c]=1
    }
}

Many thanks!!

EDIT

These are all great answers, thank you everyone. The final object, whether a matrix, table, or data.table, will contain only the separate bin columns (no source columns). How can the solutions below be used for 2000+ source columns?

EDIT2

Based on the answers to my follow-up question, below are implementations for each of the methods for anyone coming to this question in the future.

# read in some data with multiple columns

df_in  <- read.table(text="row val1 val2
                  1     10     100
                  2     20     200
                  3     30     300
                  4     40     400
                  5     10     100
                  6     30     300
                  7     40     400", header=TRUE, stringsAsFactors=FALSE)

#   @Zelazny7 's method using a matrix

df_in$row <- NULL

col_names <- names(df_in)

for( c in 1:length(col_names)){

    uniq <- unlist(unique(df_in[col_names[c]]))

    m <- matrix(0, nrow(df_in), length(uniq), 
                dimnames = list(NULL, paste0(col_names[c], "_", uniq)))

    for (i in seq_along(df_in[[col_names[c]]])) {
        k <- match(df_in[[col_names[c]]][i], uniq, 0)
        m[i,k] <- 1
    }

    if( c==1 )
        df_out <- m
    else
        df_out <- cbind(df_out,m)
}


#   @P Lapointe 's method using 'table'

col_names <- names(df_in)

for( c in 2:length(col_names)){

    m <- table(df_in$row,df_in[[col_names[c]]])    
    uniq <- unlist(unique(df_in[col_names[c]]))
    newNames <- toString(paste0(col_names[c],'_',uniq))

    if( c==2 ){
        df_out <- m
        hdrs <- newNames
    }
    else{
        df_out <- cbind(df_out,m)
        hdrs <- paste(hdrs,newNames,sep=", ")
    }
}

colnames(df_out) <- unlist(strsplit(hdrs, split=", "))


#   @bdemarest 's method using 'data.table'
#   read in data first

library(data.table)

df_in = fread("row val1 val2
            1     10     100
            2     20     200
            3     30     300
            4     40     400
            5     10     100
            6     30     300
            7     40     400")

df_in$count = 1L

col_names <- names(df_in)

for( c in 2:length(col_names)-1){

    m = dcast(df_in, paste( 'row', '~', col_names[c]), value.var="count", fill=0L)

    uniq <- unlist(unique(df_in[,get(col_names[c])]))
    newNames <- toString(paste0(col_names[c],'_',uniq))

    m$row <- NULL

    if( c==2 ){
        df_out <- m
        hdrs <- newNames
    }
    else if( c>2 ){
        df_out <- cbind(df_out,m)
        hdrs <- paste(hdrs,newNames,sep=", ")
    }
}

colnames(df_out) <- unlist(strsplit(hdrs, split=", "))

All answers were appropriate and usable so the best answer was awarded to the quickest initial response. Thanks again for your help!!

3
look at ?dcast - mtoto
Instead of creating separate dummy columns, you can use cut to group a single column of data into factor categories (in this case it looks like you'd want an ordered factor). R's model functions generally create the dummy columns automatically if you include a factor column in your model. If not, you can use model.matrix to create the dummy columns. But are you sure it makes sense to group the data? Doesn't that just throw away information? - eipi10
This gets you the basic structure of the matrix you are looking for: df$I = 1;t(tidyr::spread(df, key = row, value = I)) - Vlo
@eipi10 Thank you for the suggestions. The actual bins for the rescaled data are 0,1,2,3...100, which is an adequate resolution for this application. I chose 10,20,etc in the example to visually distinguish the 'value' column from the 'row' column. - Val Dalin

3 Answers

2
votes

I do this quite often. This is the method I use to create dummies. It is very fast.

## reading in your example data
df <- read.table(file = "clipboard", header=TRUE)
df$row <- NULL

uniq <- unique(df$values)
m <- matrix(0, nrow(df), length(uniq), dimnames = list(NULL, paste0("column_", uniq)))

for (i in seq_along(df$values)) {
  k <- match(df$values[i], uniq, 0)
  m[i,k] <- 1
}

And the results:

> m
     column_10 column_20 column_30 column_40
[1,]         1         0         0         0
[2,]         0         1         0         0
[3,]         0         0         1         0
[4,]         0         0         0         1
[5,]         1         0         0         0
[6,]         0         0         1         0
[7,]         0         0         0         1

Another variant that avoids the loop by indexing the matrix with a matrix:

m[cbind(seq.int(nrow(m)), match(df$values, uniq))] <- 1
1
votes

Here is a data.table solution. I first add a count column, then reshape to wide form with dcast(). By the way, this is fast enough to use on data with 10 million or more rows.

library(data.table)

tab = fread("row values
  1     10
  2     20
  3     30
  4     40
  5     10
  6     30
  7     40")

tab$count = 1L

res = dcast(tab, row ~ values, value.var="count", fill=0L)
res
#    row 10 20 30 40
# 1:   1  1  0  0  0
# 2:   2  0  1  0  0
# 3:   3  0  0  1  0
# 4:   4  0  0  0  1
# 5:   5  1  0  0  0
# 6:   6  0  0  1  0
# 7:   7  0  0  0  1
1
votes

Use table:

df1  <- read.table(text="row values
  1     10
  2     20
  3     30
  4     40
  5     10
  6     30
  7     40", header=TRUE, stringsAsFactors=FALSE)

  table(df1)

    values
row 10 20 30 40
  1  1  0  0  0
  2  0  1  0  0
  3  0  0  1  0
  4  0  0  0  1
  5  1  0  0  0
  6  0  0  1  0
  7  0  0  0  1

You can index the table like this:

table(df1)[5,1]
[1] 1

EDIT To answer your additional request, you can make new column names like so:

tbl <-table(df1)
out<-as.data.frame.matrix(tbl) #to transform into a data.frame
colnames(out) <-make.names(colnames(out)) #to make new column names
out
  X10 X20 X30 X40
1   1   0   0   0
2   0   1   0   0
3   0   0   1   0
4   0   0   0   1
5   1   0   0   0
6   0   0   1   0
7   0   0   0   1