0
votes

I need to convert with str_split a big data_frame. The idea is to check if the value in 2 columns is != 0 and is so, split them on "_" in new columns. I want to keep the new values in 3 new different columns. I was thinking of a for loop, but it will take forever and I wanted to use apply(), but is not working.

the data.frame is:

a <- structure(list(seqnames1 = c("chr1", "chr1", "chr1", "chr1",  "chr1",
"chr1"), start1 = c(4207675L, 4207675L, 4207675L, 4207675L,  4207675L,
4207675L), end1 = c(4207887L, 4207887L, 4207887L, 4207887L,  4207887L,
4207887L), width1 = c(213L, 213L, 213L, 213L, 213L,  213L), strand1 =
c("*", "*", "*", "*", "*", "*"), node.class1 = c("bait",  "bait",
"bait", "bait", "bait", "bait"), promoter.id1 = c(0L,  0L, 0L, 0L, 0L,
0L), promoter_flanking_region.id1 = c(0L, 0L,  0L, 0L, 0L, 0L),
exon.id1 = c(0L, 0L, 0L, 0L, 0L, 0L), intron.id1 = c(0L,  0L, 0L, 0L,
0L, 0L), enhancer.id1 = c(0L, 0L, 0L, 0L, 0L, 0L), 
    fli1.id1 = c(0L, 0L, 0L, 0L, 0L, 0L), gata1.id1 = c(0L, 0L, 
    0L, 0L, 0L, 0L), gata2.id1 = c(0L, 0L, 0L, 0L, 0L, 0L), tal1.id1 = c(0L, 
    0L, 0L, 0L, 0L, 0L), ctcf.id1 = c(0L, 0L, 0L, 0L, 0L, 0L), 
    bait.id1 = c("chr1:4267838-4267939", "chr1:4267838-4267939", 
    "chr1:4267838-4267939", "chr1:4267838-4267939", "chr1:4267838-4267939", 
    "chr1:4267838-4267939"), gene_name.id1 = c("0", "0", "0", 
    "0", "0", "0"), RNA_expression.id1 = c("0", "0", "0", "0", 
    "0", "0"), seqnames2 = c("chr1", "chr1", "chr1", "chr1", 
    "chr1", "chr1"), start2 = c(1886952L, 2562429L, 2908853L, 
    3596298L, 4008510L, 4025732L), end2 = c(1887558L, 2562819L, 
    2909055L, 3597281L, 4008863L, 4026507L), width2 = c(607L, 
    391L, 203L, 984L, 354L, 776L), strand2 = c("*", "*", "*", 
    "*", "*", "*"), node.class2 = c("intron", "exon", "intergenic_region", 
    "intron", "promoter_flanking_region", "promoter_flanking_region"
    ), promoter.id2 = c(0L, 0L, 0L, 0L, 0L, 0L), promoter_flanking_region.id2 = c(0L, 
    0L, 0L, 0L, 1L, 1L), exon.id2 = c(0L, 1L, 0L, 0L, 0L, 0L), 
    intron.id2 = c(1L, 0L, 0L, 1L, 1L, 0L), enhancer.id2 = c(0L, 
    1L, 0L, 0L, 0L, 0L), fli1.id2 = c(0L, 0L, 0L, 0L, 0L, 0L), 
    gata1.id2 = c(0L, 0L, 0L, 0L, 0L, 0L), gata2.id2 = c(0L, 
    0L, 0L, 0L, 0L, 0L), tal1.id2 = c(0L, 0L, 0L, 0L, 0L, 0L), 
    ctcf.id2 = c(0L, 1L, 0L, 0L, 0L, 0L), bait.id2 = c("0", "0", 
    "0", "0", "0", "0"), gene_name.id2 = c("GNB1_21665", "TNFRSF14_25838", 
    "0", "MEGF6_34434", "AL805961.1_25459", "0"), RNA_expression.id2 = c("0", 
    "0", "0", "0", "0", "0"), counts = c(0L, 1L, 1L, 3L, 3L, 
    3L), CHiCAGO_Score = c(0.57, 1.39, 1.78, 3.26, 3.52, 3.48
    ), distance_bait_prey = c(2320526, 1645157, 1298827, 610991, 
    199094, 181661), RNA_expression_gene_symbol_id1 = c(0, 0, 
    0, 0, 0, 0), RNA_expression_logFPKM_id1 = c(0, 0, 0, 0, 0, 
    0), RNA_expression_stratification_id1 = c(0, 0, 0, 0, 0, 
    0), RNA_expression_gene_symbol_id2 = c(0, 0, 0, 0, 0, 0), 
    RNA_expression_logFPKM_id2 = c(0, 0, 0, 0, 0, 0), RNA_expression_stratification_id2 = c(0, 
    0, 0, 0, 0, 0)), row.names = c(NA, -6L), class = c("data.table",  "data.frame"))

My (nonreproducible) code to import the data.frame and start the new columns at zero

a <- data.table::fread(input='file',  sep = '\t', header = TRUE)
a$RNA_expression_gene_symbol_id1 <- "0"
a$RNA_expression_logFPKM_id1 <- "0"
a$RNA_expression_stratification_id1 <- "0"
a$RNA_expression_gene_symbol_id2 <- "0"
a$RNA_expression_logFPKM_id2 <- "0"
a$RNA_expression_stratification_id2 <- "0"

The for loop i had in mind

for ( i in seq(1, length(a$gene_name.id1))){
        if (a$RNA_expression.id1[i] != 0){
                b <- str_split(a$RNA_expression.id1[i], "_", n=3)
                a$RNA_expression_gene_symbol_id1[i] <- b[[1]][1]
                a$RNA_expression_logFPKM_id1[i] <- b[[1]][2]
                a$RNA_expression_stratification_id1[i] <- b[[1]][3]
                }
        if (a$RNA_expression.id2[i] != 0){
                b <- str_split(a$RNA_expression.id1[i], "_", n=3)
                a$RNA_expression_gene_symbol_id1[i] <- b[[1]][1]
                a$RNA_expression_logFPKM_id1[i] <- b[[1]][2]
               a$RNA_expression_stratification_id1[i] <- b[[1]][3]
                }
}

I tried to create a function to use apply() as follow:

my_function <- function(a){
        if (a[19] != 0){
                b <- str_split(a[19], "_", n=3)
                a[43] <- b[[1]][1]
                a[44] <- b[[1]][2]
                a[45] <- b[[1]][3]
                }
        if (a[19] != 0){
               b <- str_split(a[38], "_", n=3)
                a[46] <- b[[1]][1]
                a[47] <- b[[1]][2]
                a[48] <- b[[1]][3]
                }
}`

apply(a, 1, my_function)

I am getting a list of NULL values and only the last of the three values I am splitting Where am I wrong? am I completely off with the structure of the script?

1
It would help if you could provide a sample of your data with dput(head(a)), and what you expect the output to look like (even if manually typed). - r2evans
How can I correctly format the data.frame? - Lu_Ste
(1) 47 columns is a bit much, can you reduce your sample data to be only a few relevant columns? (2) What is your intended output? (BTW: minor nit, but when you post the output from dput of a data.table, please remove the .internal.selfref, it cannot be used outside of your immediate R session and does not harm things when missing here.) - r2evans

1 Answers

1
votes

The main problem is that apply expects a matrix as input, as you are providing a data.frame the first thing that apply does is coercing your data.frame, hence changing all columns to the same type and messing everything. Another problems is that apply returns an array or list of values which is not the output you expect (a data.frame/data.table).

You could achieve the same result with separate from tidyr. For example, the id1 case:

  separate(a,
           RNA_expression.id1, 
           into = c("RNA_expression_gene_symbol_id1", 
                    "RNA_expression_logFPKM_id1",
                    "RNA_expression_stratification_id1"), 
           sep = "_",
           fill = "left")