0
votes

How can we reduce a data frame by splitting delimited columns and selecting the minimum and maximum values from the respective columns?

From the data.frame df1, I want to split columns 2 to 4 by the delimiter ";". From column 2, select only the first element; from column 3, select the minimum value; and from column 4, select the maximum value.

df1 <-  
     Geneid             Chr             Start                       End
    CPA1       chr7;chr7;chr7;chr7      60837277;60842119;60844209 60858738;60860094;60861430
    GUCY2D    chr17;chr17;chr17;chr17   60864066;60865166;60867516 60871561;60873263;60874538
    UBC       chr12;chr12;chr12;chr12   61053840;61054888;61056916 61090048;61090639;61092555    


df2 <-  
     Geneid   Chr    Start       End
    CPA1      chr7     60837277   60861430
    GUCY2D    chr17    60864066   60874538
    UBC       chr12    61053840   61092555  
2
can you dput your data? - Colonel Beauvel

2 Answers

2
votes

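Use `sub()` to keep only the first or the last ";"-separated field (this assumes the values in each cell are already sorted in ascending order, as they are in the example).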

# Sample input: each of Chr, Start and End holds several ";"-separated values.
x <- "     Geneid             Chr             Start                       End
     CPA1       chr7;chr7;chr7;chr7      60837277;60842119;60844209 60858738;60860094;60861430
     GUCY2D    chr17;chr17;chr17;chr17   60864066;60865166;60867516 60871561;60873263;60874538
    UBC       chr12;chr12;chr12;chr12   61053840;61054888;61056916 61090048;61090639;61092555    
 "
# Read the delimited cells as plain character strings (not factors) so the
# regular-expression calls below behave the same on every R version.
df1 <- read.table(text = x, header = TRUE, stringsAsFactors = FALSE)

# Keep one value per cell:
#   Chr   - first field (drop everything from the first ";" onwards)
#   Start - first field, converted to numeric
#   End   - last field (drop everything up to the last ";"), converted to numeric
# NOTE: taking the first/last field only equals the minimum/maximum when the
# values inside each cell are sorted in ascending order, as in this sample.
df2 <- data.frame(
  Geneid = df1$Geneid,
  Chr = sub(";.*", "", df1$Chr),
  Start = as.numeric(sub(";.*", "", df1$Start)),
  End = as.numeric(sub(".*;", "", df1$End))
)
df2
#    Geneid   Chr    Start      End
# 1   CPA1  chr7 60837277 60861430
# 2 GUCY2D chr17 60864066 60874538
# 3    UBC chr12 61053840 61092555

or

# Split every cell of `column` on ";", convert the pieces to numeric and
# collapse them to a single number with `reducer` (e.g. min or max).
# vapply() guarantees a numeric vector even when `column` has length zero,
# where sapply() would return an empty list.
reduce_field <- function(column, reducer) {
  pieces <- strsplit(as.character(column), ";", fixed = TRUE)
  vapply(pieces, function(x) reducer(as.numeric(x)), numeric(1))
}

# Alternative that does not depend on the values being sorted: take the true
# minimum of Start and the true maximum of End for every row.
data.frame(
  Geneid = df1$Geneid,
  Chr = sub(";.*", "", df1$Chr),
  Start = reduce_field(df1$Start, min),
  End = reduce_field(df1$End, max)
)
1
votes
library(dplyr)
library(tidyr)
library(stringi)

# Tidyverse version: reshape to one row per (Geneid, column, value), then
# summarise each gene back to a single row.
df1 %>%
  # Long format: one row per gene and original column (Chr / Start / End).
  pivot_longer(-Geneid, names_to = "variable", values_to = "value") %>%
  # Split each ";"-delimited cell into a list of pieces and expand to rows.
  mutate(value = stri_split_fixed(value, ";")) %>%
  unnest(value) %>%
  # Number the pieces within each gene/column so they can be lined up again.
  group_by(Geneid, variable) %>%
  mutate(group_id = row_number()) %>%
  ungroup() %>%
  # Back to wide format; columns with fewer pieces are padded with NA, which
  # is why the aggregates below use na.rm = TRUE.
  pivot_wider(names_from = variable, values_from = value) %>%
  group_by(Geneid) %>%
  # Columns are listed in the order requested: Geneid, Chr, Start, End.
  summarize(
    Chr = first(Chr),
    Start = min(as.numeric(Start), na.rm = TRUE),
    End = max(as.numeric(End), na.rm = TRUE)
  )