2
votes

Here is a sample data frame of the data I am working with. It is basically a modified VCF file for those who are familiar with genetic data formats. If not, essentially each row contains information for a position in the genome where a variant may exist.

samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"), 
    Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L, 
    8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L, 
    8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L, 
    3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G", 
    "T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L, 
    1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C", 
    "G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997", 
    "AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029"
    ), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149", 
    "rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889", 
    "rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom", 
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame")

What I would like to do is extract values from the "Info" column. However, the information contained in this column is not the same for each row, and does not always occur in the same order. Thus, I want to use pattern matching to get values of interest to me.

I wrote a little function to extract the "allele frequencies" (AF) for various "super populations" (eg. AMR, AFR, EUR, SAS, EAS) which are contained in the Info column.

extractAF <- function(pop, vec) {
  info <- unlist((strsplit(vec, ";", fixed=TRUE)))
  AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2])
  return(AF)
}

This function requires two arguments: the 'pop' which is a string specifying which super population to extract, and the 'vec' which is designed to take the Info column of my dataframe.

The function works as expected when passing a single vector through:

extractAF("AFR_AF", samp[1,'Info'])
#[1] 0.5779

extractAF("AFR_AF", samp[5,'Info'])
#[1] 0.6528

However, I wish to do it to each row of the dataframe and create a new column containing the data. When I use the mutate function of dplyr, I wind up with a column of the same value:

library("dplyr")
mutate(samp, AFR_AF = extractAF("AFR_AF", Info))

I read a post (that I can't seem to find now otherwise I would reference it) that said mutate is passing all the rows at once, instead of row-by-row which I need.

So I tried a few other ways below based on this post :

apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x))

Error in apply(samp[, "Info"], 1, function(x) extractAF("AMR_AF", x)) : dim(X) must have a positive length

samp[, extractAF("AMR_AF", Info), by = .I]

Error in [.data.frame(samp, , extractAF("AMR_AF", Info), by = .I) : unused argument (by = .I)

samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)]

Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) : 
  unused argument (by = 1:nrow(samp))
#

UPDATE

Additional sample dataset that contains an NA and AF=0 in the INFO column below:

structure(list(CHROM = c("chr1", "chr1", "chr1", "chr1", "chr1", "chr1"), POS = c(16090898L, 16091074L, 16091583L, 16092212L, 16093560L, 16093639L), ID = c("rs6429774", "rs6429776", NA, "rs74528955", "rs904912", NA), REF = c("G", "A", "T", "C", "T", "C"), ALT = c("A", "G", "A", "T", "A", "T"), QUAL = c(NA, NA, NA, NA, NA, NA), FILTER = c(NA, NA, NA, NA, NA, NA), INFO = c("AC=1606;AF=0.320687;AN=5008;NS=2504;DP=21565;EAS_AF=0.1419;AMR_AF=0.2983;AFR_AF=0.525;EUR_AF=0.3509;SAS_AF=0.2137;AA=G|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|upstream_gene_variant|||||||96|1||||||;ERB=A||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=1690;AF=0.33746;AN=5008;NS=2504;DP=20247;EAS_AF=0.1498;AMR_AF=0.3012;AFR_AF=0.5681;EUR_AF=0.3549;SAS_AF=0.227;AA=G|||;CSQ=G|ENSG00000162458|ENST00000441801|Transcript|5_prime_UTR_variant|81|||||||1||||||;ERB=G||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", NA, "AC=8;AF=0.00159744;AN=5008;NS=2504;DP=19197;EAS_AF=0.0079;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;ERB=T||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=3282;AF=0.655351;AN=5008;NS=2504;DP=14721;EAS_AF=0.8343;AMR_AF=0.6916;AFR_AF=0.4259;EUR_AF=0.6531;SAS_AF=0.7577;AA=A|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483", "AC=5;AF=0.000998403;AN=5008;NS=2504;DP=14736;EAS_AF=0.003;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.002;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483" )), row.names = 14:19, class = "data.frame", .Names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))

1
Your apply function was close apply(samp, 1, function(x) extractAF("AFR_AF", x[5]))Pierre L
Please format your code properlyMatias Andina

1 Answers

3
votes

You may not need those formulas since sub is vectorized. First create a variable of all possible codes like (AFR, AMR, EUR, etc...). Use that vector to create a search pattern to go through the Info column and return a new data frame with all matches:

all_pop <- c("AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF", "EAS_AF")
pat <- paste0(".*\\b", all_pop, "=(\\d+(\\.\\d+)?)\\b.*")

out <- sapply(pat, sub, "\\1", samp$Info)
newdf <- setNames(as.data.frame(out), all_pop)
#      AMR_AF AFR_AF EUR_AF SAS_AF EAS_AF
#   1  0.8357 0.5779 0.7366 0.8466 0.9921
#   2  0.8444 0.6725 0.7366 0.8538 0.9921
#   3  0.8415 0.6558 0.7376 0.8466 0.9921
#   4  0.8386 0.6339 0.7376 0.8466 0.9921
#   5  0.8487 0.6528 0.7714 0.8599 0.9921
#   6  0.8458 0.6362 0.7714 0.8599 0.9911
#   7    <NA>   <NA>   <NA>   <NA>   <NA>
#   8    <NA>   <NA>   <NA>   <NA>   <NA>
#   9  0.7954 0.5651 0.7167   0.82 0.9653
#   10   <NA>   <NA>   <NA>   <NA>   <NA>
#   11 0.8458 0.6362 0.7724 0.8671 0.9921
#   12 0.8473 0.6528 0.7724 0.8671 0.9921
#   13 0.8473 0.6346 0.7724 0.8671 0.9921