Here is a sample data frame of the data I am working with. It is basically a modified VCF file for those who are familiar with genetic data formats. If not, essentially each row contains information for a position in the genome where a variant may exist.
samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"),
Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L,
8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L,
8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L,
3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G",
"T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L,
1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C",
"G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997",
"AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029"
), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149",
"rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889",
"rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom",
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame")
What I would like to do is extract values from the "Info" column. However, the information contained in this column is not the same for each row, and does not always occur in the same order. Thus, I want to use pattern matching to get values of interest to me.
I wrote a little function to extract the "allele frequencies" (AF) for various "super populations" (eg. AMR, AFR, EUR, SAS, EAS) which are contained in the Info column.
extractAF <- function(pop, vec) {
info <- unlist((strsplit(vec, ";", fixed=TRUE)))
AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2])
return(AF)
}
This function requires two arguments: the 'pop' which is a string specifying which super population to extract, and the 'vec' which is designed to take the Info column of my dataframe.
The function works as expected when passing a single vector through:
extractAF("AFR_AF", samp[1,'Info'])
#[1] 0.5779
extractAF("AFR_AF", samp[5,'Info'])
#[1] 0.6528
However, I wish to do it to each row of the dataframe and create a new column containing the data. When I use the mutate function of dplyr, I wind up with a column of the same value:
library("dplyr")
mutate(samp, AFR_AF = extractAF("AFR_AF", Info))
I read a post (that I can't seem to find now otherwise I would reference it) that said mutate is passing all the rows at once, instead of row-by-row which I need.
So I tried a few other ways below based on this post :
apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x))
Error in apply(samp[, "Info"], 1, function(x) extractAF("AMR_AF", x)) : dim(X) must have a positive length
samp[, extractAF("AMR_AF", Info), by = .I]
Error in [.data.frame
(samp, , extractAF("AMR_AF", Info), by = .I) :
unused argument (by = .I)
samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)]
Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) :
unused argument (by = 1:nrow(samp))
#
UPDATE
Additional sample dataset that contains an NA and AF=0 in the INFO column below:
structure(list(CHROM = c("chr1", "chr1", "chr1", "chr1", "chr1", "chr1"), POS = c(16090898L, 16091074L, 16091583L, 16092212L, 16093560L, 16093639L), ID = c("rs6429774", "rs6429776", NA, "rs74528955", "rs904912", NA), REF = c("G", "A", "T", "C", "T", "C"), ALT = c("A", "G", "A", "T", "A", "T"), QUAL = c(NA, NA, NA, NA, NA, NA), FILTER = c(NA, NA, NA, NA, NA, NA), INFO = c("AC=1606;AF=0.320687;AN=5008;NS=2504;DP=21565;EAS_AF=0.1419;AMR_AF=0.2983;AFR_AF=0.525;EUR_AF=0.3509;SAS_AF=0.2137;AA=G|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|upstream_gene_variant|||||||96|1||||||;ERB=A||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=1690;AF=0.33746;AN=5008;NS=2504;DP=20247;EAS_AF=0.1498;AMR_AF=0.3012;AFR_AF=0.5681;EUR_AF=0.3549;SAS_AF=0.227;AA=G|||;CSQ=G|ENSG00000162458|ENST00000441801|Transcript|5_prime_UTR_variant|81|||||||1||||||;ERB=G||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", NA, "AC=8;AF=0.00159744;AN=5008;NS=2504;DP=19197;EAS_AF=0.0079;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;ERB=T||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=3282;AF=0.655351;AN=5008;NS=2504;DP=14721;EAS_AF=0.8343;AMR_AF=0.6916;AFR_AF=0.4259;EUR_AF=0.6531;SAS_AF=0.7577;AA=A|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483", "AC=5;AF=0.000998403;AN=5008;NS=2504;DP=14736;EAS_AF=0.003;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.002;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483" )), row.names = 14:19, class = "data.frame", .Names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))
apply(samp, 1, function(x) extractAF("AFR_AF", x[5]))
– Pierre L