I have two gene-expression time-course data sets:
First, gene expression was measured over 14 time points from 4 groups:
df1 <- structure(list(val = c(-0.1, -0.13, -0.4, -0.3, -0.3, -0.2, -0.24,
0.1, 0.2, 0.13, 0, 0.63, 0.83, 0.85, -0.07, -0.07, -0.27, -0.2,
-0.2, -0.1, 0.2, 0.1, 0.07, 0.17, 0.6, 0.75, 1.1, 1.1, -0.13,
-0.15, -0.26, -0.25, -0.14, 0.04, 0.2, 0.24, 0.23, 0.2, 0.1,
0.73, 1, 1.3, 0, 0.06, -0.24, -0.17, -0.17, -0.04, 0.16, 0.1,
0.14, 0.27, 0.34, 0.9, 0.97, 1.04),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,4L),
.Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("val","time", "group"),
row.names = c(NA, -56L), class = "data.frame")
df1$group <- factor(df1$group,levels=c("a","b","c","d"))
which looks like this (adding a loess
smoothed trend line):
library(ggplot2)
ggplot(df1,aes(x=time,y=val,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
Second, gene expression was measured over similar 14 time points but now from 2 different groups, each represented by the two sexes:
df2 <- structure(list(val = c(-0.23, -0.01, -0.14, -0.01, -0.21, -0.16,
-0.24, -0.11, 0.02, -0.11, -0.01, -0.25, -0.47, -1.25, 0.02,
-0.3, -0.02, 0.14, 0.25, -0.05, 0.15, 0.11, -0.24, -0.18, -0.39,
-0.49, -0.5, -0.65, -0.06, 0.09, 0.1, 0.15, 0.08, 0.15, 0.4,
0.24, 0.07, 0.08, -0.18, -0.35, -0.19, -0.81, -0.16, 0.29, -0.05,
0.14, 0.14, 0.48, 0.34, 0.11, -0.07, -0.13, -0.41, -0.22, -0.54,
-0.76, 0.35, 0.34, -0.06, 0.21, 0.14, 0.14, 0.25, 0.22, 0.25,
0.16, 0.3, 0.44, 0.08, 0.48, 0.1, 0.16, -0.03, -0.22, 0.2, 0.01,
-0.09, -0.02, -0.01, 0.06, -0.13, 0.19, 0.11, -0.04, -0.39, 0.03,
-0.01, 0.09, 0.1, -0.14, -0.12, -0.1, 0.36, 0.08, 0.09, 0.09,
0.42, 0.37, -0.14, 0.12, 0.09, 0.03, 0.06, -0.25, 0.2, -0.06,
-0.44, 0.23, 0.03, 0.16, 0.81, 0.83),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0,1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58,5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17,4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39),
sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("a", "b"), class = "factor")), .Names = c("val", "time", "sex", "group"), row.names = c(NA, -112L), class = "data.frame")
df2$sex <- ordered(df2$sex,levels=c("M","F"))
df2$group <- ordered(df2$group,levels=c("a","b"))
df2$col <- factor(paste0(df2$group,":",df2$sex))
which looks like this (adding a loess smoothed trend line):
ggplot(df2,aes(x=time,y=val,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
For df1
, I would like to estimate the effect of time
on val
, adjusting for group
.
For df2
, I would like to estimate the effect of time:group
on val
, adjusting for sex
.
Looking at the data I thought using spline
regression
s would be appropriate so I used the gam
function from the mgcv
package, which as far as I understand optimizes the degrees of freedom of the spline
s fitted to the data.
This is what I fitted for df1
:
mgcv1.fit <- mgcv::gam(val ~ group+s(time),data=df1)
Which gives:
Family: gaussian
Link function: identity
Formula:
val ~ group + s(time)
Estimated degrees of freedom:
7.18 total = 11.18
GCV score: 0.01258176
But 7.18 degrees of freedom seems too much for these data.
For df2
:
mgcv2.fit <- mgcv::gam(val ~ sex+s(time,by=group),data=df2)
which gives:
Family: gaussian
Link function: identity
Formula:
val ~ sex + s(time, by = group)
Estimated degrees of freedom:
1.72 total = 3.72
GCV score: 0.08522094
I guess that in this case I'd imagine the degrees of freedom to be slightly higher.
One more point. Plotting the fitted values for these two data sets:
df1$mgcv <- mgcv1.fit$fitted.values
ggplot(df1,aes(x=time,y=mgcv,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
which looks fine.
But for df2
df2$mgcv <- mgcv2.fit$fitted.values
ggplot(df2,aes(x=time,y=mgcv,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
Looks like it flipped the group labels.
So my questions are:
- Am I using
mgcv::gam
correctly for optimizing the spline degrees of freedom for my questions? - Does
mgcv
reorders the samples in itsfitted.values
?