Generalized estimation equations

We display how to use geepack, a R package to fit the null model for longitudinal traits. In this online tutorial, our main focus is to demonstrate how to obtain model residuals from the fitted null model.

Example data sets

PhenoFile = system.file("extdata", "simuLongPHENO.txt", package = "GRAB")
print(PhenoFile)
# "../GRAB/extdata/simuLongPHENO.txt"

LongPheno = data.table::fread(PhenoFile)
print(LongPheno)
#           IID      AGE GENDER LongPheno
#     1: Subj-1 49.76668      0  21.18404
#     2: Subj-1 46.97617      0  22.63922
#     3: Subj-1 48.60388      0  22.99897
#     4: Subj-1 48.47036      0  22.88830
#     5: Subj-1 50.51208      0  23.17735
#    ---                                 
# 10511:   f9_9 50.34522      1  19.04696
# 10512:   f9_9 50.44379      1  24.20562
# 10513:   f9_9 51.52763      1  24.86237
# 10514:   f9_9 49.90129      1  23.18408
# 10515:   f9_9 50.21092      1  21.56067

Exchangeable working correlation structure

library(dplyr)
library(tidyr)
library(geepack)

# we must keep the order of subjects in LongPheno and frq the same!
LongPheno = LongPheno %>% arrange(IID) # We highly recommend sorting by IID first.

frq = LongPheno %>% group_by(IID) %>% summarize(n = n())
LongPheno = LongPheno %>% mutate(pseudoIID = rep(1:nrow(frq), times = frq$n))

nullmodel_exc = geeglm(LongPheno ~ AGE + GENDER, data = LongPheno, id = pseudoIID, corstr = "exc") # for exchangeable working correlation structure

summary(nullmodel_exc)$coefficients
#             Estimate Std.err    Wald  Pr(>|W|)
# (Intercept)  37.1621 1.13814 1066.12 0.000e+00
# AGE          -0.3261 0.02277  205.03 0.000e+00
# GENDER        0.7055 0.09318   57.31 3.719e-14

Calculate working correlation matrix

working.matrix_exc = matrix(nullmodel_exc$geese$alpha, nrow = max(frq$n), ncol = max(frq$n)); diag(working.matrix_exc) = 1 # for exchangeable working correlation structure

print(working.matrix_exc[1:4,1:4])
#        [,1]   [,2]   [,3]   [,4]
# [1,] 1.0000 0.2009 0.2009 0.2009
# [2,] 0.2009 1.0000 0.2009 0.2009
# [3,] 0.2009 0.2009 1.0000 0.2009
# [4,] 0.2009 0.2009 0.2009 1.0000

Obtain model residuals

modelreisd_exc = c()
pos = 0
for(i in 1:nrow(frq))
{
  modelreisd_exc = c(modelreisd_exc, solve(working.matrix_exc[1:frq$n[i], 1:frq$n[i]]) %*% nullmodel_exc$residuals[1:frq$n[i] + pos])
  pos = pos + frq$n[i]
}

ResidMat.exc = LongPheno %>%
  mutate(Resid = modelreisd_exc) %>%
  group_by(IID) %>% 
  summarize(Resid = sum(Resid)) %>%
  rename(SubjID = IID)

summary(ResidMat.exc$Resid)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# -15.606  -3.199  -0.093  -0.010   3.008  16.631

ResidMat.exc = ResidMat.exc %>% mutate(Resid = scale(Resid)) # if sum(Resid) != 0, we should rescale it! This often happens in real data analysis.

ResidMatFile = system.file("extdata", "ResidMat.txt", package = "GRAB")
data.table::fwrite(ResidMat.exc, file = ResidMatFile, row.names = FALSE, col.names = TRUE)

Autoregressive working correlation structure

library(dplyr)
library(tidyr)
library(geepack)

# we must keep the order of subjects in LongPheno and frq the same!
LongPheno = LongPheno %>% arrange(IID) # We highly recommend sorting by IID first.

frq = LongPheno %>% group_by(IID) %>% summarize(n = n())
LongPheno = LongPheno %>% mutate(pseudoIID = rep(1:nrow(frq), times = frq$n))

nullmodel_ar1 = geeglm(LongPheno ~ AGE + GENDER, data = LongPheno, id = pseudoIID, corstr = "ar1") # for autoregressive working correlation structure

summary(nullmodel_ar1)$coefficients
#             Estimate Std.err   Wald  Pr(>|W|)
# (Intercept)  36.2439  1.1755 950.71 0.000e+00
# AGE          -0.3079  0.0235 171.64 0.000e+00
# GENDER        0.7143  0.0883  65.44 5.551e-16

Calculate working correlation matrix

working.matrix_ar1 = toeplitz(1 * nullmodel_ar1$geese$alpha^(0:(max(frq$n)-1))) # for autoregressive working correlation structure
print(working.matrix_ar1[1:4,1:4])
#        [,1]   [,2]   [,3]   [,4]
# [1,] 1.0000 0.4982 0.2482 0.1236
# [2,] 0.4982 1.0000 0.4982 0.2482
# [3,] 0.2482 0.4982 1.0000 0.4982
# [4,] 0.1236 0.2482 0.4982 1.0000

Obtain model residuals

modelreisd_ar1 = c()
pos = 0
for(i in 1:nrow(frq))
{
  modelreisd_ar1 = c(modelreisd_ar1, solve(working.matrix_ar1[1:frq$n[i], 1:frq$n[i]]) %*% nullmodel_ar1$residuals[1:frq$n[i] + pos])
  pos = pos + frq$n[i]
}

ResidMat.ar1 = LongPheno %>%
  mutate(Resid = modelreisd_ar1) %>%
  group_by(IID) %>% 
  summarize(Resid = sum(Resid)) %>%
  rename(SubjID = IID)

summary(ResidMat.ar1$Resid)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# -18.786  -3.637  -0.085  -0.031   3.543  18.491 

ResidMat.ar1 = ResidMat.ar1 %>% mutate(Resid = scale(Resid)) # if sum(Resid) != 0, we should rescale it! This often happens in real data analysis.

ResidMatFile = system.file("extdata", "ResidMat.txt", package = "GRAB")
data.table::fwrite(ResidMat.ar1, file = ResidMatFile, row.names = FALSE, col.names = TRUE)

Note

  • If the sum of model residuals is not zero, please rescale them.
  • The column names of ResidMat must be exactly SubjID in the first column and Resid in the second column.
  • Each subject should match its corresponding residual.