# Load packages
library(ggplot2)
library(ggResidpanel)
library(gtsummary)
library(patchwork)Introduction to Medical Statistics 2026
Exercise 7 – Multiple linear regression
Data Analysis and Model Diagnostics
Exercise 7 – Multiple linear regression
Data Analysis and Model Diagnostics
Exercise i (Multivariable linear regression – Peru lung data set)
The dataset perulung contains data from a study of lung function among 636 children aged 7 to 10 years living in a deprived suburb of Lima, Peru. The outcome of interest is the maximum volume of air a child could breathe out in one second measured using a spirometer (forced expiratory volume, litres/second) and we aim to predict it based on other covariables. We will use the packages ggplot2, patchwork, gtsummary and ggResidpanel. Load these packages.
- Plot the outcome (fev1) against the child’s age and height. Do the associations look linear?
# Import the dataset
perulung <- read.csv("https://raw.githubusercontent.com/oucru-biostats/IntroductionToBiostatistics2024/main/data/perulung.csv", stringsAsFactors=TRUE)
# Plot the outcome (fev1) against the child's age and height
p1 <- ggplot(perulung, aes(age,fev1)) + geom_point()
p2 <- ggplot(perulung, aes(height,fev1)) + geom_point()
p1 + p2- Perform 2 separate linear regression models of fev1 against age and height.
# 2 simple linear regression models of fev1 against age or height, respectively
fit1 <- lm(fev1 ~ age, data=perulung)
summary(fit1)
Call:
lm(formula = fev1 ~ age, data = perulung)
Residuals:
Min 1Q Median 3Q Max
-0.76784 -0.16193 -0.00207 0.16417 0.90300
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.3679 0.1298 -2.835 0.00472 **
age 0.2185 0.0144 15.174 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.261 on 634 degrees of freedom
Multiple R-squared: 0.2664, Adjusted R-squared: 0.2652
F-statistic: 230.2 on 1 and 634 DF, p-value: < 2.2e-16
fit2 <- lm(fev1 ~ height, data=perulung)
summary(fit2)
Call:
lm(formula = fev1 ~ height, data = perulung)
Residuals:
Min 1Q Median 3Q Max
-1.0881 -0.1329 0.0173 0.1452 0.8503
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.265817 0.185474 -12.22 <2e-16 ***
height 0.031120 0.001493 20.84 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2347 on 634 degrees of freedom
Multiple R-squared: 0.4065, Adjusted R-squared: 0.4056
F-statistic: 434.3 on 1 and 634 DF, p-value: < 2.2e-16
- Perform a multiple linear regression analysis of fev1 depending on both age and height. Compare the regression coefficients of the multiple regression with the simple regression models. Why is the coefficient for age smaller in the multiple regression model?
Calculate the 95% confidence interval for the regression coefficients.
# Multiple linear regression analysis of fev1 depending on both age and height (+CI), compare
fit3 <- lm(fev1~age+height, data=perulung)
summary(fit3)
Call:
lm(formula = fev1 ~ age + height, data = perulung)
Residuals:
Min 1Q Median 3Q Max
-1.05234 -0.12940 0.00602 0.13698 0.80809
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.308718 0.181176 -12.743 < 2e-16 ***
age 0.089717 0.015719 5.708 1.76e-08 ***
height 0.024968 0.001813 13.775 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2291 on 633 degrees of freedom
Multiple R-squared: 0.4356, Adjusted R-squared: 0.4338
F-statistic: 244.3 on 2 and 633 DF, p-value: < 2.2e-16
signif(confint(fit3),3) 2.5 % 97.5 %
(Intercept) -2.6600 -1.9500
age 0.0588 0.1210
height 0.0214 0.0285
# The coefficient of age is likely smaller because of the correlation between age and height.
ggplot(perulung, aes(age,height)) + geom_point()- What is the interpretation of the intercept in the model c)? Scale the variables, so that both the intercept and the regression coefficients are easier to interpret.
For example, scale the variables such that the intercept corresponds to a child with age 7 years and height 120 cm and that the coefficient for height corresponds to a 10 cm increase in height.
fit4 <- lm(fev1~I(age-7)+I((height-120)/10), data=perulung)
summary(fit4)
Call:
lm(formula = fev1 ~ I(age - 7) + I((height - 120)/10), data = perulung)
Residuals:
Min 1Q Median 3Q Max
-1.05234 -0.12940 0.00602 0.13698 0.80809
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.31549 0.02892 45.487 < 2e-16 ***
I(age - 7) 0.08972 0.01572 5.708 1.76e-08 ***
I((height - 120)/10) 0.24968 0.01813 13.775 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2291 on 633 degrees of freedom
Multiple R-squared: 0.4356, Adjusted R-squared: 0.4338
F-statistic: 244.3 on 2 and 633 DF, p-value: < 2.2e-16
signif(confint(fit4),3) 2.5 % 97.5 %
(Intercept) 1.2600 1.370
I(age - 7) 0.0588 0.121
I((height - 120)/10) 0.2140 0.285
- Add sex as an additional covariate to the regression model. How can the coefficient for sex be interpreted? How much of the total variability in fev1 does the model explain?
# Add sex as an additional covariate to the regression model
fit5 <- lm(fev1~I(age-7)+I((height-120)/10)+sex, data=perulung)
summary(fit5)
Call:
lm(formula = fev1 ~ I(age - 7) + I((height - 120)/10) + sex,
data = perulung)
Residuals:
Min 1Q Median 3Q Max
-0.99143 -0.12124 0.01873 0.13472 0.74228
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.25001 0.02948 42.405 < 2e-16 ***
I(age - 7) 0.09460 0.01519 6.229 8.56e-10 ***
I((height - 120)/10) 0.24567 0.01750 14.036 < 2e-16 ***
sex 0.12133 0.01758 6.903 1.25e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2211 on 632 degrees of freedom
Multiple R-squared: 0.4752, Adjusted R-squared: 0.4727
F-statistic: 190.7 on 3 and 632 DF, p-value: < 2.2e-16
signif(confint(fit5),3) 2.5 % 97.5 %
(Intercept) 1.1900 1.310
I(age - 7) 0.0648 0.124
I((height - 120)/10) 0.2110 0.280
sex 0.0868 0.156
## To see clearer what the estimate for the sex variable refers to, create a categorical sex covariate
perulung$sex <- factor(perulung$sex, levels=c(0,1), labels=c("female","male"))
fit6 <- lm(fev1~I(age-7)+I((height-120)/10)+sex, data=perulung)
summary(fit6)
Call:
lm(formula = fev1 ~ I(age - 7) + I((height - 120)/10) + sex,
data = perulung)
Residuals:
Min 1Q Median 3Q Max
-0.99143 -0.12124 0.01873 0.13472 0.74228
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.25001 0.02948 42.405 < 2e-16 ***
I(age - 7) 0.09460 0.01519 6.229 8.56e-10 ***
I((height - 120)/10) 0.24567 0.01750 14.036 < 2e-16 ***
sexmale 0.12133 0.01758 6.903 1.25e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2211 on 632 degrees of freedom
Multiple R-squared: 0.4752, Adjusted R-squared: 0.4727
F-statistic: 190.7 on 3 and 632 DF, p-value: < 2.2e-16
signif(confint(fit6),3) 2.5 % 97.5 %
(Intercept) 1.1900 1.310
I(age - 7) 0.0648 0.124
I((height - 120)/10) 0.2110 0.280
sexmale 0.0868 0.156
summary(fit6)$r.squared[1] 0.4751594
- Perform appropriate diagnostic plots for the model in e) using the ggResidpanel package and the code resid_interact(fit6, plots=“R”). Is there any evidence that the assumptions of the regression model are violated? There is one individual that has fairly extreme residuals in all four plots. Can you find it? What happens if you refit the model without that individual?
# Diagnostic plots for the model
resid_panel(fit6,plots="all")## interactive plot
# resid_interact(fit6, plots=c("qq","cookd","lev")) fit6b <- lm(fev1~I(age-7)+I((height-120)/10)+sex, data=perulung[-275,])
summary(fit6b)
Call:
lm(formula = fev1 ~ I(age - 7) + I((height - 120)/10) + sex,
data = perulung[-275, ])
Residuals:
Min 1Q Median 3Q Max
-0.95213 -0.12411 0.01742 0.13462 0.73764
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.25273 0.02902 43.166 < 2e-16 ***
I(age - 7) 0.09256 0.01495 6.189 1.09e-09 ***
I((height - 120)/10) 0.25666 0.01739 14.758 < 2e-16 ***
sexmale 0.11810 0.01732 6.820 2.13e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2176 on 631 degrees of freedom
Multiple R-squared: 0.4911, Adjusted R-squared: 0.4887
F-statistic: 203 on 3 and 631 DF, p-value: < 2.2e-16
resid_panel(fit6b,plots="all")Exercise ii (Multivariable linear regression – Dengue viremia and interactions)
The dataset dengueViremia contains selected data from 121 children with dengue serotype 1 or 2 presenting to a community clinic in Ho Chi Minh City within 3 days of illness onset. In this exercise, we will investigate how the dengue serotype (DENV-1 or DENV-2) and the serology (primary or secondary infection) affect the child’s dengue viremia level on day 3. We will use log10-transformed viremia for all analyses.
- Import the dengue viremia dataset and create a boxplot of log10-viremia by type. Is there evidence of an interaction between the effect of the serotype and the effect of serology on viremia?
# Import the dataset
dengue <- read.csv("https://raw.githubusercontent.com/oucru-biostats/IntroductionToBiostatistics2024/main/data/dengueViremiaData.csv", stringsAsFactors=TRUE)
# Create a boxplot of log10-viremia by type
ggplot(dengue,aes(type,log10(vl.3))) + geom_boxplot() + geom_jitter(size = 1, alpha = 0.5, width = 0.25, colour = 'red') + ylab("log10-viremia")- Compare viremia-levels between primary and secondary infections with an appropriate test, combining both subtypes. In view of a), does this comparison make sense?
t.test(log10(vl.3)~serology, data=dengue)
Welch Two Sample t-test
data: log10(vl.3) by serology
t = 1.4082, df = 117.29, p-value = 0.1617
alternative hypothesis: true difference in means between group primary and group secondary is not equal to 0
95 percent confidence interval:
-0.1364023 0.8078071
sample estimates:
mean in group primary mean in group secondary
7.889474 7.553772
wilcox.test(log10(vl.3)~serology, data=dengue)
Wilcoxon rank sum test with continuity correction
data: log10(vl.3) by serology
W = 2143.5, p-value = 0.1011
alternative hypothesis: true location shift is not equal to 0
- Compare viremia-levels of primary and secondary infections in the subgroups of patients with DENV-1 and DENV-2 separately with appropriate tests.
t.test(log10(vl.3)~serology, data=dengue, subset=serotype=="DENV-1")
Welch Two Sample t-test
data: log10(vl.3) by serology
t = 2.5064, df = 74.219, p-value = 0.01439
alternative hypothesis: true difference in means between group primary and group secondary is not equal to 0
95 percent confidence interval:
0.1304221 1.1416409
sample estimates:
mean in group primary mean in group secondary
8.198892 7.562860
t.test(log10(vl.3)~serology, data=dengue, subset=serotype=="DENV-2")
Welch Two Sample t-test
data: log10(vl.3) by serology
t = -2.3095, df = 12.675, p-value = 0.03846
alternative hypothesis: true difference in means between group primary and group secondary is not equal to 0
95 percent confidence interval:
-2.5833992 -0.0827988
sample estimates:
mean in group primary mean in group secondary
6.204866 7.537965
- We want to assess whether dengue serotype and serology affect log10-viremia after controlling for age and gender. Model the log-10 viremia with a multiple linear regression model with the covariates age, gender, serotype, serology. What do you conclude?
# Effect of dengue serotype and serology on log10-viremia after controlling for age and gender
fit7 <- lm(log10(vl.3)~serotype+serology+age+sex, data=dengue)
summary(fit7)
Call:
lm(formula = log10(vl.3) ~ serotype + serology + age + sex, data = dengue)
Residuals:
Min 1Q Median 3Q Max
-3.3345 -0.7105 0.3020 0.8683 2.0474
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.44452 0.45766 16.266 < 2e-16 ***
serotypeDENV-2 -0.75725 0.27568 -2.747 0.00698 **
serologysecondary -0.22570 0.24470 -0.922 0.35825
age 0.06550 0.04056 1.615 0.10903
sexmale -0.24726 0.23904 -1.034 0.30312
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.271 on 116 degrees of freedom
Multiple R-squared: 0.0951, Adjusted R-squared: 0.0639
F-statistic: 3.048 on 4 and 116 DF, p-value: 0.01981
- This model fit may not be adequate because we already known that there may be an interaction between serotype and serology. Therefore, add an interaction between serotype and serology. Create an article-ready table using the tbl_regression function from the gtsummary package. Interpret the regression output. Use the predict function, or the ggpredict function from the ggeffects package, to obtain the expected value and 95% confidence intervals for each of the four serotype-serology combinations. Choose age=11 and sex =“female” (which are the default values chosen by ggpredict.)
## We use the shorthand "*" notation for interaction
fit8 <- lm(log10(vl.3)~serotype*serology+age+sex, data=dengue)
summary(fit8)
Call:
lm(formula = log10(vl.3) ~ serotype * serology + age + sex, data = dengue)
Residuals:
Min 1Q Median 3Q Max
-2.9996 -0.6668 0.2075 0.8283 2.0643
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.88101 0.45901 17.170 < 2e-16 ***
serotypeDENV-2 -1.95203 0.44931 -4.344 3.03e-05 ***
serologysecondary -0.62091 0.26386 -2.353 0.02031 *
age 0.03863 0.03979 0.971 0.33364
sexmale -0.18956 0.23019 -0.824 0.41193
serotypeDENV-2:serologysecondary 1.83717 0.55828 3.291 0.00133 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.22 on 115 degrees of freedom
Multiple R-squared: 0.173, Adjusted R-squared: 0.137
F-statistic: 4.811 on 5 and 115 DF, p-value: 0.0004967
library(gtsummary)
tbl_regression(fit8,show_single_row = c(sex, serology, serotype),label = list(serotype ~ "DENV-2", serology ~ "secondary", sex ~ "male"))| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| DENV-2 | -2.0 | -2.8, -1.1 | <0.001 |
| secondary | -0.62 | -1.1, -0.10 | 0.020 |
| age | 0.04 | -0.04, 0.12 | 0.3 |
| male | -0.19 | -0.65, 0.27 | 0.4 |
| DENV-2 * secondary | |||
| DENV-2 * secondary | 1.8 | 0.73, 2.9 | 0.001 |
| Abbreviation: CI = Confidence Interval | |||
## we can obtain expected values of viremia for each of the serology-serotype combinations
new.data <- expand.grid(serotype=levels(dengue$serotype),serology=levels(dengue$serology),age=11,sex="female")
predict(fit8, newdata=new.data, interval="confidence") fit lwr upr
1 8.305927 7.891867 8.719987
2 6.353894 5.511841 7.195948
3 7.685018 7.195115 8.174921
4 7.570153 7.032138 8.108167
# via ggeffects it is easier
library(ggeffects)
plot(ggpredict(fit8, terms=~serotype+serology))- Perform diagnostic plots for the model from e).
# Diagnostic plots for the model
resid_panel(fit8,plots="all")