# Load required packages
library(tidyverse) # Includes dplyr, tidyr for joins and pivots
library(lubridate) # Date manipulation
# Check setup
packageVersion("tidyverse")[1] '2.0.0'
Complete solutions for Module 3 exercises based on R4DS Chapters 12 and 13
# Load required packages
library(tidyverse) # Includes dplyr, tidyr for joins and pivots
library(lubridate) # Date manipulation
# Check setup
packageVersion("tidyverse")[1] '2.0.0'
# Create wide format lab dataset (NOT TIDY)
lab_wide <- tibble(
USUBJID = c("001-001", "001-002", "001-003", "001-004"),
ALT_BL = c(25, 30, 28, 35),
AST_BL = c(22, 28, 26, 32),
BILI_BL = c(0.8, 1.2, 0.9, 1.1),
ALT_W2 = c(27, 32, 30, 38),
AST_W2 = c(24, 30, 28, 35),
BILI_W2 = c(0.9, 1.1, 1.0, 1.2),
ALT_W4 = c(26, 29, 31, 36),
AST_W4 = c(23, 27, 29, 33),
BILI_W4 = c(0.8, 1.0, 1.1, 1.0)
)
# 1a. Convert to long format (TIDY) using pivot_longer()
lab_long <- lab_wide %>%
pivot_longer(
cols = -USUBJID,
names_to = c("PARAM", "VISIT"),
names_sep = "_",
values_to = "AVAL"
) %>%
mutate(
PARAM = case_when(
PARAM == "ALT" ~ "Alanine Aminotransferase",
PARAM == "AST" ~ "Aspartate Aminotransferase",
PARAM == "BILI" ~ "Bilirubin"
),
VISIT = case_when(
VISIT == "BL" ~ "Baseline",
VISIT == "W2" ~ "Week 2",
VISIT == "W4" ~ "Week 4"
)
)
lab_long# A tibble: 36 × 4
USUBJID PARAM VISIT AVAL
<chr> <chr> <chr> <dbl>
1 001-001 Alanine Aminotransferase Baseline 25
2 001-001 Aspartate Aminotransferase Baseline 22
3 001-001 Bilirubin Baseline 0.8
4 001-001 Alanine Aminotransferase Week 2 27
5 001-001 Aspartate Aminotransferase Week 2 24
6 001-001 Bilirubin Week 2 0.9
7 001-001 Alanine Aminotransferase Week 4 26
8 001-001 Aspartate Aminotransferase Week 4 23
9 001-001 Bilirubin Week 4 0.8
10 001-002 Alanine Aminotransferase Baseline 30
# ℹ 26 more rows
# 1b. Create summary table
lab_summary <- lab_long %>%
group_by(PARAM, VISIT) %>%
summarise(
n = n(),
mean_val = round(mean(AVAL), 2),
.groups = "drop"
)
lab_summary# A tibble: 9 × 4
PARAM VISIT n mean_val
<chr> <chr> <int> <dbl>
1 Alanine Aminotransferase Baseline 4 29.5
2 Alanine Aminotransferase Week 2 4 31.8
3 Alanine Aminotransferase Week 4 4 30.5
4 Aspartate Aminotransferase Baseline 4 27
5 Aspartate Aminotransferase Week 2 4 29.2
6 Aspartate Aminotransferase Week 4 4 28
7 Bilirubin Baseline 4 1
8 Bilirubin Week 2 4 1.05
9 Bilirubin Week 4 4 0.98
Key Points: - pivot_longer() converts wide data to long (tidy) format - Use names_sep to split column names into multiple variables - Long format is better for analysis and visualization - Always clean up variable values after pivoting
adae <- derive_vars_dt( dataset = adae, dtc = AESTDTC, new_vars_prefix = “A”
# Create sample clinical datasets
dm <- tibble(
USUBJID = c("001-001", "001-002", "001-003", "001-004", "001-005", "001-006"),
AGE = c(45, 67, 52, 71, 34, 58),
SEX = c("M", "F", "M", "F", "M", "F"),
ARMCD = c("TRT", "PBO", "TRT", "TRT", "PBO", "TRT"),
ARM = c("Treatment", "Placebo", "Treatment", "Treatment", "Placebo", "Treatment"),
RFSTDTC = c("2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19", "2024-01-20")
) %>%
mutate(ELDERLY = ifelse(AGE >= 65, "Y", "N"))
ae <- tibble(
USUBJID = c("001-001", "001-001", "001-002", "001-004", "001-007", "001-007"),
AESEQ = c(1, 2, 1, 1, 1, 2),
AEDECOD = c("HEADACHE", "NAUSEA", "FATIGUE", "DIZZINESS", "HEADACHE", "VOMITING"),
AESEV = c("MILD", "MODERATE", "MILD", "SEVERE", "MILD", "MODERATE"),
AESTDTC = c("2024-01-20", "2024-01-22", "2024-01-21", "2024-01-25", "2024-02-01", "2024-02-03")
)
vs <- tibble(
USUBJID = rep(c("001-001", "001-002", "001-003", "001-004"), each = 2),
VSTESTCD = rep(c("SYSBP", "DIABP"), 4),
VSSTRESN = c(120, 80, 135, 85, 115, 75, 140, 90),
VISIT = rep("Baseline", 8)
)
# Display datasets
dm# A tibble: 6 × 7
USUBJID AGE SEX ARMCD ARM RFSTDTC ELDERLY
<chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 001-001 45 M TRT Treatment 2024-01-15 N
2 001-002 67 F PBO Placebo 2024-01-16 Y
3 001-003 52 M TRT Treatment 2024-01-17 N
4 001-004 71 F TRT Treatment 2024-01-18 Y
5 001-005 34 M PBO Placebo 2024-01-19 N
6 001-006 58 F TRT Treatment 2024-01-20 N
ae# A tibble: 6 × 5
USUBJID AESEQ AEDECOD AESEV AESTDTC
<chr> <dbl> <chr> <chr> <chr>
1 001-001 1 HEADACHE MILD 2024-01-20
2 001-001 2 NAUSEA MODERATE 2024-01-22
3 001-002 1 FATIGUE MILD 2024-01-21
4 001-004 1 DIZZINESS SEVERE 2024-01-25
5 001-007 1 HEADACHE MILD 2024-02-01
6 001-007 2 VOMITING MODERATE 2024-02-03
vs# A tibble: 8 × 4
USUBJID VSTESTCD VSSTRESN VISIT
<chr> <chr> <dbl> <chr>
1 001-001 SYSBP 120 Baseline
2 001-001 DIABP 80 Baseline
3 001-002 SYSBP 135 Baseline
4 001-002 DIABP 85 Baseline
5 001-003 SYSBP 115 Baseline
6 001-003 DIABP 75 Baseline
7 001-004 SYSBP 140 Baseline
8 001-004 DIABP 90 Baseline
Key Points: - Clinical data is naturally relational (connected by USUBJID) - Each domain serves a specific purpose (DM=demographics, AE=adverse events, VS=vital signs) - Note that some subjects appear in some domains but not others
# 3a. LEFT JOIN: Add demographics to adverse events
ae_with_demo <- ae %>%
left_join(dm, by = "USUBJID")
ae_with_demo# A tibble: 6 × 11
USUBJID AESEQ AEDECOD AESEV AESTDTC AGE SEX ARMCD ARM RFSTDTC ELDERLY
<chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 001-001 1 HEADACHE MILD 2024-0… 45 M TRT Trea… 2024-0… N
2 001-001 2 NAUSEA MODER… 2024-0… 45 M TRT Trea… 2024-0… N
3 001-002 1 FATIGUE MILD 2024-0… 67 F PBO Plac… 2024-0… Y
4 001-004 1 DIZZINESS SEVERE 2024-0… 71 F TRT Trea… 2024-0… Y
5 001-007 1 HEADACHE MILD 2024-0… NA <NA> <NA> <NA> <NA> <NA>
6 001-007 2 VOMITING MODER… 2024-0… NA <NA> <NA> <NA> <NA> <NA>
# 3b. INNER JOIN: Keep only AEs for subjects with complete demographics
ae_complete <- ae %>%
inner_join(dm, by = "USUBJID")
ae_complete# A tibble: 4 × 11
USUBJID AESEQ AEDECOD AESEV AESTDTC AGE SEX ARMCD ARM RFSTDTC ELDERLY
<chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 001-001 1 HEADACHE MILD 2024-0… 45 M TRT Trea… 2024-0… N
2 001-001 2 NAUSEA MODER… 2024-0… 45 M TRT Trea… 2024-0… N
3 001-002 1 FATIGUE MILD 2024-0… 67 F PBO Plac… 2024-0… Y
4 001-004 1 DIZZINESS SEVERE 2024-0… 71 F TRT Trea… 2024-0… Y
# 3c. Count subjects in each dataset
cat("Subject counts:\n")Subject counts:
cat("DM subjects:", n_distinct(dm$USUBJID), "\n")DM subjects: 6
cat("AE subjects:", n_distinct(ae$USUBJID), "\n")AE subjects: 4
cat("AE with demo (left join):", n_distinct(ae_with_demo$USUBJID), "\n")AE with demo (left join): 4
cat("AE complete (inner join):", n_distinct(ae_complete$USUBJID), "\n")AE complete (inner join): 3
Key Points: - left_join() preserves all rows from the left dataset (AE) - inner_join() keeps only rows that exist in both datasets - Subject 001-007 exists in AE but not DM, so gets NA values in left_join - Subject 001-007 is excluded from inner_join results
# 4a. SEMI JOIN: Find subjects who had any adverse events
subjects_with_ae <- dm %>%
semi_join(ae, by = "USUBJID")
subjects_with_ae# A tibble: 3 × 7
USUBJID AGE SEX ARMCD ARM RFSTDTC ELDERLY
<chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 001-001 45 M TRT Treatment 2024-01-15 N
2 001-002 67 F PBO Placebo 2024-01-16 Y
3 001-004 71 F TRT Treatment 2024-01-18 Y
# 4b. ANTI JOIN: Find subjects who had NO adverse events
subjects_without_ae <- dm %>%
anti_join(ae, by = "USUBJID")
subjects_without_ae# A tibble: 3 × 7
USUBJID AGE SEX ARMCD ARM RFSTDTC ELDERLY
<chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 001-003 52 M TRT Treatment 2024-01-17 N
2 001-005 34 M PBO Placebo 2024-01-19 N
3 001-006 58 F TRT Treatment 2024-01-20 N
Key Points: - semi_join() returns rows from dm that have matches in ae (but doesn’t add ae columns) - anti_join() returns rows from dm that have NO matches in ae - These are useful for creating population flags and data validation
# 5a. AE summary by treatment group
ae_by_treatment <- ae_with_demo %>%
filter(!is.na(ARMCD)) %>% # Exclude subjects not in DM
group_by(ARMCD) %>%
summarise(
n_subjects = n_distinct(USUBJID),
n_events = n(),
mean_age = round(mean(AGE, na.rm = TRUE), 1),
.groups = "drop"
)
ae_by_treatment# A tibble: 2 × 4
ARMCD n_subjects n_events mean_age
<chr> <int> <int> <dbl>
1 PBO 1 1 67
2 TRT 2 3 53.7
# 5b. AE summary by elderly status and severity
ae_by_elderly_severity <- ae_with_demo %>%
filter(!is.na(ELDERLY)) %>%
group_by(ELDERLY, AESEV) %>%
summarise(
n_events = n(),
n_subjects = n_distinct(USUBJID),
.groups = "drop"
) %>%
arrange(ELDERLY, AESEV)
ae_by_elderly_severity# A tibble: 4 × 4
ELDERLY AESEV n_events n_subjects
<chr> <chr> <int> <int>
1 N MILD 1 1
2 N MODERATE 1 1
3 Y MILD 1 1
4 Y SEVERE 1 1
Key Points: - Always filter out missing values before grouping - n_distinct() counts unique values (subjects) - n() counts total rows (events) - Use .groups = "drop" to avoid warnings
# Comprehensive analysis pipeline
clinical_summary <- ae %>%
# Step 1: Join with demographics
left_join(dm, by = "USUBJID") %>%
# Step 2: Filter to complete cases
filter(!is.na(ARMCD)) %>%
# Step 3: Calculate study day and severity flag
mutate(
AESTDT = ymd(AESTDTC),
RFSTDT = ymd(RFSTDTC),
AEDY = as.numeric(AESTDT - RFSTDT) + 1,
SEVERE_AE = ifelse(AESEV == "SEVERE", "Y", "N")
) %>%
# Step 4: Group and summarise
group_by(ARMCD, ELDERLY, SEVERE_AE) %>%
summarise(
n_subjects = n_distinct(USUBJID),
n_events = n(),
mean_onset_day = round(mean(AEDY, na.rm = TRUE), 1),
.groups = "drop"
) %>%
# Step 5: Arrange results
arrange(ARMCD, ELDERLY, SEVERE_AE)
clinical_summary# A tibble: 3 × 6
ARMCD ELDERLY SEVERE_AE n_subjects n_events mean_onset_day
<chr> <chr> <chr> <int> <int> <dbl>
1 PBO Y N 1 1 6
2 TRT N N 1 2 7
3 TRT Y Y 1 1 8
Key Points: - Pipelines combine multiple operations in logical sequence - Use ymd() to convert character dates to Date objects - Calculate study days as numeric difference + 1 - Complex groupings reveal detailed patterns in clinical data
# 7a. Create summary of vital signs by treatment (long format)
vs_summary <- vs %>%
left_join(dm %>% select(USUBJID, ARMCD), by = "USUBJID") %>%
group_by(ARMCD, VSTESTCD) %>%
summarise(mean_val = round(mean(VSSTRESN), 1), .groups = "drop")
vs_summary# A tibble: 4 × 3
ARMCD VSTESTCD mean_val
<chr> <chr> <dbl>
1 PBO DIABP 85
2 PBO SYSBP 135
3 TRT DIABP 81.7
4 TRT SYSBP 125
# 7b. Pivot to wide format for reporting
vs_wide <- vs_summary %>%
pivot_wider(
names_from = VSTESTCD,
values_from = mean_val
)
vs_wide# A tibble: 2 × 3
ARMCD DIABP SYSBP
<chr> <dbl> <dbl>
1 PBO 85 135
2 TRT 81.7 125
Key Points: - pivot_wider() converts long data to wide format - Wide format is often better for clinical reports and tables - Use names_from to specify column headers and values_from for cell values
# 8a. Identify subjects in AE but not in DM
missing_subjects <- ae %>%
anti_join(dm, by = "USUBJID") %>%
distinct(USUBJID)
missing_subjects# A tibble: 1 × 1
USUBJID
<chr>
1 001-007
# 8b. Create data quality report
data_quality <- tibble(
Dataset = c("DM", "AE", "VS"),
N_Subjects = c(
n_distinct(dm$USUBJID),
n_distinct(ae$USUBJID),
n_distinct(vs$USUBJID)
),
N_Records = c(
nrow(dm),
nrow(ae),
nrow(vs)
)
)
data_quality# A tibble: 3 × 3
Dataset N_Subjects N_Records
<chr> <int> <int>
1 DM 6 6
2 AE 4 6
3 VS 4 8
Key Points: - Use anti_join() to find missing relationships between datasets - Data quality checks are essential in clinical programming - Always validate data completeness before analysis
By completing these exercises, you have successfully mastered:
✅ R4DS Chapter 12: Applied tidy data principles with pivot_longer() and pivot_wider()
✅ R4DS Chapter 13: Used all types of joins for relational data analysis
You are now ready for advanced clinical data relationships and complex analyses! 🚀