Module 3 Solution — Joins and Summaries

📝 Module 3 Solution Guide

Complete solutions for Module 3 exercises based on R4DS Chapters 12 and 13

Setup

# Load required packages
library(tidyverse)  # Includes dplyr, tidyr for joins and pivots
library(lubridate)  # Date manipulation

# Check setup
packageVersion("tidyverse")

[1] '2.0.0'

Exercise 1: Tidy Data Practice (R4DS Ch. 12)

# Create wide format lab dataset (NOT TIDY)
lab_wide <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004"),
  ALT_BL = c(25, 30, 28, 35),
  AST_BL = c(22, 28, 26, 32),
  BILI_BL = c(0.8, 1.2, 0.9, 1.1),
  ALT_W2 = c(27, 32, 30, 38),
  AST_W2 = c(24, 30, 28, 35),
  BILI_W2 = c(0.9, 1.1, 1.0, 1.2),
  ALT_W4 = c(26, 29, 31, 36),
  AST_W4 = c(23, 27, 29, 33),
  BILI_W4 = c(0.8, 1.0, 1.1, 1.0)
)

# 1a. Convert to long format (TIDY) using pivot_longer()
lab_long <- lab_wide %>%
  pivot_longer(
    cols = -USUBJID,
    names_to = c("PARAM", "VISIT"),
    names_sep = "_",
    values_to = "AVAL"
  ) %>%
  mutate(
    PARAM = case_when(
      PARAM == "ALT" ~ "Alanine Aminotransferase",
      PARAM == "AST" ~ "Aspartate Aminotransferase", 
      PARAM == "BILI" ~ "Bilirubin"
    ),
    VISIT = case_when(
      VISIT == "BL" ~ "Baseline",
      VISIT == "W2" ~ "Week 2",
      VISIT == "W4" ~ "Week 4"
    )
  )

lab_long

# A tibble: 36 × 4
   USUBJID PARAM                      VISIT     AVAL
   <chr>   <chr>                      <chr>    <dbl>
 1 001-001 Alanine Aminotransferase   Baseline  25
 2 001-001 Aspartate Aminotransferase Baseline  22
 3 001-001 Bilirubin                  Baseline   0.8
 4 001-001 Alanine Aminotransferase   Week 2    27
 5 001-001 Aspartate Aminotransferase Week 2    24
 6 001-001 Bilirubin                  Week 2     0.9
 7 001-001 Alanine Aminotransferase   Week 4    26
 8 001-001 Aspartate Aminotransferase Week 4    23
 9 001-001 Bilirubin                  Week 4     0.8
10 001-002 Alanine Aminotransferase   Baseline  30
# ℹ 26 more rows

# 1b. Create summary table
lab_summary <- lab_long %>%
  group_by(PARAM, VISIT) %>%
  summarise(
    n = n(),
    mean_val = round(mean(AVAL), 2),
    .groups = "drop"
  )

lab_summary

# A tibble: 9 × 4
  PARAM                      VISIT        n mean_val
  <chr>                      <chr>    <int>    <dbl>
1 Alanine Aminotransferase   Baseline     4    29.5
2 Alanine Aminotransferase   Week 2       4    31.8
3 Alanine Aminotransferase   Week 4       4    30.5
4 Aspartate Aminotransferase Baseline     4    27
5 Aspartate Aminotransferase Week 2       4    29.2
6 Aspartate Aminotransferase Week 4       4    28
7 Bilirubin                  Baseline     4     1
8 Bilirubin                  Week 2       4     1.05
9 Bilirubin                  Week 4       4     0.98

Key Points: - pivot_longer() converts wide data to long (tidy) format - Use names_sep to split column names into multiple variables - Long format is better for analysis and visualization - Always clean up variable values after pivoting

Exercise 2: Set Up Clinical Domains (R4DS Ch. 13)

adae <- derive_vars_dt( dataset = adae, dtc = AESTDTC, new_vars_prefix = “A”

# Create sample clinical datasets
dm <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004", "001-005", "001-006"),
  AGE = c(45, 67, 52, 71, 34, 58),
  SEX = c("M", "F", "M", "F", "M", "F"),
  ARMCD = c("TRT", "PBO", "TRT", "TRT", "PBO", "TRT"),
  ARM = c("Treatment", "Placebo", "Treatment", "Treatment", "Placebo", "Treatment"),
  RFSTDTC = c("2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19", "2024-01-20")
) %>%
  mutate(ELDERLY = ifelse(AGE >= 65, "Y", "N"))

ae <- tibble(
  USUBJID = c("001-001", "001-001", "001-002", "001-004", "001-007", "001-007"),
  AESEQ = c(1, 2, 1, 1, 1, 2),
  AEDECOD = c("HEADACHE", "NAUSEA", "FATIGUE", "DIZZINESS", "HEADACHE", "VOMITING"),
  AESEV = c("MILD", "MODERATE", "MILD", "SEVERE", "MILD", "MODERATE"),
  AESTDTC = c("2024-01-20", "2024-01-22", "2024-01-21", "2024-01-25", "2024-02-01", "2024-02-03")
)

vs <- tibble(
  USUBJID = rep(c("001-001", "001-002", "001-003", "001-004"), each = 2),
  VSTESTCD = rep(c("SYSBP", "DIABP"), 4),
  VSSTRESN = c(120, 80, 135, 85, 115, 75, 140, 90),
  VISIT = rep("Baseline", 8)
)

# Display datasets
dm

# A tibble: 6 × 7
  USUBJID   AGE SEX   ARMCD ARM       RFSTDTC    ELDERLY
  <chr>   <dbl> <chr> <chr> <chr>     <chr>      <chr>
1 001-001    45 M     TRT   Treatment 2024-01-15 N
2 001-002    67 F     PBO   Placebo   2024-01-16 Y
3 001-003    52 M     TRT   Treatment 2024-01-17 N
4 001-004    71 F     TRT   Treatment 2024-01-18 Y
5 001-005    34 M     PBO   Placebo   2024-01-19 N
6 001-006    58 F     TRT   Treatment 2024-01-20 N

ae

# A tibble: 6 × 5
  USUBJID AESEQ AEDECOD   AESEV    AESTDTC
  <chr>   <dbl> <chr>     <chr>    <chr>
1 001-001     1 HEADACHE  MILD     2024-01-20
2 001-001     2 NAUSEA    MODERATE 2024-01-22
3 001-002     1 FATIGUE   MILD     2024-01-21
4 001-004     1 DIZZINESS SEVERE   2024-01-25
5 001-007     1 HEADACHE  MILD     2024-02-01
6 001-007     2 VOMITING  MODERATE 2024-02-03

vs

# A tibble: 8 × 4
  USUBJID VSTESTCD VSSTRESN VISIT
  <chr>   <chr>       <dbl> <chr>
1 001-001 SYSBP         120 Baseline
2 001-001 DIABP          80 Baseline
3 001-002 SYSBP         135 Baseline
4 001-002 DIABP          85 Baseline
5 001-003 SYSBP         115 Baseline
6 001-003 DIABP          75 Baseline
7 001-004 SYSBP         140 Baseline
8 001-004 DIABP          90 Baseline

Key Points: - Clinical data is naturally relational (connected by USUBJID) - Each domain serves a specific purpose (DM=demographics, AE=adverse events, VS=vital signs) - Note that some subjects appear in some domains but not others

Exercise 3: Basic Join Operations (R4DS Ch. 13)

# 3a. LEFT JOIN: Add demographics to adverse events
ae_with_demo <- ae %>%
  left_join(dm, by = "USUBJID")

ae_with_demo

# A tibble: 6 × 11
  USUBJID AESEQ AEDECOD   AESEV  AESTDTC   AGE SEX   ARMCD ARM   RFSTDTC ELDERLY
  <chr>   <dbl> <chr>     <chr>  <chr>   <dbl> <chr> <chr> <chr> <chr>   <chr>
1 001-001     1 HEADACHE  MILD   2024-0…    45 M     TRT   Trea… 2024-0… N
2 001-001     2 NAUSEA    MODER… 2024-0…    45 M     TRT   Trea… 2024-0… N
3 001-002     1 FATIGUE   MILD   2024-0…    67 F     PBO   Plac… 2024-0… Y
4 001-004     1 DIZZINESS SEVERE 2024-0…    71 F     TRT   Trea… 2024-0… Y
5 001-007     1 HEADACHE  MILD   2024-0…    NA <NA>  <NA>  <NA>  <NA>    <NA>
6 001-007     2 VOMITING  MODER… 2024-0…    NA <NA>  <NA>  <NA>  <NA>    <NA>

# 3b. INNER JOIN: Keep only AEs for subjects with complete demographics
ae_complete <- ae %>%
  inner_join(dm, by = "USUBJID")

ae_complete

# A tibble: 4 × 11
  USUBJID AESEQ AEDECOD   AESEV  AESTDTC   AGE SEX   ARMCD ARM   RFSTDTC ELDERLY
  <chr>   <dbl> <chr>     <chr>  <chr>   <dbl> <chr> <chr> <chr> <chr>   <chr>
1 001-001     1 HEADACHE  MILD   2024-0…    45 M     TRT   Trea… 2024-0… N
2 001-001     2 NAUSEA    MODER… 2024-0…    45 M     TRT   Trea… 2024-0… N
3 001-002     1 FATIGUE   MILD   2024-0…    67 F     PBO   Plac… 2024-0… Y
4 001-004     1 DIZZINESS SEVERE 2024-0…    71 F     TRT   Trea… 2024-0… Y

# 3c. Count subjects in each dataset
cat("Subject counts:\n")

Subject counts:

cat("DM subjects:", n_distinct(dm$USUBJID), "\n")

DM subjects: 6

cat("AE subjects:", n_distinct(ae$USUBJID), "\n")

AE subjects: 4

cat("AE with demo (left join):", n_distinct(ae_with_demo$USUBJID), "\n")

AE with demo (left join): 4

cat("AE complete (inner join):", n_distinct(ae_complete$USUBJID), "\n")

AE complete (inner join): 3

Key Points: - left_join() preserves all rows from the left dataset (AE) - inner_join() keeps only rows that exist in both datasets - Subject 001-007 exists in AE but not DM, so gets NA values in left_join - Subject 001-007 is excluded from inner_join results

Exercise 4: Filtering Joins (R4DS Ch. 13)

# 4a. SEMI JOIN: Find subjects who had any adverse events
subjects_with_ae <- dm %>%
  semi_join(ae, by = "USUBJID")

subjects_with_ae

# A tibble: 3 × 7
  USUBJID   AGE SEX   ARMCD ARM       RFSTDTC    ELDERLY
  <chr>   <dbl> <chr> <chr> <chr>     <chr>      <chr>
1 001-001    45 M     TRT   Treatment 2024-01-15 N
2 001-002    67 F     PBO   Placebo   2024-01-16 Y
3 001-004    71 F     TRT   Treatment 2024-01-18 Y

# 4b. ANTI JOIN: Find subjects who had NO adverse events  
subjects_without_ae <- dm %>%
  anti_join(ae, by = "USUBJID")

subjects_without_ae

# A tibble: 3 × 7
  USUBJID   AGE SEX   ARMCD ARM       RFSTDTC    ELDERLY
  <chr>   <dbl> <chr> <chr> <chr>     <chr>      <chr>
1 001-003    52 M     TRT   Treatment 2024-01-17 N
2 001-005    34 M     PBO   Placebo   2024-01-19 N
3 001-006    58 F     TRT   Treatment 2024-01-20 N

Key Points: - semi_join() returns rows from dm that have matches in ae (but doesn’t add ae columns) - anti_join() returns rows from dm that have NO matches in ae - These are useful for creating population flags and data validation

Exercise 5: Group By and Summarise Practice

# 5a. AE summary by treatment group
ae_by_treatment <- ae_with_demo %>%
  filter(!is.na(ARMCD)) %>%  # Exclude subjects not in DM
  group_by(ARMCD) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    mean_age = round(mean(AGE, na.rm = TRUE), 1),
    .groups = "drop"
  )

ae_by_treatment

# A tibble: 2 × 4
  ARMCD n_subjects n_events mean_age
  <chr>      <int>    <int>    <dbl>
1 PBO            1        1     67
2 TRT            2        3     53.7

# 5b. AE summary by elderly status and severity
ae_by_elderly_severity <- ae_with_demo %>%
  filter(!is.na(ELDERLY)) %>%
  group_by(ELDERLY, AESEV) %>%
  summarise(
    n_events = n(),
    n_subjects = n_distinct(USUBJID),
    .groups = "drop"
  ) %>%
  arrange(ELDERLY, AESEV)

ae_by_elderly_severity

# A tibble: 4 × 4
  ELDERLY AESEV    n_events n_subjects
  <chr>   <chr>       <int>      <int>
1 N       MILD            1          1
2 N       MODERATE        1          1
3 Y       MILD            1          1
4 Y       SEVERE          1          1

Key Points: - Always filter out missing values before grouping - n_distinct() counts unique values (subjects) - n() counts total rows (events) - Use .groups = "drop" to avoid warnings

Exercise 6: Complex Analysis Pipeline

# Comprehensive analysis pipeline
clinical_summary <- ae %>%
  # Step 1: Join with demographics
  left_join(dm, by = "USUBJID") %>%
  # Step 2: Filter to complete cases
  filter(!is.na(ARMCD)) %>%
  # Step 3: Calculate study day and severity flag
  mutate(
    AESTDT = ymd(AESTDTC),
    RFSTDT = ymd(RFSTDTC),
    AEDY = as.numeric(AESTDT - RFSTDT) + 1,
    SEVERE_AE = ifelse(AESEV == "SEVERE", "Y", "N")
  ) %>%
  # Step 4: Group and summarise
  group_by(ARMCD, ELDERLY, SEVERE_AE) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    mean_onset_day = round(mean(AEDY, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  # Step 5: Arrange results
  arrange(ARMCD, ELDERLY, SEVERE_AE)

clinical_summary

# A tibble: 3 × 6
  ARMCD ELDERLY SEVERE_AE n_subjects n_events mean_onset_day
  <chr> <chr>   <chr>          <int>    <int>          <dbl>
1 PBO   Y       N                  1        1              6
2 TRT   N       N                  1        2              7
3 TRT   Y       Y                  1        1              8

Key Points: - Pipelines combine multiple operations in logical sequence - Use ymd() to convert character dates to Date objects - Calculate study days as numeric difference + 1 - Complex groupings reveal detailed patterns in clinical data

Exercise 7: Pivot Practice with Vital Signs

# 7a. Create summary of vital signs by treatment (long format)
vs_summary <- vs %>%
  left_join(dm %>% select(USUBJID, ARMCD), by = "USUBJID") %>%
  group_by(ARMCD, VSTESTCD) %>%
  summarise(mean_val = round(mean(VSSTRESN), 1), .groups = "drop")

vs_summary

# A tibble: 4 × 3
  ARMCD VSTESTCD mean_val
  <chr> <chr>       <dbl>
1 PBO   DIABP        85
2 PBO   SYSBP       135
3 TRT   DIABP        81.7
4 TRT   SYSBP       125

# 7b. Pivot to wide format for reporting
vs_wide <- vs_summary %>%
  pivot_wider(
    names_from = VSTESTCD,
    values_from = mean_val
  )

vs_wide

# A tibble: 2 × 3
  ARMCD DIABP SYSBP
  <chr> <dbl> <dbl>
1 PBO    85     135
2 TRT    81.7   125

Key Points: - pivot_wider() converts long data to wide format - Wide format is often better for clinical reports and tables - Use names_from to specify column headers and values_from for cell values

Exercise 8: Missing Data Analysis

# 8a. Identify subjects in AE but not in DM
missing_subjects <- ae %>%
  anti_join(dm, by = "USUBJID") %>%
  distinct(USUBJID)

missing_subjects

# A tibble: 1 × 1
  USUBJID
  <chr>
1 001-007

# 8b. Create data quality report
data_quality <- tibble(
  Dataset = c("DM", "AE", "VS"),
  N_Subjects = c(
    n_distinct(dm$USUBJID),
    n_distinct(ae$USUBJID),
    n_distinct(vs$USUBJID)
  ),
  N_Records = c(
    nrow(dm),
    nrow(ae),
    nrow(vs)
  )
)

data_quality

# A tibble: 3 × 3
  Dataset N_Subjects N_Records
  <chr>        <int>     <int>
1 DM               6         6
2 AE               4         6
3 VS               4         8

Key Points: - Use anti_join() to find missing relationships between datasets - Data quality checks are essential in clinical programming - Always validate data completeness before analysis

🎯 Learning Objectives Met

By completing these exercises, you have successfully mastered:

✅ R4DS Chapter 12: Applied tidy data principles with pivot_longer() and pivot_wider()
✅ R4DS Chapter 13: Used all types of joins for relational data analysis

Clinical Programming Skills Gained:

Reshaping clinical data between wide and long formats
Joining multiple clinical domains (DM, AE, VS)
Creating comprehensive clinical analysis pipelines
Data quality assessment and missing data identification
Complex grouping and summarization for clinical reporting

Next Steps:

Apply these concepts to real clinical trial datasets
Practice with more complex multi-domain joins
Move to Module 4: Advanced data manipulation
Explore tidyr functions for complex reshaping scenarios

You are now ready for advanced clinical data relationships and complex analyses! 🚀