Module 3 Solution — Joins and Summaries

📝 Module 3 Solution Guide

Complete solutions for Module 3 exercises based on R4DS Chapters 12 and 13

Setup

# Load required packages
library(tidyverse)  # Includes dplyr, tidyr for joins and pivots
library(lubridate)  # Date manipulation

# Check setup
packageVersion("tidyverse")
[1] '2.0.0'

Exercise 1: Tidy Data Practice (R4DS Ch. 12)

# Create wide format lab dataset (NOT TIDY)
lab_wide <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004"),
  ALT_BL = c(25, 30, 28, 35),
  AST_BL = c(22, 28, 26, 32),
  BILI_BL = c(0.8, 1.2, 0.9, 1.1),
  ALT_W2 = c(27, 32, 30, 38),
  AST_W2 = c(24, 30, 28, 35),
  BILI_W2 = c(0.9, 1.1, 1.0, 1.2),
  ALT_W4 = c(26, 29, 31, 36),
  AST_W4 = c(23, 27, 29, 33),
  BILI_W4 = c(0.8, 1.0, 1.1, 1.0)
)

# 1a. Convert to long format (TIDY) using pivot_longer()
lab_long <- lab_wide %>%
  pivot_longer(
    cols = -USUBJID,
    names_to = c("PARAM", "VISIT"),
    names_sep = "_",
    values_to = "AVAL"
  ) %>%
  mutate(
    PARAM = case_when(
      PARAM == "ALT" ~ "Alanine Aminotransferase",
      PARAM == "AST" ~ "Aspartate Aminotransferase", 
      PARAM == "BILI" ~ "Bilirubin"
    ),
    VISIT = case_when(
      VISIT == "BL" ~ "Baseline",
      VISIT == "W2" ~ "Week 2",
      VISIT == "W4" ~ "Week 4"
    )
  )

lab_long
# A tibble: 36 × 4
   USUBJID PARAM                      VISIT     AVAL
   <chr>   <chr>                      <chr>    <dbl>
 1 001-001 Alanine Aminotransferase   Baseline  25
 2 001-001 Aspartate Aminotransferase Baseline  22
 3 001-001 Bilirubin                  Baseline   0.8
 4 001-001 Alanine Aminotransferase   Week 2    27
 5 001-001 Aspartate Aminotransferase Week 2    24
 6 001-001 Bilirubin                  Week 2     0.9
 7 001-001 Alanine Aminotransferase   Week 4    26
 8 001-001 Aspartate Aminotransferase Week 4    23
 9 001-001 Bilirubin                  Week 4     0.8
10 001-002 Alanine Aminotransferase   Baseline  30
# ℹ 26 more rows
# 1b. Create summary table
lab_summary <- lab_long %>%
  group_by(PARAM, VISIT) %>%
  summarise(
    n = n(),
    mean_val = round(mean(AVAL), 2),
    .groups = "drop"
  )

lab_summary
# A tibble: 9 × 4
  PARAM                      VISIT        n mean_val
  <chr>                      <chr>    <int>    <dbl>
1 Alanine Aminotransferase   Baseline     4    29.5
2 Alanine Aminotransferase   Week 2       4    31.8
3 Alanine Aminotransferase   Week 4       4    30.5
4 Aspartate Aminotransferase Baseline     4    27
5 Aspartate Aminotransferase Week 2       4    29.2
6 Aspartate Aminotransferase Week 4       4    28
7 Bilirubin                  Baseline     4     1
8 Bilirubin                  Week 2       4     1.05
9 Bilirubin                  Week 4       4     0.98

Key Points: - pivot_longer() converts wide data to long (tidy) format - Use names_sep to split column names into multiple variables - Long format is better for analysis and visualization - Always clean up variable values after pivoting


Exercise 2: Set Up Clinical Domains (R4DS Ch. 13)

adae <- derive_vars_dt( dataset = adae, dtc = AESTDTC, new_vars_prefix = “A”

# Create sample clinical datasets
dm <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004", "001-005", "001-006"),
  AGE = c(45, 67, 52, 71, 34, 58),
  SEX = c("M", "F", "M", "F", "M", "F"),
  ARMCD = c("TRT", "PBO", "TRT", "TRT", "PBO", "TRT"),
  ARM = c("Treatment", "Placebo", "Treatment", "Treatment", "Placebo", "Treatment"),
  RFSTDTC = c("2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19", "2024-01-20")
) %>%
  mutate(ELDERLY = ifelse(AGE >= 65, "Y", "N"))

ae <- tibble(
  USUBJID = c("001-001", "001-001", "001-002", "001-004", "001-007", "001-007"),
  AESEQ = c(1, 2, 1, 1, 1, 2),
  AEDECOD = c("HEADACHE", "NAUSEA", "FATIGUE", "DIZZINESS", "HEADACHE", "VOMITING"),
  AESEV = c("MILD", "MODERATE", "MILD", "SEVERE", "MILD", "MODERATE"),
  AESTDTC = c("2024-01-20", "2024-01-22", "2024-01-21", "2024-01-25", "2024-02-01", "2024-02-03")
)

vs <- tibble(
  USUBJID = rep(c("001-001", "001-002", "001-003", "001-004"), each = 2),
  VSTESTCD = rep(c("SYSBP", "DIABP"), 4),
  VSSTRESN = c(120, 80, 135, 85, 115, 75, 140, 90),
  VISIT = rep("Baseline", 8)
)

# Display datasets
dm
# A tibble: 6 × 7
  USUBJID   AGE SEX   ARMCD ARM       RFSTDTC    ELDERLY
  <chr>   <dbl> <chr> <chr> <chr>     <chr>      <chr>
1 001-001    45 M     TRT   Treatment 2024-01-15 N
2 001-002    67 F     PBO   Placebo   2024-01-16 Y
3 001-003    52 M     TRT   Treatment 2024-01-17 N
4 001-004    71 F     TRT   Treatment 2024-01-18 Y
5 001-005    34 M     PBO   Placebo   2024-01-19 N
6 001-006    58 F     TRT   Treatment 2024-01-20 N      
ae
# A tibble: 6 × 5
  USUBJID AESEQ AEDECOD   AESEV    AESTDTC
  <chr>   <dbl> <chr>     <chr>    <chr>
1 001-001     1 HEADACHE  MILD     2024-01-20
2 001-001     2 NAUSEA    MODERATE 2024-01-22
3 001-002     1 FATIGUE   MILD     2024-01-21
4 001-004     1 DIZZINESS SEVERE   2024-01-25
5 001-007     1 HEADACHE  MILD     2024-02-01
6 001-007     2 VOMITING  MODERATE 2024-02-03
vs
# A tibble: 8 × 4
  USUBJID VSTESTCD VSSTRESN VISIT
  <chr>   <chr>       <dbl> <chr>
1 001-001 SYSBP         120 Baseline
2 001-001 DIABP          80 Baseline
3 001-002 SYSBP         135 Baseline
4 001-002 DIABP          85 Baseline
5 001-003 SYSBP         115 Baseline
6 001-003 DIABP          75 Baseline
7 001-004 SYSBP         140 Baseline
8 001-004 DIABP          90 Baseline

Key Points: - Clinical data is naturally relational (connected by USUBJID) - Each domain serves a specific purpose (DM=demographics, AE=adverse events, VS=vital signs) - Note that some subjects appear in some domains but not others


Exercise 3: Basic Join Operations (R4DS Ch. 13)

# 3a. LEFT JOIN: Add demographics to adverse events
ae_with_demo <- ae %>%
  left_join(dm, by = "USUBJID")

ae_with_demo
# A tibble: 6 × 11
  USUBJID AESEQ AEDECOD   AESEV  AESTDTC   AGE SEX   ARMCD ARM   RFSTDTC ELDERLY
  <chr>   <dbl> <chr>     <chr>  <chr>   <dbl> <chr> <chr> <chr> <chr>   <chr>
1 001-001     1 HEADACHE  MILD   2024-0…    45 M     TRT   Trea… 2024-0… N
2 001-001     2 NAUSEA    MODER… 2024-0…    45 M     TRT   Trea… 2024-0… N
3 001-002     1 FATIGUE   MILD   2024-0…    67 F     PBO   Plac… 2024-0… Y
4 001-004     1 DIZZINESS SEVERE 2024-0…    71 F     TRT   Trea… 2024-0… Y
5 001-007     1 HEADACHE  MILD   2024-0…    NA <NA>  <NA>  <NA>  <NA>    <NA>
6 001-007     2 VOMITING  MODER… 2024-0…    NA <NA>  <NA>  <NA>  <NA>    <NA>   
# 3b. INNER JOIN: Keep only AEs for subjects with complete demographics
ae_complete <- ae %>%
  inner_join(dm, by = "USUBJID")

ae_complete
# A tibble: 4 × 11
  USUBJID AESEQ AEDECOD   AESEV  AESTDTC   AGE SEX   ARMCD ARM   RFSTDTC ELDERLY
  <chr>   <dbl> <chr>     <chr>  <chr>   <dbl> <chr> <chr> <chr> <chr>   <chr>
1 001-001     1 HEADACHE  MILD   2024-0…    45 M     TRT   Trea… 2024-0… N
2 001-001     2 NAUSEA    MODER… 2024-0…    45 M     TRT   Trea… 2024-0… N
3 001-002     1 FATIGUE   MILD   2024-0…    67 F     PBO   Plac… 2024-0… Y
4 001-004     1 DIZZINESS SEVERE 2024-0…    71 F     TRT   Trea… 2024-0… Y      
# 3c. Count subjects in each dataset
cat("Subject counts:\n")
Subject counts:
cat("DM subjects:", n_distinct(dm$USUBJID), "\n")
DM subjects: 6 
cat("AE subjects:", n_distinct(ae$USUBJID), "\n")
AE subjects: 4 
cat("AE with demo (left join):", n_distinct(ae_with_demo$USUBJID), "\n")
AE with demo (left join): 4 
cat("AE complete (inner join):", n_distinct(ae_complete$USUBJID), "\n")
AE complete (inner join): 3 

Key Points: - left_join() preserves all rows from the left dataset (AE) - inner_join() keeps only rows that exist in both datasets - Subject 001-007 exists in AE but not DM, so gets NA values in left_join - Subject 001-007 is excluded from inner_join results


Exercise 4: Filtering Joins (R4DS Ch. 13)

# 4a. SEMI JOIN: Find subjects who had any adverse events
subjects_with_ae <- dm %>%
  semi_join(ae, by = "USUBJID")

subjects_with_ae
# A tibble: 3 × 7
  USUBJID   AGE SEX   ARMCD ARM       RFSTDTC    ELDERLY
  <chr>   <dbl> <chr> <chr> <chr>     <chr>      <chr>
1 001-001    45 M     TRT   Treatment 2024-01-15 N
2 001-002    67 F     PBO   Placebo   2024-01-16 Y
3 001-004    71 F     TRT   Treatment 2024-01-18 Y      
# 4b. ANTI JOIN: Find subjects who had NO adverse events  
subjects_without_ae <- dm %>%
  anti_join(ae, by = "USUBJID")

subjects_without_ae
# A tibble: 3 × 7
  USUBJID   AGE SEX   ARMCD ARM       RFSTDTC    ELDERLY
  <chr>   <dbl> <chr> <chr> <chr>     <chr>      <chr>
1 001-003    52 M     TRT   Treatment 2024-01-17 N
2 001-005    34 M     PBO   Placebo   2024-01-19 N
3 001-006    58 F     TRT   Treatment 2024-01-20 N      

Key Points: - semi_join() returns rows from dm that have matches in ae (but doesn’t add ae columns) - anti_join() returns rows from dm that have NO matches in ae - These are useful for creating population flags and data validation


Exercise 5: Group By and Summarise Practice

# 5a. AE summary by treatment group
ae_by_treatment <- ae_with_demo %>%
  filter(!is.na(ARMCD)) %>%  # Exclude subjects not in DM
  group_by(ARMCD) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    mean_age = round(mean(AGE, na.rm = TRUE), 1),
    .groups = "drop"
  )

ae_by_treatment
# A tibble: 2 × 4
  ARMCD n_subjects n_events mean_age
  <chr>      <int>    <int>    <dbl>
1 PBO            1        1     67
2 TRT            2        3     53.7
# 5b. AE summary by elderly status and severity
ae_by_elderly_severity <- ae_with_demo %>%
  filter(!is.na(ELDERLY)) %>%
  group_by(ELDERLY, AESEV) %>%
  summarise(
    n_events = n(),
    n_subjects = n_distinct(USUBJID),
    .groups = "drop"
  ) %>%
  arrange(ELDERLY, AESEV)

ae_by_elderly_severity
# A tibble: 4 × 4
  ELDERLY AESEV    n_events n_subjects
  <chr>   <chr>       <int>      <int>
1 N       MILD            1          1
2 N       MODERATE        1          1
3 Y       MILD            1          1
4 Y       SEVERE          1          1

Key Points: - Always filter out missing values before grouping - n_distinct() counts unique values (subjects) - n() counts total rows (events) - Use .groups = "drop" to avoid warnings


Exercise 6: Complex Analysis Pipeline

# Comprehensive analysis pipeline
clinical_summary <- ae %>%
  # Step 1: Join with demographics
  left_join(dm, by = "USUBJID") %>%
  # Step 2: Filter to complete cases
  filter(!is.na(ARMCD)) %>%
  # Step 3: Calculate study day and severity flag
  mutate(
    AESTDT = ymd(AESTDTC),
    RFSTDT = ymd(RFSTDTC),
    AEDY = as.numeric(AESTDT - RFSTDT) + 1,
    SEVERE_AE = ifelse(AESEV == "SEVERE", "Y", "N")
  ) %>%
  # Step 4: Group and summarise
  group_by(ARMCD, ELDERLY, SEVERE_AE) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    mean_onset_day = round(mean(AEDY, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  # Step 5: Arrange results
  arrange(ARMCD, ELDERLY, SEVERE_AE)

clinical_summary
# A tibble: 3 × 6
  ARMCD ELDERLY SEVERE_AE n_subjects n_events mean_onset_day
  <chr> <chr>   <chr>          <int>    <int>          <dbl>
1 PBO   Y       N                  1        1              6
2 TRT   N       N                  1        2              7
3 TRT   Y       Y                  1        1              8

Key Points: - Pipelines combine multiple operations in logical sequence - Use ymd() to convert character dates to Date objects - Calculate study days as numeric difference + 1 - Complex groupings reveal detailed patterns in clinical data


Exercise 7: Pivot Practice with Vital Signs

# 7a. Create summary of vital signs by treatment (long format)
vs_summary <- vs %>%
  left_join(dm %>% select(USUBJID, ARMCD), by = "USUBJID") %>%
  group_by(ARMCD, VSTESTCD) %>%
  summarise(mean_val = round(mean(VSSTRESN), 1), .groups = "drop")

vs_summary
# A tibble: 4 × 3
  ARMCD VSTESTCD mean_val
  <chr> <chr>       <dbl>
1 PBO   DIABP        85
2 PBO   SYSBP       135
3 TRT   DIABP        81.7
4 TRT   SYSBP       125  
# 7b. Pivot to wide format for reporting
vs_wide <- vs_summary %>%
  pivot_wider(
    names_from = VSTESTCD,
    values_from = mean_val
  )

vs_wide
# A tibble: 2 × 3
  ARMCD DIABP SYSBP
  <chr> <dbl> <dbl>
1 PBO    85     135
2 TRT    81.7   125

Key Points: - pivot_wider() converts long data to wide format - Wide format is often better for clinical reports and tables - Use names_from to specify column headers and values_from for cell values


Exercise 8: Missing Data Analysis

# 8a. Identify subjects in AE but not in DM
missing_subjects <- ae %>%
  anti_join(dm, by = "USUBJID") %>%
  distinct(USUBJID)

missing_subjects
# A tibble: 1 × 1
  USUBJID
  <chr>
1 001-007
# 8b. Create data quality report
data_quality <- tibble(
  Dataset = c("DM", "AE", "VS"),
  N_Subjects = c(
    n_distinct(dm$USUBJID),
    n_distinct(ae$USUBJID),
    n_distinct(vs$USUBJID)
  ),
  N_Records = c(
    nrow(dm),
    nrow(ae),
    nrow(vs)
  )
)

data_quality
# A tibble: 3 × 3
  Dataset N_Subjects N_Records
  <chr>        <int>     <int>
1 DM               6         6
2 AE               4         6
3 VS               4         8

Key Points: - Use anti_join() to find missing relationships between datasets - Data quality checks are essential in clinical programming - Always validate data completeness before analysis


🎯 Learning Objectives Met

By completing these exercises, you have successfully mastered:

R4DS Chapter 12: Applied tidy data principles with pivot_longer() and pivot_wider()
R4DS Chapter 13: Used all types of joins for relational data analysis

Clinical Programming Skills Gained:

  • Reshaping clinical data between wide and long formats
  • Joining multiple clinical domains (DM, AE, VS)
  • Creating comprehensive clinical analysis pipelines
  • Data quality assessment and missing data identification
  • Complex grouping and summarization for clinical reporting

Next Steps:

  1. Apply these concepts to real clinical trial datasets
  2. Practice with more complex multi-domain joins
  3. Move to Module 4: Advanced data manipulation
  4. Explore tidyr functions for complex reshaping scenarios

You are now ready for advanced clinical data relationships and complex analyses! 🚀