# Load required packages
library(tidyverse) # Includes dplyr, tibble, readr, and %>% operator
library(lubridate) # Date manipulation
library(haven) # SAS/XPT import
# Check setup
packageVersion("tidyverse")[1] '2.0.0'
Complete solutions for Module 2 exercises based on R4DS Chapters 18, 5, 10, and 11
# Load required packages
library(tidyverse) # Includes dplyr, tibble, readr, and %>% operator
library(lubridate) # Date manipulation
library(haven) # SAS/XPT import
# Check setup
packageVersion("tidyverse")[1] '2.0.0'
# Create demographics dataset using tibble() - preferred over data.frame()
dm <- tibble(
USUBJID = c("001-001", "001-002", "001-003", "001-004", "001-005", "001-006", "001-007", "001-008"),
AGE = c(28, 45, 67, 52, 71, 34, 58, 76),
SEX = c("F", "M", "F", "M", "F", "M", "F", "M"),
RFSTDTC = c("2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19", "2024-01-20", "2024-01-21", "2024-01-22"),
COUNTRY = c("USA", "CAN", "USA", "GBR", "USA", "CAN", "GBR", "USA"),
ARMCD = c("TRT", "PBO", "TRT", "TRT", "PBO", "TRT", "PBO", "TRT")
)
# Display dataset (note how tibbles print better than data.frames)
dm# A tibble: 8 × 6
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD
<chr> <dbl> <chr> <chr> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT
2 001-002 45 M 2024-01-16 CAN PBO
3 001-003 67 F 2024-01-17 USA TRT
4 001-004 52 M 2024-01-18 GBR TRT
5 001-005 71 F 2024-01-19 USA PBO
6 001-006 34 M 2024-01-20 CAN TRT
7 001-007 58 F 2024-01-21 GBR PBO
8 001-008 76 M 2024-01-22 USA TRT
# Use glimpse() to see structure (better than str() for tibbles)
glimpse(dm)Rows: 8
Columns: 6
$ USUBJID <chr> "001-001", "001-002", "001-003", "001-004", "001-005", "001-00…
$ AGE <dbl> 28, 45, 67, 52, 71, 34, 58, 76
$ SEX <chr> "F", "M", "F", "M", "F", "M", "F", "M"
$ RFSTDTC <chr> "2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-…
$ COUNTRY <chr> "USA", "CAN", "USA", "GBR", "USA", "CAN", "GBR", "USA"
$ ARMCD <chr> "TRT", "PBO", "TRT", "TRT", "PBO", "TRT", "PBO", "TRT"
Key Points: - tibble() creates enhanced data frames with better printing - Tibbles show data types and dimensions clearly - glimpse() is the tidyverse equivalent of str() - Tibbles never convert strings to factors automatically
# 2a. Import demographics from CSV file (common for EDC exports)
dm_csv <- read_csv("data/demographics.csv",
col_types = cols(
USUBJID = col_character(),
AGE = col_double(),
SEX = col_character(),
RFSTDTC = col_character()
))
# 2b. Import lab data from Excel (common for central lab data)
lab_excel <- read_excel("data/lab_results.xlsx",
sheet = "Lab_Results",
skip = 1) # Skip header row if needed
# 2c. Import legacy SAS dataset (common in established clinical programs)
dm_sas <- read_sas("legacy_data/dm.sas7bdat")
dm_xpt <- read_xpt("submission/dm.xpt") # Regulatory submission formatKey Points: - Always specify column types explicitly for clinical data - Use read_excel() with sheet parameter for multi-sheet files - read_sas() and read_xpt() work with legacy clinical datasets - Check problems() after import to identify parsing issues
# 1. Filter subjects who are 65 years or older
elderly_subjects <- dm %>%
filter(AGE >= 65)
elderly_subjects# A tibble: 3 × 6
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD
<chr> <dbl> <chr> <chr> <chr> <chr>
1 001-003 67 F 2024-01-17 USA TRT
2 001-005 71 F 2024-01-19 USA PBO
3 001-008 76 M 2024-01-22 USA TRT
# 2. Filter female subjects only
female_subjects <- dm %>%
filter(SEX == "F")
female_subjects# A tibble: 4 × 6
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD
<chr> <dbl> <chr> <chr> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT
2 001-003 67 F 2024-01-17 USA TRT
3 001-005 71 F 2024-01-19 USA PBO
4 001-007 58 F 2024-01-21 GBR PBO
# 3. Filter subjects on treatment (ARMCD == "TRT") who are over 50
treatment_over_50 <- dm %>%
filter(ARMCD == "TRT" & AGE > 50)
treatment_over_50# A tibble: 3 × 6
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD
<chr> <dbl> <chr> <chr> <chr> <chr>
1 001-003 67 F 2024-01-17 USA TRT
2 001-004 52 M 2024-01-18 GBR TRT
3 001-008 76 M 2024-01-22 USA TRT
# 4. Filter subjects from North America (USA or CAN)
north_america <- dm %>%
filter(COUNTRY %in% c("USA", "CAN"))
north_america# A tibble: 6 × 6
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD
<chr> <dbl> <chr> <chr> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT
2 001-002 45 M 2024-01-16 CAN PBO
3 001-003 67 F 2024-01-17 USA TRT
4 001-005 71 F 2024-01-19 USA PBO
5 001-006 34 M 2024-01-20 CAN TRT
6 001-008 76 M 2024-01-22 USA TRT
Key Points: - Use comparison operators: >=, >, ==, != - Combine conditions with & (AND) and | (OR) - Use %in% to check if values are in a vector - Multiple filter conditions can be separated by commas (implicit AND)
# 1. Select only USUBJID, AGE, and SEX
basic_demo <- dm %>%
select(USUBJID, AGE, SEX)
basic_demo# A tibble: 8 × 3
USUBJID AGE SEX
<chr> <dbl> <chr>
1 001-001 28 F
2 001-002 45 M
3 001-003 67 F
4 001-004 52 M
5 001-005 71 F
6 001-006 34 M
7 001-007 58 F
8 001-008 76 M
# 2. Select all columns except COUNTRY
no_country <- dm %>%
select(-COUNTRY)
no_country# A tibble: 8 × 5
USUBJID AGE SEX RFSTDTC ARMCD
<chr> <dbl> <chr> <chr> <chr>
1 001-001 28 F 2024-01-15 TRT
2 001-002 45 M 2024-01-16 PBO
3 001-003 67 F 2024-01-17 TRT
4 001-004 52 M 2024-01-18 TRT
5 001-005 71 F 2024-01-19 PBO
6 001-006 34 M 2024-01-20 TRT
7 001-007 58 F 2024-01-21 PBO
8 001-008 76 M 2024-01-22 TRT
# 3. Select USUBJID and all columns that start with "A" (AGE, ARMCD)
usubjid_and_a_vars <- dm %>%
select(USUBJID, starts_with("A"))
usubjid_and_a_vars# A tibble: 8 × 3
USUBJID AGE ARMCD
<chr> <dbl> <chr>
1 001-001 28 TRT
2 001-002 45 PBO
3 001-003 67 TRT
4 001-004 52 TRT
5 001-005 71 PBO
6 001-006 34 TRT
7 001-007 58 PBO
8 001-008 76 TRT
Key Points: - Use column names directly: select(col1, col2, col3) - Use - to exclude columns: select(-col1, -col2) - Use helper functions: starts_with(), ends_with(), contains() - Can combine patterns: select(USUBJID, starts_with("A"))
# Add derived variables to dm
dm <- dm %>%
mutate(
# 1. ELDERLY: "Y" if AGE >= 65, "N" otherwise
ELDERLY = ifelse(AGE >= 65, "Y", "N"),
# 2. RFSTDT: Convert RFSTDTC to Date format using ymd()
RFSTDT = ymd(RFSTDTC),
# 3. AGEGRP: "18-39", "40-64", "65+" based on age ranges
AGEGRP = case_when(
AGE < 40 ~ "18-39",
AGE < 65 ~ "40-64",
AGE >= 65 ~ "65+"
),
# 4. FEMALE: "Y" if SEX == "F", "N" otherwise
FEMALE = ifelse(SEX == "F", "Y", "N"),
# 5. TREATMENT: "Active" if ARMCD == "TRT", "Placebo" otherwise
TREATMENT = ifelse(ARMCD == "TRT", "Active", "Placebo")
)
# Display the updated dataset
dm# A tibble: 8 × 11
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD ELDERLY RFSTDT AGEGRP FEMALE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <date> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT N 2024-01-15 18-39 Y
2 001-002 45 M 2024-01-16 CAN PBO N 2024-01-16 40-64 N
3 001-003 67 F 2024-01-17 USA TRT Y 2024-01-17 65+ Y
4 001-004 52 M 2024-01-18 GBR TRT N 2024-01-18 40-64 N
5 001-005 71 F 2024-01-19 USA PBO Y 2024-01-19 65+ Y
6 001-006 34 M 2024-01-20 CAN TRT N 2024-01-20 18-39 N
7 001-007 58 F 2024-01-21 GBR PBO N 2024-01-21 40-64 Y
8 001-008 76 M 2024-01-22 USA TRT Y 2024-01-22 65+ N
# ℹ 1 more variable: TREATMENT <chr>
glimpse(dm)Rows: 8
Columns: 11
$ USUBJID <chr> "001-001", "001-002", "001-003", "001-004", "001-005", "001-…
$ AGE <dbl> 28, 45, 67, 52, 71, 34, 58, 76
$ SEX <chr> "F", "M", "F", "M", "F", "M", "F", "M"
$ RFSTDTC <chr> "2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "202…
$ COUNTRY <chr> "USA", "CAN", "USA", "GBR", "USA", "CAN", "GBR", "USA"
$ ARMCD <chr> "TRT", "PBO", "TRT", "TRT", "PBO", "TRT", "PBO", "TRT"
$ ELDERLY <chr> "N", "N", "Y", "N", "Y", "N", "N", "Y"
$ RFSTDT <date> 2024-01-15, 2024-01-16, 2024-01-17, 2024-01-18, 2024-01-19, …
$ AGEGRP <chr> "18-39", "40-64", "65+", "40-64", "65+", "18-39", "40-64", "…
$ FEMALE <chr> "Y", "N", "Y", "N", "Y", "N", "Y", "N"
$ TREATMENT <chr> "Active", "Placebo", "Active", "Active", "Placebo", "Active"…
Key Points: - ifelse() for simple binary conditions - case_when() for multiple conditions (like SAS IF-THEN-ELSE IF) - ymd() converts character dates to Date objects - Always end case_when() with TRUE ~ default_value for completeness
# 1. Sort by age (ascending)
dm_by_age <- dm %>%
arrange(AGE)
dm_by_age# A tibble: 8 × 11
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD ELDERLY RFSTDT AGEGRP FEMALE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <date> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT N 2024-01-15 18-39 Y
2 001-006 34 M 2024-01-20 CAN TRT N 2024-01-20 18-39 N
3 001-002 45 M 2024-01-16 CAN PBO N 2024-01-16 40-64 N
4 001-004 52 M 2024-01-18 GBR TRT N 2024-01-18 40-64 N
5 001-007 58 F 2024-01-21 GBR PBO N 2024-01-21 40-64 Y
6 001-003 67 F 2024-01-17 USA TRT Y 2024-01-17 65+ Y
7 001-005 71 F 2024-01-19 USA PBO Y 2024-01-19 65+ Y
8 001-008 76 M 2024-01-22 USA TRT Y 2024-01-22 65+ N
# ℹ 1 more variable: TREATMENT <chr>
# 2. Sort by country, then by age (descending)
dm_by_country_age <- dm %>%
arrange(COUNTRY, desc(AGE))
dm_by_country_age# A tibble: 8 × 11
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD ELDERLY RFSTDT AGEGRP FEMALE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <date> <chr> <chr>
1 001-002 45 M 2024-01-16 CAN PBO N 2024-01-16 40-64 N
2 001-006 34 M 2024-01-20 CAN TRT N 2024-01-20 18-39 N
3 001-007 58 F 2024-01-21 GBR PBO N 2024-01-21 40-64 Y
4 001-004 52 M 2024-01-18 GBR TRT N 2024-01-18 40-64 N
5 001-008 76 M 2024-01-22 USA TRT Y 2024-01-22 65+ N
6 001-005 71 F 2024-01-19 USA PBO Y 2024-01-19 65+ Y
7 001-003 67 F 2024-01-17 USA TRT Y 2024-01-17 65+ Y
8 001-001 28 F 2024-01-15 USA TRT N 2024-01-15 18-39 Y
# ℹ 1 more variable: TREATMENT <chr>
# 3. Sort by treatment arm, then by elderly flag, then by age
dm_complex_sort <- dm %>%
arrange(ARMCD, ELDERLY, AGE)
dm_complex_sort# A tibble: 8 × 11
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD ELDERLY RFSTDT AGEGRP FEMALE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <date> <chr> <chr>
1 001-002 45 M 2024-01-16 CAN PBO N 2024-01-16 40-64 N
2 001-007 58 F 2024-01-21 GBR PBO N 2024-01-21 40-64 Y
3 001-005 71 F 2024-01-19 USA PBO Y 2024-01-19 65+ Y
4 001-001 28 F 2024-01-15 USA TRT N 2024-01-15 18-39 Y
5 001-006 34 M 2024-01-20 CAN TRT N 2024-01-20 18-39 N
6 001-004 52 M 2024-01-18 GBR TRT N 2024-01-18 40-64 N
7 001-003 67 F 2024-01-17 USA TRT Y 2024-01-17 65+ Y
8 001-008 76 M 2024-01-22 USA TRT Y 2024-01-22 65+ N
# ℹ 1 more variable: TREATMENT <chr>
Key Points: - Default sorting is ascending - Use desc() for descending order - Multiple variables create nested sorting - Character variables sort alphabetically
# 7a. Calculate overall statistics for all subjects
overall_stats <- dm %>%
summarise(
n = n(),
mean_age = mean(AGE),
median_age = median(AGE),
sd_age = sd(AGE),
min_age = min(AGE),
max_age = max(AGE)
)
overall_stats# A tibble: 1 × 6
n mean_age median_age sd_age min_age max_age
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 8 53.9 55 17.4 28 76
# 7b. Calculate statistics by sex
sex_stats <- dm %>%
group_by(SEX) %>%
summarise(
n = n(),
mean_age = round(mean(AGE), 1),
elderly_count = sum(AGE >= 65),
elderly_pct = round(100 * mean(AGE >= 65), 1),
.groups = "drop"
)
sex_stats# A tibble: 2 × 5
SEX n mean_age elderly_count elderly_pct
<chr> <int> <dbl> <int> <dbl>
1 F 4 56 2 50
2 M 4 51.8 1 25
# 7c. Calculate clinical trial enrollment summary
enrollment_summary <- dm %>%
group_by(COUNTRY, ARMCD) %>%
summarise(
subjects = n(),
mean_age = round(mean(AGE), 1),
elderly_pct = round(100 * mean(AGE >= 65), 1),
female_pct = round(100 * mean(SEX == "F"), 1),
.groups = "drop"
)
enrollment_summary# A tibble: 6 × 6
COUNTRY ARMCD subjects mean_age elderly_pct female_pct
<chr> <chr> <int> <dbl> <dbl> <dbl>
1 CAN PBO 1 45 0 0
2 CAN TRT 1 34 0 0
3 GBR PBO 1 58 0 100
4 GBR TRT 1 52 0 0
5 USA PBO 1 71 100 100
6 USA TRT 3 57 66.7 66.7
Key Points: - summarise() reduces groups to single rows - n() counts observations in each group - Use sum(condition) to count TRUE values - Use mean(condition) to get proportions - Always use .groups = "drop" to avoid warnings
# Create a pipeline that:
# 1. Filters subjects >= 40 years old
# 2. Creates a variable MATURE = "Y" (since all will be >= 40)
# 3. Sorts by treatment arm, then age
# 4. Selects relevant variables
mature_subjects <- dm %>%
filter(AGE >= 40) %>% # Step 1: Adults 40+
mutate(MATURE = "Y") %>% # Step 2: Add flag
arrange(ARMCD, AGE) %>% # Step 3: Sort
select(USUBJID, AGE, SEX, ARMCD, TREATMENT, ELDERLY, MATURE) # Step 4: Select
mature_subjects# A tibble: 6 × 7
USUBJID AGE SEX ARMCD TREATMENT ELDERLY MATURE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 001-002 45 M PBO Placebo N Y
2 001-007 58 F PBO Placebo N Y
3 001-005 71 F PBO Placebo Y Y
4 001-004 52 M TRT Active N Y
5 001-003 67 F TRT Active Y Y
6 001-008 76 M TRT Active Y Y
Key Points: - Pipes (%>%) make code read left-to-right like English - Each step builds on the previous result - Better readability than nested functions - Can span multiple lines for complex operations
# Create a version of dm with some missing ages
dm_with_na <- dm %>%
mutate(AGE = case_when(
USUBJID == "001-002" ~ NA_real_,
USUBJID == "001-005" ~ NA_real_,
TRUE ~ as.numeric(AGE)
))
# Create elderly flag that handles missing values properly
dm_with_na <- dm_with_na %>%
mutate(
ELDERLY_SAFE = case_when(
is.na(AGE) ~ NA_character_, # Handle missing AGE values
AGE >= 65 ~ "Y",
TRUE ~ "N"
)
)
dm_with_na# A tibble: 8 × 12
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD ELDERLY RFSTDT AGEGRP FEMALE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <date> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT N 2024-01-15 18-39 Y
2 001-002 NA M 2024-01-16 CAN PBO N 2024-01-16 40-64 N
3 001-003 67 F 2024-01-17 USA TRT Y 2024-01-17 65+ Y
4 001-004 52 M 2024-01-18 GBR TRT N 2024-01-18 40-64 N
5 001-005 NA F 2024-01-19 USA PBO Y 2024-01-19 65+ Y
6 001-006 34 M 2024-01-20 CAN TRT N 2024-01-20 18-39 N
7 001-007 58 F 2024-01-21 GBR PBO N 2024-01-21 40-64 Y
8 001-008 76 M 2024-01-22 USA TRT Y 2024-01-22 65+ N
# ℹ 2 more variables: TREATMENT <chr>, ELDERLY_SAFE <chr>
Key Points: - Use is.na() to check for missing values - Use NA_character_, NA_real_, NA_integer_ for typed missing values - Always handle missing values explicitly in clinical programming - case_when() is preferred over ifelse() for missing value handling
# Create BMI categories assuming average BMI values
dm <- dm %>%
mutate(
# Simulate BMI based on age (just for demonstration)
BMI = case_when(
AGE < 40 ~ 24.5,
AGE < 65 ~ 26.8,
TRUE ~ 25.2
),
BMI_CAT = case_when(
BMI < 18.5 ~ "Underweight",
BMI < 25 ~ "Normal",
BMI < 30 ~ "Overweight",
TRUE ~ "Obese"
)
)
# Flag subjects enrolled in January 2024
dm <- dm %>%
mutate(JAN_2024 = ifelse(substr(RFSTDTC, 1, 7) == "2024-01", "Y", "N"))
# Create a site number from USUBJID (first 3 digits)
dm <- dm %>%
mutate(SITEID = substr(USUBJID, 1, 3))
# Display final dataset
dm# A tibble: 8 × 15
USUBJID AGE SEX RFSTDTC COUNTRY ARMCD ELDERLY RFSTDT AGEGRP FEMALE
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <date> <chr> <chr>
1 001-001 28 F 2024-01-15 USA TRT N 2024-01-15 18-39 Y
2 001-002 45 M 2024-01-16 CAN PBO N 2024-01-16 40-64 N
3 001-003 67 F 2024-01-17 USA TRT Y 2024-01-17 65+ Y
4 001-004 52 M 2024-01-18 GBR TRT N 2024-01-18 40-64 N
5 001-005 71 F 2024-01-19 USA PBO Y 2024-01-19 65+ Y
6 001-006 34 M 2024-01-20 CAN TRT N 2024-01-20 18-39 N
7 001-007 58 F 2024-01-21 GBR PBO N 2024-01-21 40-64 Y
8 001-008 76 M 2024-01-22 USA TRT Y 2024-01-22 65+ N
# ℹ 5 more variables: TREATMENT <chr>, BMI <dbl>, BMI_CAT <chr>,
# JAN_2024 <chr>, SITEID <chr>
Key Points: - Copilot suggestions are context-aware - Write descriptive comments for better suggestions - Always verify and test generated code - Clinical context helps generate relevant code
By completing these exercises, you have successfully mastered:
✅ R4DS Chapter 18: Used pipes (%>%) for readable data workflows
✅ R4DS Chapter 5: Applied all five data transformation verbs
✅ R4DS Chapter 10: Created and worked with tibbles effectively
✅ R4DS Chapter 11: Understood clinical data import patterns
You are now ready for advanced data manipulation and joins! 🚀
You can try these prompts in Copilot to assist you: - # Convert AESTDAT to AESTDTC - # Derive AEDY from AESTDTC and RFSTDTC - # Create AETPT based on AETM - # Assign AESEQ by subject - # Create AESPID using FORMID
AESTDTC, AEDY, AETPT, AESEQ, AESPIDlabelled::var_label().xpt