library(dplyr)
library(tidyr)
library(lubridate)
library(stringr)
cat("=== Data Manipulation Examples ===\n")=== Data Manipulation Examples ===
cat("Essential dplyr operations for data analysis\n\n")Essential dplyr operations for data analysis
Practical Data Manipulation Examples with dplyr
Practical Data Manipulation Examples with dplyr
Module 2-3: Data Manipulation, Filtering, Joins, and Summaries
This guide demonstrates essential dplyr operations for clinical data analysis, covering core functions and techniques used in data manipulation workflows.
Data manipulation is fundamental to clinical programming. This document covers:
select, filter, mutate, summarise)library(dplyr)
library(tidyr)
library(lubridate)
library(stringr)
cat("=== Data Manipulation Examples ===\n")=== Data Manipulation Examples ===
cat("Essential dplyr operations for data analysis\n\n")Essential dplyr operations for data analysis
Let’s create sample clinical datasets to demonstrate data manipulation techniques.
# Load validated SDTM datasets (in production, these would be from validated sources)
# Demographics Domain (DM)
dm_sdtm <- data.frame(
STUDYID = rep("ABC-123", 20),
DOMAIN = rep("DM", 20),
USUBJID = paste0("ABC-123-", sprintf("%03d", 1:20)),
SUBJID = sprintf("%03d", 1:20),
SITEID = rep(c("001", "002", "003"), length.out = 20),
BRTHDTC = sample(seq(as.Date("1950-01-01"), as.Date("1990-12-31"), by = "day"), 20),
SEX = sample(c("M", "F"), 20, replace = TRUE, prob = c(0.6, 0.4)),
RACE = sample(c("WHITE", "BLACK OR AFRICAN AMERICAN", "ASIAN"), 20, replace = TRUE),
ETHNIC = sample(c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO"), 20, replace = TRUE),
ARMCD = rep(c("PBO", "TRT"), length.out = 20),
RFSTDTC = as.Date("2024-01-15"),
RFENDTC = as.Date("2024-04-15"),
stringsAsFactors = FALSE
) %>%
mutate(ARM = case_when(
ARMCD == "PBO" ~ "Placebo",
ARMCD == "TRT" ~ "Study Drug 10mg"
))
print("Demographics Domain (DM):")[1] "Demographics Domain (DM):"
print(dm_sdtm) STUDYID DOMAIN USUBJID SUBJID SITEID BRTHDTC SEX
1 ABC-123 DM ABC-123-001 001 001 1960-08-03 M
2 ABC-123 DM ABC-123-002 002 002 1959-01-23 M
3 ABC-123 DM ABC-123-003 003 003 1979-01-01 M
4 ABC-123 DM ABC-123-004 004 001 1990-02-04 F
5 ABC-123 DM ABC-123-005 005 002 1955-07-06 F
6 ABC-123 DM ABC-123-006 006 003 1976-11-29 M
7 ABC-123 DM ABC-123-007 007 001 1972-11-28 F
8 ABC-123 DM ABC-123-008 008 002 1973-08-05 F
9 ABC-123 DM ABC-123-009 009 003 1963-09-06 F
10 ABC-123 DM ABC-123-010 010 001 1986-10-10 F
11 ABC-123 DM ABC-123-011 011 002 1983-06-18 M
12 ABC-123 DM ABC-123-012 012 003 1963-07-13 M
13 ABC-123 DM ABC-123-013 013 001 1962-09-10 M
14 ABC-123 DM ABC-123-014 014 002 1982-02-11 M
15 ABC-123 DM ABC-123-015 015 003 1957-09-28 M
16 ABC-123 DM ABC-123-016 016 001 1982-06-12 M
17 ABC-123 DM ABC-123-017 017 002 1955-01-08 M
18 ABC-123 DM ABC-123-018 018 003 1953-02-16 M
19 ABC-123 DM ABC-123-019 019 001 1963-01-17 M
20 ABC-123 DM ABC-123-020 020 002 1972-05-02 M
RACE ETHNIC ARMCD RFSTDTC RFENDTC
1 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
2 WHITE HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
3 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
4 WHITE HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
5 ASIAN NOT HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
6 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
7 WHITE HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
8 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
9 ASIAN NOT HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
10 ASIAN NOT HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
11 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
12 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
13 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
14 WHITE HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
15 WHITE NOT HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
16 ASIAN NOT HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
17 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
18 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
19 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO PBO 2024-01-15 2024-04-15
20 WHITE NOT HISPANIC OR LATINO TRT 2024-01-15 2024-04-15
ARM
1 Placebo
2 Study Drug 10mg
3 Placebo
4 Study Drug 10mg
5 Placebo
6 Study Drug 10mg
7 Placebo
8 Study Drug 10mg
9 Placebo
10 Study Drug 10mg
11 Placebo
12 Study Drug 10mg
13 Placebo
14 Study Drug 10mg
15 Placebo
16 Study Drug 10mg
17 Placebo
18 Study Drug 10mg
19 Placebo
20 Study Drug 10mg
# Vital Signs Domain (VS) - Multiple records per subject
vs_list <- lapply(dm_sdtm$USUBJID[1:10], function(subj) {
visits <- c("BASELINE", "WEEK 2", "WEEK 4", "WEEK 8", "WEEK 12")
data.frame(
STUDYID = "ABC-123",
DOMAIN = "VS",
USUBJID = subj,
VISIT = rep(visits, 3),
VISITNUM = rep(1:5, 3),
VSTESTCD = rep(c("SYSBP", "DIABP", "PULSE"), each = 5),
VSTEST = rep(c("Systolic Blood Pressure", "Diastolic Blood Pressure", "Pulse Rate"), each = 5),
VSORRES = c(
round(rnorm(5, 125, 15)), # Systolic BP
round(rnorm(5, 80, 10)), # Diastolic BP
round(rnorm(5, 70, 12)) # Pulse
),
VSORRESU = rep(c("mmHg", "mmHg", "beats/min"), each = 5),
VSDTC = rep(seq(as.Date("2024-01-15"), by = "2 weeks", length.out = 5), 3),
stringsAsFactors = FALSE
)
})
vs_sdtm <- do.call(rbind, vs_list) %>%
arrange(USUBJID, VSTESTCD, VISITNUM)
print("Vital Signs Domain (VS) - First 15 records:")[1] "Vital Signs Domain (VS) - First 15 records:"
print(vs_sdtm %>% slice(1:15)) STUDYID DOMAIN USUBJID VISIT VISITNUM VSTESTCD
1 ABC-123 VS ABC-123-001 BASELINE 1 DIABP
2 ABC-123 VS ABC-123-001 WEEK 2 2 DIABP
3 ABC-123 VS ABC-123-001 WEEK 4 3 DIABP
4 ABC-123 VS ABC-123-001 WEEK 8 4 DIABP
5 ABC-123 VS ABC-123-001 WEEK 12 5 DIABP
6 ABC-123 VS ABC-123-001 BASELINE 1 PULSE
7 ABC-123 VS ABC-123-001 WEEK 2 2 PULSE
8 ABC-123 VS ABC-123-001 WEEK 4 3 PULSE
9 ABC-123 VS ABC-123-001 WEEK 8 4 PULSE
10 ABC-123 VS ABC-123-001 WEEK 12 5 PULSE
11 ABC-123 VS ABC-123-001 BASELINE 1 SYSBP
12 ABC-123 VS ABC-123-001 WEEK 2 2 SYSBP
13 ABC-123 VS ABC-123-001 WEEK 4 3 SYSBP
14 ABC-123 VS ABC-123-001 WEEK 8 4 SYSBP
15 ABC-123 VS ABC-123-001 WEEK 12 5 SYSBP
VSTEST VSORRES VSORRESU VSDTC
1 Diastolic Blood Pressure 85 mmHg 2024-01-15
2 Diastolic Blood Pressure 67 mmHg 2024-01-29
3 Diastolic Blood Pressure 88 mmHg 2024-02-12
4 Diastolic Blood Pressure 61 mmHg 2024-02-26
5 Diastolic Blood Pressure 68 mmHg 2024-03-11
6 Pulse Rate 84 beats/min 2024-01-15
7 Pulse Rate 54 beats/min 2024-01-29
8 Pulse Rate 57 beats/min 2024-02-12
9 Pulse Rate 65 beats/min 2024-02-26
10 Pulse Rate 72 beats/min 2024-03-11
11 Systolic Blood Pressure 102 mmHg 2024-01-15
12 Systolic Blood Pressure 142 mmHg 2024-01-29
13 Systolic Blood Pressure 130 mmHg 2024-02-12
14 Systolic Blood Pressure 128 mmHg 2024-02-26
15 Systolic Blood Pressure 129 mmHg 2024-03-11
# Adverse Events Domain (AE)
ae_sdtm <- data.frame(
STUDYID = rep("ABC-123", 12),
DOMAIN = rep("AE", 12),
USUBJID = sample(dm_sdtm$USUBJID, 12, replace = TRUE),
AEDECOD = sample(c("Headache", "Nausea", "Dizziness", "Fatigue", "Insomnia"), 12, replace = TRUE),
AESEV = sample(c("MILD", "MODERATE", "SEVERE"), 12, replace = TRUE, prob = c(0.6, 0.3, 0.1)),
AEREL = sample(c("RELATED", "NOT RELATED", "POSSIBLY RELATED"), 12, replace = TRUE),
AESTDTC = sample(seq(as.Date("2024-01-20"), as.Date("2024-04-10"), by = "day"), 12),
AEENDTC = sample(seq(as.Date("2024-02-01"), as.Date("2024-04-15"), by = "day"), 12),
stringsAsFactors = FALSE
) %>%
arrange(USUBJID, AESTDTC)
print("Adverse Events Domain (AE):")[1] "Adverse Events Domain (AE):"
print(ae_sdtm) STUDYID DOMAIN USUBJID AEDECOD AESEV AEREL AESTDTC
1 ABC-123 AE ABC-123-003 Dizziness MILD POSSIBLY RELATED 2024-03-29
2 ABC-123 AE ABC-123-004 Insomnia MILD POSSIBLY RELATED 2024-02-24
3 ABC-123 AE ABC-123-004 Nausea MILD POSSIBLY RELATED 2024-04-08
4 ABC-123 AE ABC-123-009 Headache MILD NOT RELATED 2024-02-05
5 ABC-123 AE ABC-123-009 Insomnia MODERATE RELATED 2024-03-28
6 ABC-123 AE ABC-123-010 Headache MILD RELATED 2024-04-06
7 ABC-123 AE ABC-123-015 Insomnia MODERATE RELATED 2024-02-07
8 ABC-123 AE ABC-123-015 Nausea MODERATE NOT RELATED 2024-03-25
9 ABC-123 AE ABC-123-016 Nausea MODERATE RELATED 2024-02-26
10 ABC-123 AE ABC-123-018 Nausea MILD RELATED 2024-03-04
11 ABC-123 AE ABC-123-019 Nausea SEVERE NOT RELATED 2024-03-07
12 ABC-123 AE ABC-123-020 Dizziness MILD POSSIBLY RELATED 2024-03-16
AEENDTC
1 2024-02-09
2 2024-04-09
3 2024-03-13
4 2024-02-20
5 2024-03-16
6 2024-02-08
7 2024-02-04
8 2024-02-16
9 2024-02-29
10 2024-02-28
11 2024-04-03
12 2024-03-06
cat("=== SELECT Operations ===\n")=== SELECT Operations ===
# Select specific columns
dm_basic <- dm_sdtm %>%
select(USUBJID, SEX, RACE, ARM)
print("Basic demographics (selected columns):")[1] "Basic demographics (selected columns):"
print(dm_basic) USUBJID SEX RACE ARM
1 ABC-123-001 M BLACK OR AFRICAN AMERICAN Placebo
2 ABC-123-002 M WHITE Study Drug 10mg
3 ABC-123-003 M BLACK OR AFRICAN AMERICAN Placebo
4 ABC-123-004 F WHITE Study Drug 10mg
5 ABC-123-005 F ASIAN Placebo
6 ABC-123-006 M BLACK OR AFRICAN AMERICAN Study Drug 10mg
7 ABC-123-007 F WHITE Placebo
8 ABC-123-008 F BLACK OR AFRICAN AMERICAN Study Drug 10mg
9 ABC-123-009 F ASIAN Placebo
10 ABC-123-010 F ASIAN Study Drug 10mg
11 ABC-123-011 M BLACK OR AFRICAN AMERICAN Placebo
12 ABC-123-012 M BLACK OR AFRICAN AMERICAN Study Drug 10mg
13 ABC-123-013 M BLACK OR AFRICAN AMERICAN Placebo
14 ABC-123-014 M WHITE Study Drug 10mg
15 ABC-123-015 M WHITE Placebo
16 ABC-123-016 M ASIAN Study Drug 10mg
17 ABC-123-017 M BLACK OR AFRICAN AMERICAN Placebo
18 ABC-123-018 M BLACK OR AFRICAN AMERICAN Study Drug 10mg
19 ABC-123-019 M BLACK OR AFRICAN AMERICAN Placebo
20 ABC-123-020 M WHITE Study Drug 10mg
# Select columns using patterns
dm_dates <- dm_sdtm %>%
select(USUBJID, ends_with("DTC"))
print("\nDate columns:")[1] "\nDate columns:"
print(dm_dates) USUBJID BRTHDTC RFSTDTC RFENDTC
1 ABC-123-001 1960-08-03 2024-01-15 2024-04-15
2 ABC-123-002 1959-01-23 2024-01-15 2024-04-15
3 ABC-123-003 1979-01-01 2024-01-15 2024-04-15
4 ABC-123-004 1990-02-04 2024-01-15 2024-04-15
5 ABC-123-005 1955-07-06 2024-01-15 2024-04-15
6 ABC-123-006 1976-11-29 2024-01-15 2024-04-15
7 ABC-123-007 1972-11-28 2024-01-15 2024-04-15
8 ABC-123-008 1973-08-05 2024-01-15 2024-04-15
9 ABC-123-009 1963-09-06 2024-01-15 2024-04-15
10 ABC-123-010 1986-10-10 2024-01-15 2024-04-15
11 ABC-123-011 1983-06-18 2024-01-15 2024-04-15
12 ABC-123-012 1963-07-13 2024-01-15 2024-04-15
13 ABC-123-013 1962-09-10 2024-01-15 2024-04-15
14 ABC-123-014 1982-02-11 2024-01-15 2024-04-15
15 ABC-123-015 1957-09-28 2024-01-15 2024-04-15
16 ABC-123-016 1982-06-12 2024-01-15 2024-04-15
17 ABC-123-017 1955-01-08 2024-01-15 2024-04-15
18 ABC-123-018 1953-02-16 2024-01-15 2024-04-15
19 ABC-123-019 1963-01-17 2024-01-15 2024-04-15
20 ABC-123-020 1972-05-02 2024-01-15 2024-04-15
# Select and rename columns
dm_renamed <- dm_sdtm %>%
select(
Subject_ID = USUBJID,
Gender = SEX,
Treatment = ARM,
Start_Date = RFSTDTC
)
print("\nRenamed columns:")[1] "\nRenamed columns:"
print(head(dm_renamed)) Subject_ID Gender Treatment Start_Date
1 ABC-123-001 M Placebo 2024-01-15
2 ABC-123-002 M Study Drug 10mg 2024-01-15
3 ABC-123-003 M Placebo 2024-01-15
4 ABC-123-004 F Study Drug 10mg 2024-01-15
5 ABC-123-005 F Placebo 2024-01-15
6 ABC-123-006 M Study Drug 10mg 2024-01-15
cat("\n=== FILTER Operations ===\n")
=== FILTER Operations ===
# Simple filtering
males_only <- dm_sdtm %>%
filter(SEX == "M")
print("Male subjects only:")[1] "Male subjects only:"
print(males_only %>% select(USUBJID, SEX, ARM)) USUBJID SEX ARM
1 ABC-123-001 M Placebo
2 ABC-123-002 M Study Drug 10mg
3 ABC-123-003 M Placebo
4 ABC-123-006 M Study Drug 10mg
5 ABC-123-011 M Placebo
6 ABC-123-012 M Study Drug 10mg
7 ABC-123-013 M Placebo
8 ABC-123-014 M Study Drug 10mg
9 ABC-123-015 M Placebo
10 ABC-123-016 M Study Drug 10mg
11 ABC-123-017 M Placebo
12 ABC-123-018 M Study Drug 10mg
13 ABC-123-019 M Placebo
14 ABC-123-020 M Study Drug 10mg
# Multiple conditions with AND
treatment_males <- dm_sdtm %>%
filter(SEX == "M" & ARMCD == "TRT")
print("\nTreatment group males:")[1] "\nTreatment group males:"
print(treatment_males %>% select(USUBJID, SEX, ARM)) USUBJID SEX ARM
1 ABC-123-002 M Study Drug 10mg
2 ABC-123-006 M Study Drug 10mg
3 ABC-123-012 M Study Drug 10mg
4 ABC-123-014 M Study Drug 10mg
5 ABC-123-016 M Study Drug 10mg
6 ABC-123-018 M Study Drug 10mg
7 ABC-123-020 M Study Drug 10mg
# Multiple conditions with OR
minority_subjects <- dm_sdtm %>%
filter(RACE %in% c("BLACK OR AFRICAN AMERICAN", "ASIAN") |
ETHNIC == "HISPANIC OR LATINO")
print("\nMinority subjects:")[1] "\nMinority subjects:"
print(minority_subjects %>% select(USUBJID, RACE, ETHNIC)) USUBJID RACE ETHNIC
1 ABC-123-001 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
2 ABC-123-002 WHITE HISPANIC OR LATINO
3 ABC-123-003 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO
4 ABC-123-004 WHITE HISPANIC OR LATINO
5 ABC-123-005 ASIAN NOT HISPANIC OR LATINO
6 ABC-123-006 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
7 ABC-123-007 WHITE HISPANIC OR LATINO
8 ABC-123-008 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO
9 ABC-123-009 ASIAN NOT HISPANIC OR LATINO
10 ABC-123-010 ASIAN NOT HISPANIC OR LATINO
11 ABC-123-011 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
12 ABC-123-012 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
13 ABC-123-013 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO
14 ABC-123-014 WHITE HISPANIC OR LATINO
15 ABC-123-016 ASIAN NOT HISPANIC OR LATINO
16 ABC-123-017 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
17 ABC-123-018 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
18 ABC-123-019 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO
# Filtering with date conditions
recent_births <- dm_sdtm %>%
filter(BRTHDTC >= as.Date("1980-01-01"))
print("\nSubjects born after 1980:")[1] "\nSubjects born after 1980:"
print(recent_births %>% select(USUBJID, BRTHDTC) %>% arrange(BRTHDTC)) USUBJID BRTHDTC
1 ABC-123-014 1982-02-11
2 ABC-123-016 1982-06-12
3 ABC-123-011 1983-06-18
4 ABC-123-010 1986-10-10
5 ABC-123-004 1990-02-04
cat("\n=== MUTATE Operations ===\n")
=== MUTATE Operations ===
# Calculate age and create age groups
dm_enhanced <- dm_sdtm %>%
mutate(
# Calculate age at study start
AGE = as.numeric(floor(difftime(RFSTDTC, BRTHDTC, units = "days") / 365.25)),
# Create age groups
AGEGROUP = case_when(
AGE < 30 ~ "18-29",
AGE < 50 ~ "30-49",
AGE < 65 ~ "50-64",
TRUE ~ "65+"
),
# Create binary treatment indicator
TREATMENT_FLAG = if_else(ARMCD == "TRT", 1, 0),
# Format treatment description
TREATMENT_DESC = str_to_title(ARM),
# Study duration
STUDY_DAYS = as.numeric(RFENDTC - RFSTDTC)
)
print("Enhanced demographics with calculated variables:")[1] "Enhanced demographics with calculated variables:"
print(dm_enhanced %>%
select(USUBJID, SEX, AGE, AGEGROUP, TREATMENT_DESC, STUDY_DAYS) %>%
head(10)) USUBJID SEX AGE AGEGROUP TREATMENT_DESC STUDY_DAYS
1 ABC-123-001 M 63 50-64 Placebo 91
2 ABC-123-002 M 64 50-64 Study Drug 10mg 91
3 ABC-123-003 M 45 30-49 Placebo 91
4 ABC-123-004 F 33 30-49 Study Drug 10mg 91
5 ABC-123-005 F 68 65+ Placebo 91
6 ABC-123-006 M 47 30-49 Study Drug 10mg 91
7 ABC-123-007 F 51 50-64 Placebo 91
8 ABC-123-008 F 50 50-64 Study Drug 10mg 91
9 ABC-123-009 F 60 50-64 Placebo 91
10 ABC-123-010 F 37 30-49 Study Drug 10mg 91
cat("\n=== ARRANGE Operations ===\n")
=== ARRANGE Operations ===
# Sort by single variable
dm_by_age <- dm_enhanced %>%
arrange(AGE) %>%
select(USUBJID, SEX, AGE, ARM)
print("Subjects sorted by age (ascending):")[1] "Subjects sorted by age (ascending):"
print(head(dm_by_age, 8)) USUBJID SEX AGE ARM
1 ABC-123-004 F 33 Study Drug 10mg
2 ABC-123-010 F 37 Study Drug 10mg
3 ABC-123-011 M 40 Placebo
4 ABC-123-014 M 41 Study Drug 10mg
5 ABC-123-016 M 41 Study Drug 10mg
6 ABC-123-003 M 45 Placebo
7 ABC-123-006 M 47 Study Drug 10mg
8 ABC-123-008 F 50 Study Drug 10mg
# Sort by multiple variables
dm_multi_sort <- dm_enhanced %>%
arrange(ARM, desc(AGE)) %>%
select(USUBJID, ARM, SEX, AGE)
print("\nSorted by treatment, then age (descending):")[1] "\nSorted by treatment, then age (descending):"
print(head(dm_multi_sort, 10)) USUBJID ARM SEX AGE
1 ABC-123-017 Placebo M 69
2 ABC-123-005 Placebo F 68
3 ABC-123-015 Placebo M 66
4 ABC-123-001 Placebo M 63
5 ABC-123-013 Placebo M 61
6 ABC-123-009 Placebo F 60
7 ABC-123-019 Placebo M 60
8 ABC-123-007 Placebo F 51
9 ABC-123-003 Placebo M 45
10 ABC-123-011 Placebo M 40
cat("\n=== GROUP BY and SUMMARISE Operations ===\n")
=== GROUP BY and SUMMARISE Operations ===
# Basic summary by treatment group
dm_summary <- dm_enhanced %>%
group_by(ARM) %>%
summarise(
N = n(),
Mean_Age = round(mean(AGE), 1),
Median_Age = median(AGE),
Min_Age = min(AGE),
Max_Age = max(AGE),
SD_Age = round(sd(AGE), 1),
Female_N = sum(SEX == "F"),
Female_Pct = round(Female_N / N * 100, 1)
)
print("Demographics summary by treatment:")[1] "Demographics summary by treatment:"
print(dm_summary)# A tibble: 2 × 9
ARM N Mean_Age Median_Age Min_Age Max_Age SD_Age Female_N Female_Pct
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 Placebo 10 58.3 60.5 40 69 9.8 3 30
2 Study Dr… 10 49.4 48.5 33 70 12.1 3 30
# Multiple grouping variables
site_treatment_summary <- dm_enhanced %>%
group_by(SITEID, ARM) %>%
summarise(
N = n(),
Mean_Age = round(mean(AGE), 1),
.groups = "drop"
)
print("\nSummary by site and treatment:")[1] "\nSummary by site and treatment:"
print(site_treatment_summary)# A tibble: 6 × 4
SITEID ARM N Mean_Age
<chr> <chr> <int> <dbl>
1 001 Placebo 4 58.8
2 001 Study Drug 10mg 3 37
3 002 Placebo 3 59
4 002 Study Drug 10mg 4 51.5
5 003 Placebo 3 57
6 003 Study Drug 10mg 3 59
# Group-wise calculations with mutate
dm_group_calc <- dm_enhanced %>%
group_by(ARM) %>%
mutate(
# Z-score of age within treatment group
AGE_ZSCORE = round((AGE - mean(AGE)) / sd(AGE), 2),
# Rank within group
AGE_RANK = rank(AGE),
# Group size
GROUP_SIZE = n(),
# Age percentile within group
AGE_PERCENTILE = round(percent_rank(AGE) * 100, 1)
) %>%
ungroup()
print("Group-wise calculations:")[1] "Group-wise calculations:"
print(dm_group_calc %>%
select(USUBJID, ARM, AGE, AGE_ZSCORE, AGE_RANK, AGE_PERCENTILE) %>%
arrange(ARM, AGE_RANK) %>%
head(12))# A tibble: 12 × 6
USUBJID ARM AGE AGE_ZSCORE AGE_RANK AGE_PERCENTILE
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ABC-123-011 Placebo 40 -1.86 1 0
2 ABC-123-003 Placebo 45 -1.35 2 11.1
3 ABC-123-007 Placebo 51 -0.74 3 22.2
4 ABC-123-009 Placebo 60 0.17 4.5 33.3
5 ABC-123-019 Placebo 60 0.17 4.5 33.3
6 ABC-123-013 Placebo 61 0.27 6 55.6
7 ABC-123-001 Placebo 63 0.48 7 66.7
8 ABC-123-015 Placebo 66 0.78 8 77.8
9 ABC-123-005 Placebo 68 0.99 9 88.9
10 ABC-123-017 Placebo 69 1.09 10 100
11 ABC-123-004 Study Drug 10mg 33 -1.35 1 0
12 ABC-123-010 Study Drug 10mg 37 -1.02 2 11.1
cat("\n=== JOIN Operations ===\n")
=== JOIN Operations ===
# Inner join: Only subjects with both demographics and AEs
dm_ae_inner <- dm_sdtm %>%
inner_join(ae_sdtm, by = "USUBJID") %>%
select(USUBJID, SEX, ARM, AEDECOD, AESEV, AEREL)
print("Inner join - Subjects with AEs:")[1] "Inner join - Subjects with AEs:"
print(dm_ae_inner) USUBJID SEX ARM AEDECOD AESEV AEREL
1 ABC-123-003 M Placebo Dizziness MILD POSSIBLY RELATED
2 ABC-123-004 F Study Drug 10mg Insomnia MILD POSSIBLY RELATED
3 ABC-123-004 F Study Drug 10mg Nausea MILD POSSIBLY RELATED
4 ABC-123-009 F Placebo Headache MILD NOT RELATED
5 ABC-123-009 F Placebo Insomnia MODERATE RELATED
6 ABC-123-010 F Study Drug 10mg Headache MILD RELATED
7 ABC-123-015 M Placebo Insomnia MODERATE RELATED
8 ABC-123-015 M Placebo Nausea MODERATE NOT RELATED
9 ABC-123-016 M Study Drug 10mg Nausea MODERATE RELATED
10 ABC-123-018 M Study Drug 10mg Nausea MILD RELATED
11 ABC-123-019 M Placebo Nausea SEVERE NOT RELATED
12 ABC-123-020 M Study Drug 10mg Dizziness MILD POSSIBLY RELATED
# Left join: All subjects, with AE data where available
dm_ae_left <- dm_sdtm %>%
left_join(
ae_sdtm %>%
group_by(USUBJID) %>%
summarise(
Total_AEs = n(),
Severe_AEs = sum(AESEV == "SEVERE"),
Related_AEs = sum(AEREL == "RELATED"),
.groups = "drop"
),
by = "USUBJID"
) %>%
mutate(
Total_AEs = replace_na(Total_AEs, 0),
Severe_AEs = replace_na(Severe_AEs, 0),
Related_AEs = replace_na(Related_AEs, 0)
)
print("\nLeft join - All subjects with AE counts:")[1] "\nLeft join - All subjects with AE counts:"
print(dm_ae_left %>%
select(USUBJID, ARM, Total_AEs, Severe_AEs, Related_AEs) %>%
arrange(desc(Total_AEs))) USUBJID ARM Total_AEs Severe_AEs Related_AEs
1 ABC-123-004 Study Drug 10mg 2 0 0
2 ABC-123-009 Placebo 2 0 1
3 ABC-123-015 Placebo 2 0 1
4 ABC-123-003 Placebo 1 0 0
5 ABC-123-010 Study Drug 10mg 1 0 1
6 ABC-123-016 Study Drug 10mg 1 0 1
7 ABC-123-018 Study Drug 10mg 1 0 1
8 ABC-123-019 Placebo 1 1 0
9 ABC-123-020 Study Drug 10mg 1 0 0
10 ABC-123-001 Placebo 0 0 0
11 ABC-123-002 Study Drug 10mg 0 0 0
12 ABC-123-005 Placebo 0 0 0
13 ABC-123-006 Study Drug 10mg 0 0 0
14 ABC-123-007 Placebo 0 0 0
15 ABC-123-008 Study Drug 10mg 0 0 0
16 ABC-123-011 Placebo 0 0 0
17 ABC-123-012 Study Drug 10mg 0 0 0
18 ABC-123-013 Placebo 0 0 0
19 ABC-123-014 Study Drug 10mg 0 0 0
20 ABC-123-017 Placebo 0 0 0
# Join demographics with baseline vital signs
baseline_vitals <- vs_sdtm %>%
filter(VISIT == "BASELINE") %>%
select(USUBJID, VSTESTCD, VSORRES) %>%
pivot_wider(
names_from = VSTESTCD,
values_from = VSORRES,
names_prefix = "BL_"
)
dm_vitals <- dm_enhanced %>%
left_join(baseline_vitals, by = "USUBJID") %>%
filter(!is.na(BL_SYSBP)) # Only subjects with baseline vitals
print("Demographics with baseline vitals:")[1] "Demographics with baseline vitals:"
print(dm_vitals %>%
select(USUBJID, ARM, AGE, BL_SYSBP, BL_DIABP, BL_PULSE) %>%
head(8)) USUBJID ARM AGE BL_SYSBP BL_DIABP BL_PULSE
1 ABC-123-001 Placebo 63 102 85 84
2 ABC-123-002 Study Drug 10mg 64 115 91 67
3 ABC-123-003 Placebo 45 129 61 79
4 ABC-123-004 Study Drug 10mg 33 149 87 99
5 ABC-123-005 Placebo 68 134 92 93
6 ABC-123-006 Study Drug 10mg 47 122 80 82
7 ABC-123-007 Placebo 51 97 75 74
8 ABC-123-008 Study Drug 10mg 50 113 88 70
cat("\n=== Conditional Operations ===\n")
=== Conditional Operations ===
# Complex conditional logic
dm_risk_assessment <- dm_enhanced %>%
mutate(
# Risk categorization based on multiple factors
RISK_CATEGORY = case_when(
AGE >= 65 & SEX == "M" ~ "High Risk",
AGE >= 65 & SEX == "F" ~ "Moderate Risk",
AGE < 65 & SEX == "M" ~ "Moderate Risk",
AGE < 65 & SEX == "F" ~ "Low Risk",
TRUE ~ "Unknown"
),
# Eligibility criteria
ELIGIBLE = case_when(
AGE < 18 ~ "Too Young",
AGE > 75 ~ "Too Old",
TRUE ~ "Eligible"
),
# Priority scoring
PRIORITY_SCORE = case_when(
ARMCD == "TRT" & AGE >= 65 ~ 3,
ARMCD == "TRT" & AGE < 65 ~ 2,
ARMCD == "PBO" ~ 1,
TRUE ~ 0
)
)
print("Risk assessment and eligibility:")[1] "Risk assessment and eligibility:"
print(dm_risk_assessment %>%
select(USUBJID, AGE, SEX, ARM, RISK_CATEGORY, ELIGIBLE, PRIORITY_SCORE) %>%
arrange(desc(PRIORITY_SCORE))) USUBJID AGE SEX ARM RISK_CATEGORY ELIGIBLE PRIORITY_SCORE
1 ABC-123-018 70 M Study Drug 10mg High Risk Eligible 3
2 ABC-123-002 64 M Study Drug 10mg Moderate Risk Eligible 2
3 ABC-123-004 33 F Study Drug 10mg Low Risk Eligible 2
4 ABC-123-006 47 M Study Drug 10mg Moderate Risk Eligible 2
5 ABC-123-008 50 F Study Drug 10mg Low Risk Eligible 2
6 ABC-123-010 37 F Study Drug 10mg Low Risk Eligible 2
7 ABC-123-012 60 M Study Drug 10mg Moderate Risk Eligible 2
8 ABC-123-014 41 M Study Drug 10mg Moderate Risk Eligible 2
9 ABC-123-016 41 M Study Drug 10mg Moderate Risk Eligible 2
10 ABC-123-020 51 M Study Drug 10mg Moderate Risk Eligible 2
11 ABC-123-001 63 M Placebo Moderate Risk Eligible 1
12 ABC-123-003 45 M Placebo Moderate Risk Eligible 1
13 ABC-123-005 68 F Placebo Moderate Risk Eligible 1
14 ABC-123-007 51 F Placebo Low Risk Eligible 1
15 ABC-123-009 60 F Placebo Low Risk Eligible 1
16 ABC-123-011 40 M Placebo Moderate Risk Eligible 1
17 ABC-123-013 61 M Placebo Moderate Risk Eligible 1
18 ABC-123-015 66 M Placebo High Risk Eligible 1
19 ABC-123-017 69 M Placebo High Risk Eligible 1
20 ABC-123-019 60 M Placebo Moderate Risk Eligible 1
# Advanced string operations
dm_strings <- dm_sdtm %>%
mutate(
# Extract site region from site ID
REGION = case_when(
str_detect(SITEID, "^00[1-2]") ~ "North America",
str_detect(SITEID, "^003") ~ "Europe",
TRUE ~ "Other"
),
# Format subject ID for display
DISPLAY_ID = str_replace(USUBJID, "ABC-123-", "Subject "),
# Create composite descriptions
SUBJECT_DESC = paste(SEX, RACE, "subject in", ARM, "group"),
# Clean and standardize race
RACE_CLEAN = str_to_title(str_to_lower(RACE))
)
print("String manipulation examples:")[1] "String manipulation examples:"
print(dm_strings %>%
select(USUBJID, DISPLAY_ID, REGION, SUBJECT_DESC) %>%
head(8)) USUBJID DISPLAY_ID REGION
1 ABC-123-001 Subject 001 North America
2 ABC-123-002 Subject 002 North America
3 ABC-123-003 Subject 003 Europe
4 ABC-123-004 Subject 004 North America
5 ABC-123-005 Subject 005 North America
6 ABC-123-006 Subject 006 Europe
7 ABC-123-007 Subject 007 North America
8 ABC-123-008 Subject 008 North America
SUBJECT_DESC
1 M BLACK OR AFRICAN AMERICAN subject in Placebo group
2 M WHITE subject in Study Drug 10mg group
3 M BLACK OR AFRICAN AMERICAN subject in Placebo group
4 F WHITE subject in Study Drug 10mg group
5 F ASIAN subject in Placebo group
6 M BLACK OR AFRICAN AMERICAN subject in Study Drug 10mg group
7 F WHITE subject in Placebo group
8 F BLACK OR AFRICAN AMERICAN subject in Study Drug 10mg group
# Window functions for rankings and running calculations
vs_analysis <- vs_sdtm %>%
filter(VSTESTCD == "SYSBP") %>%
group_by(USUBJID) %>%
arrange(USUBJID, VISITNUM) %>%
mutate(
# Running mean
RUNNING_MEAN = round(cummean(VSORRES), 1),
# Change from baseline
CHANGE_BL = VSORRES - first(VSORRES),
# Percent change from baseline
PCT_CHANGE_BL = round((VSORRES - first(VSORRES)) / first(VSORRES) * 100, 1),
# Visit rank (within subject)
VISIT_RANK = row_number(),
# Flag last visit
LAST_VISIT = if_else(row_number() == n(), "Y", "N")
) %>%
ungroup()
print("Window functions - Systolic BP analysis:")[1] "Window functions - Systolic BP analysis:"
print(vs_analysis %>%
select(USUBJID, VISIT, VSORRES, RUNNING_MEAN, CHANGE_BL, PCT_CHANGE_BL, LAST_VISIT) %>%
head(15))# A tibble: 15 × 7
USUBJID VISIT VSORRES RUNNING_MEAN CHANGE_BL PCT_CHANGE_BL LAST_VISIT
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 ABC-123-001 BASELINE 102 102 0 0 N
2 ABC-123-001 WEEK 2 142 122 40 39.2 N
3 ABC-123-001 WEEK 4 130 125. 28 27.5 N
4 ABC-123-001 WEEK 8 128 126. 26 25.5 N
5 ABC-123-001 WEEK 12 129 126. 27 26.5 Y
6 ABC-123-002 BASELINE 115 115 0 0 N
7 ABC-123-002 WEEK 2 111 113 -4 -3.5 N
8 ABC-123-002 WEEK 4 106 111. -9 -7.8 N
9 ABC-123-002 WEEK 8 153 121. 38 33 N
10 ABC-123-002 WEEK 12 106 118. -9 -7.8 Y
11 ABC-123-003 BASELINE 129 129 0 0 N
12 ABC-123-003 WEEK 2 94 112. -35 -27.1 N
13 ABC-123-003 WEEK 4 127 117. -2 -1.6 N
14 ABC-123-003 WEEK 8 98 112 -31 -24 N
15 ABC-123-003 WEEK 12 105 111. -24 -18.6 Y
# Comprehensive treatment group comparison
treatment_comparison <- dm_enhanced %>%
group_by(ARM) %>%
summarise(
# Sample size
N = n(),
# Age statistics
Age_Mean = round(mean(AGE), 1),
Age_SD = round(sd(AGE), 1),
Age_Median = median(AGE),
Age_Min = min(AGE),
Age_Max = max(AGE),
# Gender distribution
Female_N = sum(SEX == "F"),
Female_Pct = round(Female_N / N * 100, 1),
Male_N = sum(SEX == "M"),
Male_Pct = round(Male_N / N * 100, 1),
# Race distribution
White_N = sum(RACE == "WHITE"),
White_Pct = round(White_N / N * 100, 1),
.groups = "drop"
)
print("Treatment group comparison:")[1] "Treatment group comparison:"
print(treatment_comparison)# A tibble: 2 × 13
ARM N Age_Mean Age_SD Age_Median Age_Min Age_Max Female_N Female_Pct
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 Placebo 10 58.3 9.8 60.5 40 69 3 30
2 Study Dr… 10 49.4 12.1 48.5 33 70 3 30
# ℹ 4 more variables: Male_N <int>, Male_Pct <dbl>, White_N <int>,
# White_Pct <dbl>
# Site-level enrollment and demographics
site_analysis <- dm_enhanced %>%
group_by(SITEID) %>%
summarise(
Total_Enrolled = n(),
Treatment_N = sum(ARMCD == "TRT"),
Placebo_N = sum(ARMCD == "PBO"),
Randomization_Ratio = round(Treatment_N / Placebo_N, 2),
Mean_Age = round(mean(AGE), 1),
Female_Pct = round(sum(SEX == "F") / n() * 100, 1),
.groups = "drop"
) %>%
arrange(desc(Total_Enrolled))
print("Site-level analysis:")[1] "Site-level analysis:"
print(site_analysis)# A tibble: 3 × 7
SITEID Total_Enrolled Treatment_N Placebo_N Randomization_Ratio Mean_Age
<chr> <int> <int> <int> <dbl> <dbl>
1 001 7 3 4 0.75 49.4
2 002 7 4 3 1.33 54.7
3 003 6 3 3 1 58
# ℹ 1 more variable: Female_Pct <dbl>
%>%): Chain operations for readable codegroup_by() for category-wise calculationscase_when() for complex categorizationsselect() to keep only needed columnsdata.table for very large datasetsThis guide demonstrates essential data manipulation techniques for clinical programming using modern R practices with dplyr.