Module 5 Solutions — Functions, Vectors & Iteration


🔧 Module 5 — Functions, Vectors & Iteration

📝 Solutions

📦 Load Libraries

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tibble) 
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(stringr)
library(purrr)

✅ Exercise 1 Solutions: Functions (R4DS Chapter 19)

Task 1.1 Solution: DRY Principle - Rescale Function

# Rescale function to normalize values to 0-1 range
rescale01 <- function(x) {
  range_x <- range(x, na.rm = TRUE)
  (x - range_x[1]) / (range_x[2] - range_x[1])
}

# Test the function
test_values <- c(10, 20, 30, NA, 40)
cat("Original values:", test_values, "\n")
Original values: 10 20 30 NA 40 
cat("Rescaled values:", rescale01(test_values), "\n")
Rescaled values: 0 0.3333333 0.6666667 NA 1 

Task 1.2 Solution: Function Components and Arguments

clinical_age_category <- function(age, pediatric_cutoff = 18, elderly_cutoff = 65) {
  case_when(
    is.na(age) ~ "Unknown",
    age < pediatric_cutoff ~ "Pediatric",
    age >= pediatric_cutoff & age < elderly_cutoff ~ "Adult",
    age >= elderly_cutoff ~ "Elderly"
  )
}

# Test with different cutoffs
test_ages <- c(16, 25, 45, 67, 72, NA)
cat("Age categories (default cutoffs):\n")
Age categories (default cutoffs):
print(clinical_age_category(test_ages))
[1] "Pediatric" "Adult"     "Adult"     "Elderly"   "Elderly"   "Unknown"  
cat("Age categories (pediatric: 21, elderly: 60):\n")
Age categories (pediatric: 21, elderly: 60):
print(clinical_age_category(test_ages, pediatric_cutoff = 21, elderly_cutoff = 60))
[1] "Pediatric" "Adult"     "Adult"     "Elderly"   "Elderly"   "Unknown"  

Task 1.3 Solution: Function with … Argument

clinical_summary <- function(data, ..., na.rm = TRUE, digits = 2) {
  data %>%
    summarise(
      across(c(...), list(
        mean = ~ round(mean(.x, na.rm = na.rm), digits),
        sd = ~ round(sd(.x, na.rm = na.rm), digits),
        min = ~ min(.x, na.rm = na.rm),
        max = ~ max(.x, na.rm = na.rm)
      ))
    )
}

# Create demo data for testing
demo <- tibble(
  USUBJID = paste0("001-", sprintf("%03d", 1:10)),
  AGE = c(23, 45, 67, 52, 71, 34, 58, 63, 29, 76),
  SEX = c("F", "M", "F", "M", "F", "F", "M", "F", "M", "F"),
  WEIGHT = c(65, 80, 58, 75, 62, 70, 85, 60, 78, 55),
  HEIGHT = c(160, 175, 155, 180, 158, 165, 185, 162, 172, 150),
  RFSTDTC = "2024-01-15"
)

# Test summary function
cat("Clinical summary for AGE and WEIGHT:\n")
Clinical summary for AGE and WEIGHT:
print(clinical_summary(demo, AGE, WEIGHT))
# A tibble: 1 × 8
  AGE_mean AGE_sd AGE_min AGE_max WEIGHT_mean WEIGHT_sd WEIGHT_min WEIGHT_max
     <dbl>  <dbl>   <dbl>   <dbl>       <dbl>     <dbl>      <dbl>      <dbl>
1     51.8   18.4      23      76        68.8      10.3         55         85

Task 1.4 Solution: BMI Function with Validation

create_bmi_with_validation <- function(data, weight_var, height_var) {
  # Input validation
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }
  
  data %>%
    mutate(
      BMI = case_when(
        is.na({{ weight_var }}) | is.na({{ height_var }}) ~ NA_real_,
        {{ weight_var }} <= 0 | {{ height_var }} <= 0 ~ NA_real_,
        TRUE ~ {{ weight_var }} / ({{ height_var }} / 100)^2
      ),
      BMI_CATEGORY = case_when(
        is.na(BMI) ~ "Unknown",
        BMI < 18.5 ~ "Underweight",
        BMI >= 18.5 & BMI < 25 ~ "Normal", 
        BMI >= 25 & BMI < 30 ~ "Overweight",
        BMI >= 30 ~ "Obese"
      )
    )
}

# Test BMI function
demo_with_bmi <- demo %>%
  create_bmi_with_validation(WEIGHT, HEIGHT)

cat("BMI results:\n")
BMI results:
print(demo_with_bmi %>% select(USUBJID, WEIGHT, HEIGHT, BMI, BMI_CATEGORY))
# A tibble: 10 × 5
   USUBJID WEIGHT HEIGHT   BMI BMI_CATEGORY
   <chr>    <dbl>  <dbl> <dbl> <chr>
 1 001-001     65    160  25.4 Overweight
 2 001-002     80    175  26.1 Overweight
 3 001-003     58    155  24.1 Normal
 4 001-004     75    180  23.1 Normal
 5 001-005     62    158  24.8 Normal
 6 001-006     70    165  25.7 Overweight
 7 001-007     85    185  24.8 Normal
 8 001-008     60    162  22.9 Normal
 9 001-009     78    172  26.4 Overweight
10 001-010     55    150  24.4 Normal      

✅ Exercise 2 Solutions: Vectors (R4DS Chapter 20)

Task 2.1 Solution: Vector Types

# Vector types in clinical data
logical_response <- c(TRUE, FALSE, TRUE, FALSE)
visit_numbers <- c(1L, 2L, 3L, 4L)
lab_values <- c(120.5, 85.2, 95.8, 110.1)
subject_sex <- c("M", "F", "M", "F")

cat("Response vector type:", typeof(logical_response), "\n")
Response vector type: logical 
cat("Visit numbers type:", typeof(visit_numbers), "\n")
Visit numbers type: integer 
cat("Lab values type:", typeof(lab_values), "\n") 
Lab values type: double 
cat("Subject sex type:", typeof(subject_sex), "\n")
Subject sex type: character 

Task 2.2 Solution: Safe Vector Coercion

safe_lab_conversion <- function(x) {
  # Replace descriptive terms with NA
  cleaned <- str_replace_all(x, "normal|high|low", "")
  cleaned <- str_trim(cleaned)
  cleaned[cleaned == ""] <- NA_character_
  
  # Convert to numeric with suppressed warnings
  suppressWarnings(as.numeric(cleaned))
}

messy_lab_data <- c("120", "85", "normal", "95", "high", "75.5")
cleaned_labs <- safe_lab_conversion(messy_lab_data)

cat("Original messy data:\n")
Original messy data:
print(messy_lab_data)
[1] "120"    "85"     "normal" "95"     "high"   "75.5"  
cat("Cleaned numeric data:\n")
Cleaned numeric data:
print(cleaned_labs)
[1] 120.0  85.0    NA  95.0    NA  75.5

Task 2.3 Solution: Vector Subsetting

subject_ids <- c("001-001", "001-002", "001-003", "001-004", "001-005")
ages <- c(25, 67, 45, 72, 34)
adverse_events <- c(2, 0, 1, 3, 1)

# Name the vectors
names(ages) <- subject_ids
names(adverse_events) <- subject_ids

# Solutions
selected_subjects_pos <- subject_ids[c(1, 3, 5)]
selected_subjects_neg <- subject_ids[-c(2, 4)]
elderly_subjects <- subject_ids[ages >= 65]
multiple_ae_subjects <- subject_ids[adverse_events > 1]
specific_ages <- ages[c("001-001", "001-004")]

cat("Subjects 1, 3, 5:", selected_subjects_pos, "\n")
Subjects 1, 3, 5: 001-001 001-003 001-005 
cat("Exclude 2, 4:", selected_subjects_neg, "\n")
Exclude 2, 4: 001-001 001-003 001-005 
cat("Elderly subjects:", elderly_subjects, "\n") 
Elderly subjects: 001-002 001-004 
cat("Multiple AE subjects:", multiple_ae_subjects, "\n")
Multiple AE subjects: 001-001 001-004 
cat("Specific ages:", specific_ages, "\n")
Specific ages: 25 72 

Task 2.4 Solution: Check Column Types

clinical_data <- tibble(
  USUBJID = paste0("SUB-", sprintf("%03d", 1:8)),
  AGE = c(25, 45, 67, 52, 71, 34, 58, 63),
  SEX = c("F", "M", "F", "M", "F", "F", "M", "F"),
  WEIGHT = c(65.5, 80.2, 58.7, 75.1, 62.3, 70.8, 85.4, 60.2),
  HEIGHT = c(160L, 175L, 155L, 180L, 158L, 165L, 185L, 162L),
  VISIT = factor(c("Baseline", "Week 2", "Week 4", "Week 8", "Baseline", "Week 2", "Week 4", "Week 8")),
  TREATMENT_RESPONSE = c(TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE)
)

cat("Column types in clinical data:\n")
Column types in clinical data:
print(sapply(clinical_data, typeof))
           USUBJID                AGE                SEX             WEIGHT
       "character"           "double"        "character"           "double"
            HEIGHT              VISIT TREATMENT_RESPONSE
         "integer"          "integer"          "logical" 

✅ Exercise 3 Solutions: Iteration (R4DS Chapter 21)

Task 3.1 Solution: For Loop Summary Statistics

numeric_vars <- c("AGE", "WEIGHT", "HEIGHT")
summary_results <- vector("list", length(numeric_vars))
names(summary_results) <- numeric_vars

for (i in seq_along(numeric_vars)) {
  var_name <- numeric_vars[[i]]
  var_data <- clinical_data[[var_name]]
  
  if (is.numeric(var_data)) {
    summary_results[[i]] <- list(
      variable = var_name,
      n = sum(!is.na(var_data)),
      mean = round(mean(var_data, na.rm = TRUE), 2),
      sd = round(sd(var_data, na.rm = TRUE), 2),
      min = min(var_data, na.rm = TRUE),
      max = max(var_data, na.rm = TRUE)
    )
  }
}

cat("Summary statistics using for loop:\n")
Summary statistics using for loop:
print(summary_results)
$AGE
$AGE$variable
[1] "AGE"

$AGE$n
[1] 8

$AGE$mean
[1] 51.88

$AGE$sd
[1] 16.23

$AGE$min
[1] 25

$AGE$max
[1] 71


$WEIGHT
$WEIGHT$variable
[1] "WEIGHT"

$WEIGHT$n
[1] 8

$WEIGHT$mean
[1] 69.78

$WEIGHT$sd
[1] 9.79

$WEIGHT$min
[1] 58.7

$WEIGHT$max
[1] 85.4


$HEIGHT
$HEIGHT$variable
[1] "HEIGHT"

$HEIGHT$n
[1] 8

$HEIGHT$mean
[1] 167.5

$HEIGHT$sd
[1] 11.07

$HEIGHT$min
[1] 155

$HEIGHT$max
[1] 185

Task 3.2 Solution: Text Cleaning with For Loop

# Create AE data
ae_data <- tibble(
  USUBJID = c("001-001", "001-001", "001-002", "001-002", "001-003", "001-003"),
  AESEQ = c(1, 2, 1, 2, 1, 2),
  AEDECOD = c("  headache ", "NAUSEA", "fatigue", "  DIZZINESS  ", "rash", "COUGH"),
  CMDECOD = c("aspirin  ", "  IBUPROFEN", "acetaminophen  ", "NSAID", "  topical cream", "cough syrup"),
  AESTDTC = c("2024-01-20", "2024-01-25", "2024-01-18", "2024-01-22", "2024-01-26", "2024-01-28"),
  AEENDTC = c("2024-01-22", "2024-01-26", "2024-01-20", "2024-01-23", "2024-01-28", "2024-01-30"),
  RFSTDTC = c("2024-01-15", "2024-01-15", "2024-01-16", "2024-01-16", "2024-01-15", "2024-01-15")
)

text_cols <- c("AEDECOD", "CMDECOD")

for (col in text_cols) {
  ae_data[[col]] <- ae_data[[col]] %>%
    str_trim() %>%
    str_to_upper() %>%
    str_replace_all("\\s+", " ")
}

cat("Cleaned AE data:\n")
Cleaned AE data:
print(ae_data)
# A tibble: 6 × 7
  USUBJID AESEQ AEDECOD   CMDECOD       AESTDTC    AEENDTC    RFSTDTC
  <chr>   <dbl> <chr>     <chr>         <chr>      <chr>      <chr>
1 001-001     1 HEADACHE  ASPIRIN       2024-01-20 2024-01-22 2024-01-15
2 001-001     2 NAUSEA    IBUPROFEN     2024-01-25 2024-01-26 2024-01-15
3 001-002     1 FATIGUE   ACETAMINOPHEN 2024-01-18 2024-01-20 2024-01-16
4 001-002     2 DIZZINESS NSAID         2024-01-22 2024-01-23 2024-01-16
5 001-003     1 RASH      TOPICAL CREAM 2024-01-26 2024-01-28 2024-01-15
6 001-003     2 COUGH     COUGH SYRUP   2024-01-28 2024-01-30 2024-01-15

Task 3.3 Solution: While Loop Dose Escalation

dose_escalation <- function(starting_dose = 10, max_dose = 80, safety_prob = 0.8) {
  current_dose <- starting_dose
  doses <- current_dose
  step <- 1
  
  cat("Dose escalation simulation:\n")
  cat("Step", step, ": Starting dose =", current_dose, "mg\n")
  
  while (current_dose < max_dose) {
    # Simulate safety assessment
    safety_ok <- rbinom(1, 1, prob = safety_prob)
    
    if (safety_ok) {
      current_dose <- min(current_dose * 1.5, max_dose)
      step <- step + 1
      cat("Step", step, ": Escalated to", round(current_dose, 1), "mg (SAFE)\n")
    } else {
      cat("Step", step + 1, ": SAFETY ISSUE - Stop escalation\n")
      break
    }
    
    doses <- c(doses, current_dose)
    
    # Safety check to prevent infinite loop
    if (length(doses) > 10) {
      cat("Maximum escalation steps reached\n")
      break
    }
  }
  
  return(doses)
}

final_doses <- dose_escalation()
Dose escalation simulation:
Step 1 : Starting dose = 10 mg
Step 2 : SAFETY ISSUE - Stop escalation

Task 3.4 Solution: Functional Programming Alternative

summary_functional <- clinical_data %>%
  select(where(is.numeric)) %>%
  map_dfr(~ tibble(
    n = sum(!is.na(.x)),
    mean = round(mean(.x, na.rm = TRUE), 2),
    sd = round(sd(.x, na.rm = TRUE), 2),
    min = min(.x, na.rm = TRUE),
    max = max(.x, na.rm = TRUE)
  ), .id = "variable")

cat("Summary using functional programming:\n")
Summary using functional programming:
print(summary_functional)
# A tibble: 3 × 6
  variable     n  mean    sd   min   max
  <chr>    <int> <dbl> <dbl> <dbl> <dbl>
1 AGE          8  51.9 16.2   25    71
2 WEIGHT       8  69.8  9.79  58.7  85.4
3 HEIGHT       8 168.  11.1  155   185  

✅ Exercise 4 Solutions: SAS Macro Translation

Task 4.1 Solution: Study Day Calculation Function

derive_study_day <- function(data, event_date_var, ref_date_var, new_var_name = "STUDY_DAY") {
  data %>%
    mutate(
      !!new_var_name := case_when(
        is.na(ymd({{ event_date_var }})) | is.na(ymd({{ ref_date_var }})) ~ NA_real_,
        ymd({{ event_date_var }}) >= ymd({{ ref_date_var }}) ~ as.numeric(ymd({{ event_date_var }}) - ymd({{ ref_date_var }})) + 1,
        ymd({{ event_date_var }}) < ymd({{ ref_date_var }}) ~ as.numeric(ymd({{ event_date_var }}) - ymd({{ ref_date_var }}))
      )
    )
}

# Test the function
ae_with_studyday <- ae_data %>%
  derive_study_day(AESTDTC, RFSTDTC, "AESTDY")

cat("AE data with study day:\n")
AE data with study day:
print(ae_with_studyday %>% select(USUBJID, AESTDTC, RFSTDTC, AESTDY))
# A tibble: 6 × 4
  USUBJID AESTDTC    RFSTDTC    AESTDY
  <chr>   <chr>      <chr>       <dbl>
1 001-001 2024-01-20 2024-01-15      6
2 001-001 2024-01-25 2024-01-15     11
3 001-002 2024-01-18 2024-01-16      3
4 001-002 2024-01-22 2024-01-16      7
5 001-003 2024-01-26 2024-01-15     12
6 001-003 2024-01-28 2024-01-15     14

Task 4.2 Solution: Comprehensive AE Processing Function

process_ae_data_complete <- function(data, ae_term_var, start_date_var, end_date_var, ref_date_var) {
  # Input validation
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }
  
  # Check required columns exist
  required_cols <- c(rlang::as_name(rlang::enquo(ae_term_var)),
                    rlang::as_name(rlang::enquo(start_date_var)),
                    rlang::as_name(rlang::enquo(end_date_var)),
                    rlang::as_name(rlang::enquo(ref_date_var)))
  
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
  }
  
  result <- data %>%
    # Standardize AE terms
    mutate(
      {{ ae_term_var }} := {{ ae_term_var }} %>%
        str_trim() %>%
        str_to_upper() %>%
        str_replace_all("\\s+", " ")
    ) %>%
    # Calculate study days
    derive_study_day({{ start_date_var }}, {{ ref_date_var }}, "AESTDY") %>%
    derive_study_day({{ end_date_var }}, {{ ref_date_var }}, "AEENDY") %>%
    # Calculate duration
    mutate(
      AE_DURATION = case_when(
        is.na(AESTDY) | is.na(AEENDY) ~ NA_real_,
        AEENDY >= AESTDY ~ AEENDY - AESTDY + 1,
        TRUE ~ NA_real_
      ),
      # Add validation flags
      missing_start_date = is.na(ymd({{ start_date_var }})),
      missing_end_date = is.na(ymd({{ end_date_var }})),
      data_complete = !missing_start_date & !missing_end_date
    )
  
  # Generate warnings for missing data
  n_missing_start <- sum(result$missing_start_date)
  n_missing_end <- sum(result$missing_end_date)
  
  if (n_missing_start > 0) {
    warning(paste(n_missing_start, "records have missing start dates"))
  }
  if (n_missing_end > 0) {
    warning(paste(n_missing_end, "records have missing end dates"))
  }
  
  return(result)
}

# Test comprehensive function
processed_ae <- ae_data %>%
  process_ae_data_complete(AEDECOD, AESTDTC, AEENDTC, RFSTDTC)

cat("Fully processed AE data:\n")
Fully processed AE data:
print(processed_ae %>% select(USUBJID, AEDECOD, AESTDY, AEENDY, AE_DURATION, data_complete))
# A tibble: 6 × 6
  USUBJID AEDECOD   AESTDY AEENDY AE_DURATION data_complete
  <chr>   <chr>      <dbl>  <dbl>       <dbl> <lgl>
1 001-001 HEADACHE       6      8           3 TRUE
2 001-001 NAUSEA        11     12           2 TRUE
3 001-002 FATIGUE        3      5           3 TRUE
4 001-002 DIZZINESS      7      8           2 TRUE
5 001-003 RASH          12     14           3 TRUE
6 001-003 COUGH         14     16           3 TRUE         

✅ Exercise 5 Solutions: Advanced Functional Programming

Task 5.1 Solution: Basic map() Functions

# Create study datasets
study_datasets <- list(
  study_a = tibble(
    USUBJID = c("A-001", "A-002", "A-003", "A-004"),
    AGE = c(45, 67, 34, 52),
    WEIGHT = c(70, 65, 75, 80),
    SEX = c("M", "F", "M", "F")
  ),
  study_b = tibble(
    USUBJID = c("B-001", "B-002", "B-003"),
    AGE = c(28, 49, 63),
    WEIGHT = c(68, 72, 58),
    SEX = c("F", "M", "F")
  ),
  study_c = tibble(
    USUBJID = c("C-001", "C-002", "C-003", "C-004", "C-005"),
    AGE = c(39, 58, 46, 62, 33),
    WEIGHT = c(74, 69, 77, 61, 73),
    SEX = c("M", "F", "M", "F", "M")
  )
)

# Solutions
mean_ages_list <- study_datasets %>%
  map(~ mean(.x$AGE))

mean_ages_vector <- study_datasets %>%
  map_dbl(~ mean(.x$AGE))

age_summaries <- study_datasets %>%
  map_chr(~ paste("Mean:", round(mean(.x$AGE), 1), "| SD:", round(sd(.x$AGE), 1)))

cat("Mean ages (list):\n")
Mean ages (list):
print(mean_ages_list)
$study_a
[1] 49.5

$study_b
[1] 46.66667

$study_c
[1] 47.6
cat("Mean ages (vector):\n")
Mean ages (vector):
print(mean_ages_vector)
 study_a  study_b  study_c
49.50000 46.66667 47.60000 
cat("Age summaries:\n")
Age summaries:
print(age_summaries)
                study_a                 study_b                 study_c
"Mean: 49.5 | SD: 13.8" "Mean: 46.7 | SD: 17.6" "Mean: 47.6 | SD: 12.3" 

Task 5.2 Solution: map2() for Two Inputs

weights <- c(70, 65, 80, 75, 68)
heights <- c(175, 160, 185, 180, 155)

bmis <- map2_dbl(weights, heights, ~ .x / (.y / 100)^2)

cat("BMI calculations:\n")
BMI calculations:
print(round(bmis, 2))
[1] 22.86 25.39 23.37 23.15 28.30

Task 5.3 Solution: pmap() for Multiple Inputs

patient_profiles <- list(
  weight = c(70, 65, 80, 75),
  height = c(175, 160, 185, 180),
  age = c(45, 62, 38, 55),
  sex = c("M", "F", "M", "F")
)

calculate_risk_score <- function(weight, height, age, sex) {
  bmi <- weight / (height / 100)^2
  age_risk <- ifelse(age > 60, 1.2, 1.0)
  sex_risk <- ifelse(sex == "M", 1.1, 1.0)
  bmi_risk <- ifelse(bmi > 25, 1.3, 1.0)
  
  round(age_risk * sex_risk * bmi_risk, 2)
}

risk_scores <- pmap_dbl(patient_profiles, calculate_risk_score)

cat("Risk scores:\n")
Risk scores:
print(risk_scores)
[1] 1.10 1.56 1.10 1.00

Task 5.4 Solution: Process Multiple Datasets

process_study_data <- function(data) {
  data %>%
    mutate(
      BMI = WEIGHT / (1.70^2),  # Assume height = 170cm
      AGE_GROUP = case_when(
        AGE < 40 ~ "Young",
        AGE >= 40 & AGE < 60 ~ "Middle",
        AGE >= 60 ~ "Senior"
      ),
      ELDERLY_FLAG = ifelse(AGE >= 65, "Y", "N")
    )
}

processed_studies <- study_datasets %>%
  map(process_study_data)

cat("Processed studies summary:\n")
Processed studies summary:
iwalk(processed_studies, ~ {
  cat("Study", .y, "- N:", nrow(.x), "- Mean BMI:", round(mean(.x$BMI), 1), "\n")
})
Study study_a - N: 4 - Mean BMI: 25.1
Study study_b - N: 3 - Mean BMI: 22.8
Study study_c - N: 5 - Mean BMI: 24.5 

Task 5.5 Solution: Extract Summary Information

elderly_counts <- processed_studies %>%
  map_int(~ sum(.x$ELDERLY_FLAG == "Y"))

cat("Elderly subjects per study:\n")
Elderly subjects per study:
print(elderly_counts)
study_a study_b study_c
      1       0       0 

✅ Exercise 6 Solutions: Predicate Functions and Advanced Patterns

Task 6.1 Solution: keep() and discard()

lab_results <- list(
  glucose = c(90, 95, 110, 85, 92),
  creatinine = c(1.1, 0.9, 1.3, 1.0, 1.2),
  invalid_test = c(NA, NA, NA, NA, NA),
  hemoglobin = c(13.5, 12.1, 14.2, 13.8, 12.9),
  empty_test = numeric(0),
  protein = c(7.2, 6.8, 7.5, 7.1, 6.9)
)

valid_labs <- lab_results %>%
  keep(~ length(.x) > 0 && !all(is.na(.x)))

abnormal_labs <- lab_results %>%
  keep(~ any(.x > 15 | .x < 0.5, na.rm = TRUE))

cat("Valid lab tests:", names(valid_labs), "\n")
Valid lab tests: glucose creatinine hemoglobin protein 
cat("Labs with abnormal values:", names(abnormal_labs), "\n")
Labs with abnormal values: glucose 

Task 6.2 Solution: reduce() for Cumulative Operations

daily_enrollment <- c(3, 2, 5, 1, 4, 2, 3, 6, 2, 1)

cumulative_enrollment <- daily_enrollment %>%
  accumulate(`+`)

cat("Daily enrollment:", daily_enrollment, "\n")
Daily enrollment: 3 2 5 1 4 2 3 6 2 1 
cat("Cumulative enrollment:", cumulative_enrollment, "\n")
Cumulative enrollment: 3 5 10 11 15 17 20 26 28 29 

Task 6.3 Solution: Combine Datasets with reduce()

enrollment_batches <- list(
  batch1 = tibble(USUBJID = c("001", "002"), SITE = "Site A"),
  batch2 = tibble(USUBJID = c("003", "004", "005"), SITE = "Site B"),
  batch3 = tibble(USUBJID = c("006", "007"), SITE = "Site C")
)

all_subjects <- enrollment_batches %>%
  reduce(bind_rows)

cat("Combined enrollment:\n")
Combined enrollment:
print(all_subjects)
# A tibble: 7 × 2
  USUBJID SITE
  <chr>   <chr>
1 001     Site A
2 002     Site A
3 003     Site B
4 004     Site B
5 005     Site B
6 006     Site C
7 007     Site C

Task 6.4 Solution: walk() for Side Effects

create_site_report <- function(data, site_name) {
  cat("=== Site", site_name, "Report ===\n")
  cat("Subjects enrolled:", nrow(data), "\n")
  cat("Subject IDs:", paste(data$USUBJID, collapse = ", "), "\n\n")
}

walk2(enrollment_batches, names(enrollment_batches), create_site_report)
=== Site batch1 Report ===
Subjects enrolled: 2
Subject IDs: 001, 002

=== Site batch2 Report ===
Subjects enrolled: 3
Subject IDs: 003, 004, 005

=== Site batch3 Report ===
Subjects enrolled: 2
Subject IDs: 006, 007 

✅ Exercise 7 Solutions: Error Handling and Function Factories

Task 7.1 Solution: Function Factories

create_domain_checker <- function(domain_prefix, min_length = 5) {
  function(subject_id) {
    if (nchar(subject_id) < min_length) {
      return(FALSE)
    }
    str_detect(subject_id, paste0("^", domain_prefix, "-"))
  }
}

# Create specific checkers
check_ae_subject <- create_domain_checker("AE")
check_dm_subject <- create_domain_checker("DM")

# Test function factory
test_subjects <- c("AE-001", "DM-002", "VS-003", "AE-004")
cat("AE subject validation:", map_lgl(test_subjects, check_ae_subject), "\n")
AE subject validation: TRUE FALSE FALSE TRUE 

Task 7.2 Solution: Error Handling with possibly() and safely()

risky_calculation <- function(x) {
  if (x < 0) stop("Value must be positive")
  if (x > 100) stop("Value too large")
  sqrt(x)
}

# Create safe versions
safe_calc <- possibly(risky_calculation, otherwise = NA)
safe_calc_detailed <- safely(risky_calculation)

test_values <- c(4, 9, -1, 16, 150, 25)

# Test both approaches
safe_results <- map_dbl(test_values, safe_calc)
detailed_results <- map(test_values, safe_calc_detailed)

cat("Safe results:", safe_results, "\n")
Safe results: 2 3 NA 4 NA 5 
cat("First error from safely():\n")
First error from safely():
print(detailed_results[[3]])
$result
NULL

$error
<simpleError in .f(...): Value must be positive>

✅ Exercise 8 Solutions: GitHub Copilot Examples

Solution: Function Examples with Copilot-Style Comments

# Create function to flag subjects with multiple adverse events in clinical trial
flag_multiple_aes <- function(data) {
  data %>%
    group_by(USUBJID) %>%
    summarise(n_aes = n(), .groups = "drop") %>%
    mutate(multiple_aes_flag = ifelse(n_aes > 1, "Y", "N"))
}

# Function to calculate percent change from baseline for lab values
calc_percent_change <- function(baseline, post_baseline) {
  ((post_baseline - baseline) / baseline) * 100
}

# Derive safety population flag based on treatment exposure duration
derive_safety_flag <- function(data, min_exposure_days = 1) {
  data %>%
    mutate(
      safety_flag = case_when(
        is.na(EXPOSURE_DAYS) ~ "N",
        EXPOSURE_DAYS >= min_exposure_days ~ "Y",
        TRUE ~ "N"
      )
    )
}

# Function to create CDISC-compliant variable labels for SDTM domains
create_cdisc_labels <- function(data, domain = "AE") {
  labels <- switch(domain,
    "AE" = list(
      USUBJID = "Unique Subject Identifier",
      AETERM = "Reported Term for the Adverse Event",
      AEDECOD = "Dictionary-Derived Term",
      AESTDTC = "Start Date/Time of Adverse Event"
    ),
    "DM" = list(
      USUBJID = "Unique Subject Identifier", 
      AGE = "Age",
      SEX = "Sex",
      RACE = "Race"
    )
  )
  
  # Apply labels to matching columns
  for (col in names(data)) {
    if (col %in% names(labels)) {
      attr(data[[col]], "label") <- labels[[col]]
    }
  }
  
  return(data)
}

cat("Example Copilot-assisted functions created successfully!\n")
Example Copilot-assisted functions created successfully!

✅ Bonus Challenge Solution: Complete Clinical Pipeline

create_clinical_pipeline <- function(raw_demo_data, raw_ae_data, 
                                   age_cutoff = 65, 
                                   output_format = "summary") {
  
  # Input validation (R4DS Ch. 19)
  if (!is.data.frame(raw_demo_data) || !is.data.frame(raw_ae_data)) {
    stop("Both inputs must be data frames")
  }
  
  # Process demographics data using vector operations (R4DS Ch. 20)
  processed_demo <- raw_demo_data %>%
    mutate(
      # Clean text variables
      across(where(is.character), ~ str_trim(str_to_upper(.x))),
      
      # Derive age-related variables
      ELDERLY = case_when(
        is.na(AGE) ~ "Unknown",
        AGE >= age_cutoff ~ "Yes",
        TRUE ~ "No"
      ),
      
      # Calculate BMI with proper vector handling
      BMI = case_when(
        is.na(WEIGHT) | is.na(HEIGHT) ~ NA_real_,
        WEIGHT <= 0 | HEIGHT <= 0 ~ NA_real_,
        TRUE ~ WEIGHT / (HEIGHT / 100)^2
      ),
      
      BMI_CATEGORY = case_when(
        is.na(BMI) ~ "Unknown",
        BMI < 18.5 ~ "Underweight",
        BMI >= 18.5 & BMI < 25 ~ "Normal",
        BMI >= 25 & BMI < 30 ~ "Overweight",
        BMI >= 30 ~ "Obese"
      ),
      
      AGE_GROUP = case_when(
        is.na(AGE) ~ "Unknown",
        AGE < 40 ~ "Young Adult",
        AGE >= 40 & AGE < 65 ~ "Middle Age",
        AGE >= 65 ~ "Senior"
      )
    )
  
  # Process AE data if provided
  if (nrow(raw_ae_data) > 0) {
    processed_ae <- raw_ae_data %>%
      process_ae_data_complete(AEDECOD, AESTDTC, AEENDTC, RFSTDTC)
  } else {
    processed_ae <- tibble()
  }
  
  # Generate summary using iteration (R4DS Ch. 21)
  if (output_format == "summary") {
    demo_summary <- processed_demo %>%
      summarise(
        n_subjects = n(),
        mean_age = round(mean(AGE, na.rm = TRUE), 1),
        sd_age = round(sd(AGE, na.rm = TRUE), 1),
        mean_bmi = round(mean(BMI, na.rm = TRUE), 1),
        n_elderly = sum(ELDERLY == "Yes", na.rm = TRUE),
        pct_elderly = round(n_elderly / n_subjects * 100, 1)
      )
    
    # Count by categories using functional programming
    age_group_counts <- processed_demo %>%
      count(AGE_GROUP, name = "n") %>%
      mutate(percentage = round(n / sum(n) * 100, 1))
    
    bmi_category_counts <- processed_demo %>%
      count(BMI_CATEGORY, name = "n") %>%
      mutate(percentage = round(n / sum(n) * 100, 1))
    
    return(list(
      demographics = processed_demo,
      adverse_events = processed_ae,
      summary = demo_summary,
      age_groups = age_group_counts,
      bmi_categories = bmi_category_counts
    ))
    
  } else if (output_format == "data_only") {
    return(list(
      demographics = processed_demo,
      adverse_events = processed_ae
    ))
    
  } else {
    stop("output_format must be 'summary' or 'data_only'")
  }
}

# Test the comprehensive pipeline
cat("Testing complete clinical pipeline:\n")
Testing complete clinical pipeline:
pipeline_results <- create_clinical_pipeline(demo, ae_data)

cat("Pipeline completed successfully!\n")
Pipeline completed successfully!
cat("Demographics processed:", nrow(pipeline_results$demographics), "subjects\n")
Demographics processed: 10 subjects
cat("AE data processed:", nrow(pipeline_results$adverse_events), "records\n")
AE data processed: 6 records

🎯 Summary

This comprehensive solution demonstrates core R programming concepts applied to clinical programming applications:

✅ Functions

  • Function creation and structure
  • Arguments and validation
  • Error handling and robustness
  • Function factories for reusable patterns

✅ Vectors (Chapter 20)

  • Understanding vector types in clinical data
  • Safe type coercion and conversion
  • Effective subsetting for data analysis
  • Proper handling of missing values

✅ Iteration (Chapter 21)

  • For loops for systematic processing
  • While loops for conditional iteration
  • Functional programming with map() family
  • Advanced patterns: reduce(), keep(), walk()

✅ Clinical Programming Applications

  • SAS macro translation to modern R functions
  • Vectorized operations for efficient processing
  • Batch processing of multiple studies
  • Error handling for robust clinical applications
  • CDISC-compliant data processing workflows

The solutions provide a solid foundation for advanced clinical programming with R!

````