This library provides custom R functions that replicate common SAS macro functionality for clinical programming, along with utility functions for data processing and analysis.
Overview
This collection includes:
SAS Macro Equivalents: R functions that replicate SAS macro behavior
Data Validation Functions: Quality control and data checking utilities
Date/Time Functions: Enhanced date manipulation for clinical data
String Processing Functions: Text cleaning and standardization
Statistical Functions: Common calculations for clinical analyses
cat("Reusable R functions for clinical programming\n\n")
Reusable R functions for clinical programming
SAS Macro Equivalents
1. Frequency Procedure (PROC FREQ equivalent)
#' Calculate frequencies and percentages (like SAS PROC FREQ)#' @param data Dataset to analyze#' @param var Variable name to analyze#' @param by_var Optional grouping variable#' @param missing Include missing values (default: FALSE)#' @return Frequency table with counts and percentagesproc_freq <-function(data, var, by_var =NULL, missing =FALSE) {if (!missing) { data <- data %>%filter(!is.na(!!sym(var)) &!!sym(var) !="") }if (!is.null(by_var)) {# Cross-tabulation result <- data %>%group_by(!!sym(by_var), !!sym(var)) %>%summarise(FREQUENCY =n(),.groups ="drop" ) %>%group_by(!!sym(by_var)) %>%mutate(PERCENT =round(FREQUENCY /sum(FREQUENCY) *100, 2),CUM_FREQ =cumsum(FREQUENCY),CUM_PERCENT =round(cumsum(PERCENT), 2) ) %>%ungroup() } else {# Simple frequency result <- data %>%group_by(!!sym(var)) %>%summarise(FREQUENCY =n(),.groups ="drop" ) %>%mutate(PERCENT =round(FREQUENCY /sum(FREQUENCY) *100, 2),CUM_FREQ =cumsum(FREQUENCY),CUM_PERCENT =round(cumsum(PERCENT), 2) ) }return(result)}# Example usagesample_data <-tibble(SEX =sample(c("M", "F"), 50, replace =TRUE),ARM =sample(c("Placebo", "Treatment"), 50, replace =TRUE),AGE_GROUP =sample(c("18-44", "45-64", "65+"), 50, replace =TRUE))print("PROC FREQ example - Sex distribution:")
[1] "PROC FREQ example - Sex distribution:"
print(proc_freq(sample_data, "SEX"))
# A tibble: 2 × 5
SEX FREQUENCY PERCENT CUM_FREQ CUM_PERCENT
<chr> <int> <dbl> <int> <dbl>
1 F 29 58 29 58
2 M 21 42 50 100
print("\nCross-tabulation - Sex by Treatment:")
[1] "\nCross-tabulation - Sex by Treatment:"
print(proc_freq(sample_data, "SEX", "ARM"))
# A tibble: 4 × 6
ARM SEX FREQUENCY PERCENT CUM_FREQ CUM_PERCENT
<chr> <chr> <int> <dbl> <int> <dbl>
1 Placebo F 16 53.3 16 53.3
2 Placebo M 14 46.7 30 100
3 Treatment F 13 65 13 65
4 Treatment M 7 35 20 100
2. Means Procedure (PROC MEANS equivalent)
#' Calculate descriptive statistics (like SAS PROC MEANS)#' @param data Dataset to analyze#' @param var Numeric variable to analyze#' @param by_var Optional grouping variable#' @param stats Statistics to calculate#' @return Summary statistics tableproc_means <-function(data, var, by_var =NULL, stats =c("n", "mean", "std", "min", "median", "max")) {# Filter out missing values data_clean <- data %>%filter(!is.na(!!sym(var)))if (!is.null(by_var)) { result <- data_clean %>%group_by(!!sym(by_var)) %>%summarise(N =if ("n"%in% stats) n() elseNULL,MEAN =if ("mean"%in% stats) round(mean(!!sym(var)), 2) elseNULL,STD =if ("std"%in% stats) round(sd(!!sym(var)), 2) elseNULL,MIN =if ("min"%in% stats) min(!!sym(var)) elseNULL,MEDIAN =if ("median"%in% stats) median(!!sym(var)) elseNULL,MAX =if ("max"%in% stats) max(!!sym(var)) elseNULL,.groups ="drop" ) %>%select_if(~!all(is.null(.))) } else { result <- data_clean %>%summarise(N =if ("n"%in% stats) n() elseNULL,MEAN =if ("mean"%in% stats) round(mean(!!sym(var)), 2) elseNULL,STD =if ("std"%in% stats) round(sd(!!sym(var)), 2) elseNULL,MIN =if ("min"%in% stats) min(!!sym(var)) elseNULL,MEDIAN =if ("median"%in% stats) median(!!sym(var)) elseNULL,MAX =if ("max"%in% stats) max(!!sym(var)) elseNULL ) %>%select_if(~!all(is.null(.))) }return(result)}# Example usagesample_data$AGE <-sample(18:75, 50, replace =TRUE)print("PROC MEANS example - Age statistics:")
[1] "PROC MEANS example - Age statistics:"
print(proc_means(sample_data, "AGE"))
# A tibble: 1 × 6
N MEAN STD MIN MEDIAN MAX
<int> <dbl> <dbl> <int> <dbl> <int>
1 50 47.5 15.1 19 48 74
print("\nAge statistics by treatment group:")
[1] "\nAge statistics by treatment group:"
print(proc_means(sample_data, "AGE", "ARM"))
# A tibble: 2 × 7
ARM N MEAN STD MIN MEDIAN MAX
<chr> <int> <dbl> <dbl> <int> <dbl> <int>
1 Placebo 30 45.5 14.8 24 44 74
2 Treatment 20 50.6 15.3 19 54 70
3. Data Step Functions
#' SAS LAG function equivalent#' @param x Vector to lag#' @param n Number of periods to lag (default: 1)#' @return Lagged vectorlag_sas <-function(x, n =1) {c(rep(NA, n), head(x, -n))}#' SAS DIF function equivalent #' @param x Vector to difference#' @param n Number of periods for difference (default: 1)#' @return Differenced vectordif_sas <-function(x, n =1) { x -lag_sas(x, n)}#' SAS FIRST./LAST. variable equivalent#' @param data Dataset#' @param by_var Grouping variable#' @return Dataset with FIRST and LAST indicatorsfirst_last <-function(data, by_var) { data %>%group_by(!!sym(by_var)) %>%mutate(FIRST =row_number() ==1,LAST =row_number() ==n() ) %>%ungroup()}# Example usagetest_data <-tibble(USUBJID =rep(c("001", "002", "003"), each =3),VISIT =rep(1:3, 3),VALUE =c(10, 12, 15, 8, 11, 9, 20, 18, 22))test_result <- test_data %>%mutate(LAG_VALUE =lag_sas(VALUE),CHANGE =dif_sas(VALUE) ) %>%first_last("USUBJID")print("SAS function equivalents:")
#' Calculate age in years (like SAS YRDIF function)#' @param birth_date Birth date#' @param ref_date Reference date (default: today)#' @param basis Age calculation basis#' @return Age in yearscalculate_age <-function(birth_date, ref_date =Sys.Date(), basis ="actual") {if (basis =="actual") { age <-as.numeric(difftime(ref_date, birth_date, units ="days")) /365.25 } elseif (basis =="30/360") {# Simplified 30/360 calculation age <-as.numeric(difftime(ref_date, birth_date, units ="days")) /360 } else { age <-as.numeric(difftime(ref_date, birth_date, units ="days")) /365 }return(floor(age))}#' Study day calculation (like SAS study day)#' @param event_date Event date#' @param ref_date Reference date (study start)#' @return Study daystudy_day <-function(event_date, ref_date) { diff_days <-as.numeric(event_date - ref_date)# Study day convention: Day 1 is first day, negative for pre-study days study_days <-if_else(diff_days >=0, diff_days +1, diff_days)return(study_days)}#' Format dates for CDISC (ISO 8601)#' @param date_var Date variable#' @param include_time Include time component#' @return Formatted date stringformat_cdisc_date <-function(date_var, include_time =FALSE) {if (include_time) {# Full ISO 8601 datetime format formatted <-format(as.POSIXct(date_var), "%Y-%m-%dT%H:%M:%S") } else {# Date only formatted <-format(as.Date(date_var), "%Y-%m-%d") }return(formatted)}# Example usagedate_examples <-tibble(USUBJID =paste0("S00", 1:5),BIRTH_DATE =as.Date(c("1990-05-15", "1975-12-03", "1988-08-20", "1982-01-10", "1995-09-25")),STUDY_START =as.Date("2024-01-15"),VISIT_DATE =as.Date(c("2024-01-15", "2024-01-22", "2024-01-10", "2024-01-18", "2024-01-20"))) %>%mutate(AGE =calculate_age(BIRTH_DATE, STUDY_START),STUDY_DAY =study_day(VISIT_DATE, STUDY_START),VISIT_DTC =format_cdisc_date(VISIT_DATE) )print("Date function examples:")
Processing clinical dataset...
Validation results:
Missing data summary available
Age validation complete
print("Processed demographics summary:")
[1] "Processed demographics summary:"
print(results$demographics)
# A tibble: 4 × 7
SEX N MEAN STD MIN MEDIAN MAX
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 5 49.4 18.2 29 41 71
2 FEMALE 4 57 10.0 43 59.5 66
3 M 5 44.4 10.8 31 41 60
4 MALE 6 56.8 15.0 31 62 73
Function Documentation
Best Practices
Function Design
Single responsibility principle
Clear parameter names and defaults
Comprehensive error handling
Consistent return formats
Documentation
Use roxygen2-style comments
Include parameter descriptions
Provide usage examples
Document assumptions and limitations
Testing
Test with edge cases
Validate against known results
Check for proper error handling
Compare with SAS output when applicable
Performance
Use vectorized operations
Avoid unnecessary loops
Profile for bottlenecks
Consider data.table for large datasets
This custom functions library provides a solid foundation for clinical programming in R, with emphasis on SAS compatibility and regulatory requirements.