Production CDISC SDTM Domain Creation from Raw Clinical Data
📊 SDTM Programming Guide
Production CDISC SDTM Domain Creation from Raw Clinical Data using R.
Clinical Programming Training - SDTM Implementation Guide v1.7
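Designed to pair with the sdtm.oak package for metadata-driven programming; the package is optional for this demo, so its library() call is left commented out.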
Overview
This guide demonstrates how to convert raw clinical data to SDTM-compliant domains using R programming techniques that mirror SAS clinical programming practices.
Required Libraries
# Load the packages attached for this guide.
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
# library(haven)    # For reading SAS datasets - not needed for this demo
# library(sdtm.oak) # Uncomment when sdtm.oak is available

# Banner marking the start of the SDTM run in the console output.
cat("=== CDISC SDTM Domain Creation ===\n")
=== CDISC SDTM Domain Creation ===
# Subtitle line; the trailing "\n\n" leaves one blank line after it.
cat("Converting raw clinical data to SDTM-compliant domains\n\n", sep = "")
Converting raw clinical data to SDTM-compliant domains
Study Parameters
# Set study parameters
# STUDY_ID: study identifier (used for STUDYID and as the USUBJID prefix).
STUDY_ID <- "ABC-123"
# STUDY_RFSTDTC: common study start date, kept as a Date so that date
# arithmetic (offsets, age calculation) works before it is formatted.
STUDY_RFSTDTC <- as.Date("2024-01-15")
Mock Raw Clinical Data
The simulated data below stand in for the CRF/EDC data that would typically come from your clinical database.
Raw Demographics Data
# Raw Demographics Data (typically from EDC)
# Builds 25 mock subjects with randomly sampled attributes.
# NOTE(review): no set.seed() call is visible in this section, so the sampled
# values change from run to run unless a seed is set elsewhere.
raw_demographics <- data.frame(
  # Zero-padded three-digit subject numbers "001" ... "025".
  subject_id = sprintf("%03d", 1:25),
  site_number = sample(c("001", "002", "003"), 25, replace = TRUE),
  # Birth dates drawn without replacement from 1950-01-01 .. 1990-12-31.
  birth_date = sample(
    seq(as.Date("1950-01-01"), as.Date("1990-12-31"), by = "day"),
    25
  ),
  gender = sample(
    c("Male", "Female"),
    25,
    replace = TRUE,
    prob = c(0.55, 0.45)
  ),
  race = sample(
    c(
      "White",
      "Black or African American",
      "Asian",
      "American Indian or Alaska Native"
    ),
    25,
    replace = TRUE,
    prob = c(0.7, 0.15, 0.12, 0.03)
  ),
  ethnicity = sample(
    c("Hispanic or Latino", "Not Hispanic or Latino"),
    25,
    replace = TRUE,
    prob = c(0.2, 0.8)
  ),
  # Consent date falls within +/- 30 days of the common study start date.
  consent_date = STUDY_RFSTDTC + sample(-30:30, 25, replace = TRUE),
  randomization_date = NA, # Will be filled based on treatment assignment
  stringsAsFactors = FALSE
)

print("Raw Demographics Data Structure:")
The DM domain contains demographics and administrative information for each subject.
# Create SDTM DM (Demographics) Domain
# Maps the raw demographics records to DM variables, assigns a mock treatment
# arm per subject, and keeps only the DM columns in their final order.
dm_domain <- raw_demographics %>%
  mutate(
    # Standard SDTM variables
    STUDYID = STUDY_ID,
    DOMAIN = "DM",
    USUBJID = paste(STUDY_ID, subject_id, sep = "-"),
    SUBJID = subject_id,
    SITEID = site_number,

    # Demographics
    # AGE: whole years from birth date to the common study start date,
    # computed as elapsed days / 365.25 and rounded down.
    AGE = as.numeric(
      floor(difftime(STUDY_RFSTDTC, birth_date, units = "days") / 365.25)
    ),
    AGEU = "YEARS",
    # Unrecognized raw values map to "" (SEX) or "UNKNOWN" (RACE, ETHNIC).
    SEX = case_when(
      gender == "Male" ~ "M",
      gender == "Female" ~ "F",
      TRUE ~ ""
    ),
    RACE = case_when(
      race == "White" ~ "WHITE",
      race == "Black or African American" ~ "BLACK OR AFRICAN AMERICAN",
      race == "Asian" ~ "ASIAN",
      race == "American Indian or Alaska Native" ~
        "AMERICAN INDIAN OR ALASKA NATIVE",
      TRUE ~ "UNKNOWN"
    ),
    ETHNIC = case_when(
      ethnicity == "Hispanic or Latino" ~ "HISPANIC OR LATINO",
      ethnicity == "Not Hispanic or Latino" ~ "NOT HISPANIC OR LATINO",
      TRUE ~ "UNKNOWN"
    ),

    # Study dates
    RFSTDTC = format(STUDY_RFSTDTC, "%Y-%m-%d"),
    RFENDTC = "", # To be populated when study ends
    DTHDTC = "", # Death date if applicable
    DTHFL = "" # Death flag
  ) %>%
  # Assign treatment arms (simplified randomization)
  mutate(
    # n() is the row count of the data being mutated, so one arm is drawn
    # per subject.
    ARM = sample(
      c("Treatment A", "Treatment B", "Placebo"),
      n(),
      replace = TRUE
    ),
    ARMCD = case_when(
      ARM == "Treatment A" ~ "TRT_A",
      ARM == "Treatment B" ~ "TRT_B",
      ARM == "Placebo" ~ "PBO",
      TRUE ~ ""
    ),
    # Actual arm is set equal to the planned arm in this mock data.
    ACTARM = ARM,
    ACTARMCD = ARMCD
  ) %>%
  # Select final SDTM variables in proper order
  select(
    STUDYID, DOMAIN, USUBJID, SUBJID, SITEID,
    AGE, AGEU, SEX, RACE, ETHNIC,
    ARM, ARMCD, ACTARM, ACTARMCD,
    RFSTDTC, RFENDTC, DTHDTC, DTHFL
  )

# Display sample of DM domain
print("SDTM DM Domain (first 10 subjects):")
[1] "SDTM DM Domain (first 10 subjects):"
# Show the first ten DM records as a quick visual check.
print(slice_head(dm_domain, n = 10))
STUDYID DOMAIN USUBJID SUBJID SITEID AGE AGEU SEX
1 ABC-123 DM ABC-123-001 001 002 44 YEARS F
2 ABC-123 DM ABC-123-002 002 001 49 YEARS M
3 ABC-123 DM ABC-123-003 003 001 69 YEARS F
4 ABC-123 DM ABC-123-004 004 003 35 YEARS M
5 ABC-123 DM ABC-123-005 005 001 72 YEARS M
6 ABC-123 DM ABC-123-006 006 001 35 YEARS M
7 ABC-123 DM ABC-123-007 007 001 63 YEARS M
8 ABC-123 DM ABC-123-008 008 002 51 YEARS M
9 ABC-123 DM ABC-123-009 009 001 49 YEARS M
10 ABC-123 DM ABC-123-010 010 002 47 YEARS F
RACE ETHNIC ARM ARMCD
1 WHITE HISPANIC OR LATINO Treatment A TRT_A
2 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO Placebo PBO
3 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO Placebo PBO
4 WHITE HISPANIC OR LATINO Treatment A TRT_A
5 WHITE NOT HISPANIC OR LATINO Treatment B TRT_B
6 WHITE HISPANIC OR LATINO Placebo PBO
7 BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO Treatment B TRT_B
8 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO Treatment A TRT_A
9 ASIAN NOT HISPANIC OR LATINO Treatment A TRT_A
10 ASIAN NOT HISPANIC OR LATINO Placebo PBO
ACTARM ACTARMCD RFSTDTC RFENDTC DTHDTC DTHFL
1 Treatment A TRT_A 2024-01-15
2 Placebo PBO 2024-01-15
3 Placebo PBO 2024-01-15
4 Treatment A TRT_A 2024-01-15
5 Treatment B TRT_B 2024-01-15
6 Placebo PBO 2024-01-15
7 Treatment B TRT_B 2024-01-15
8 Treatment A TRT_A 2024-01-15
9 Treatment A TRT_A 2024-01-15
10 Placebo PBO 2024-01-15
# Cross-domain check: compare the distinct USUBJID values found in DM and VS.
common_subjects_dm <- unique(dm_domain$USUBJID)
common_subjects_vs <- unique(vs_domain$USUBJID)

# Number of subjects that appear in both domains.
cat(
  "- Subjects in both DM and VS:",
  length(intersect(common_subjects_dm, common_subjects_vs)),
  "\n"
)
- Subjects in both DM and VS: 25
# Count the USUBJIDs present in DM that have no match in VS.
cat(
  "- Subjects only in DM:",
  sum(!(common_subjects_dm %in% common_subjects_vs)),
  "\n"
)
- Subjects only in DM: 0
# Count the USUBJIDs present in VS that have no match in DM.
cat(
  "- Subjects only in VS:",
  sum(!(common_subjects_vs %in% common_subjects_dm)),
  "\n"
)
- Subjects only in VS: 0
Export SDTM Datasets
# Export to SAS transport files (XPT format)
# haven::write_xpt() takes the data frame itself as its first argument, not a
# list. haven writes version 8 files by default.
# NOTE(review): version 5 is requested here on the assumption that the files
# are meant for regulatory submission - confirm against current guidance.
# haven::write_xpt(dm_domain, "dm.xpt", version = 5, name = "DM")
# haven::write_xpt(vs_domain, "vs.xpt", version = 5, name = "VS")

# Or export to CSV for review
# write.csv(dm_domain, "dm.csv", row.names = FALSE)
# write.csv(vs_domain, "vs.csv", row.names = FALSE)

# Record counts per domain, printed as a final summary.
cat("SDTM datasets ready for export\n")
cat("- DM domain: ", nrow(dm_domain), " records\n")
cat("- VS domain: ", nrow(vs_domain), " records\n")