SDTM Programming Guide

Production CDISC SDTM Domain Creation from Raw Clinical Data

📊 SDTM Programming Guide

Production CDISC SDTM Domain Creation from Raw Clinical Data using R.

Clinical Programming Training - SDTM Implementation Guide v1.7
Uses sdtm.oak package for metadata-driven programming.

Overview

This guide demonstrates how to convert raw clinical data to SDTM-compliant domains using R programming techniques that mirror SAS clinical programming practices.

Required Libraries

library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
# library(haven)       # For reading SAS datasets - not needed for this demo
# library(sdtm.oak)    # Uncomment when sdtm.oak is available

cat("=== CDISC SDTM Domain Creation ===\n")
=== CDISC SDTM Domain Creation ===
cat("Converting raw clinical data to SDTM-compliant domains\n\n")
Converting raw clinical data to SDTM-compliant domains

Study Parameters

# Set study parameters
STUDY_ID <- "ABC-123"
STUDY_RFSTDTC <- as.Date("2024-01-15")  # Common study start date

Mock Raw Clinical Data

This represents CRF/EDC data that would typically come from your clinical database.

Raw Demographics Data

# Raw Demographics Data (typically from EDC)
raw_demographics <- data.frame(
  subject_id = sprintf("%03d", 1:25),
  site_number = sample(c("001", "002", "003"), 25, replace = TRUE),
  birth_date = sample(seq(as.Date("1950-01-01"), as.Date("1990-12-31"), by = "day"), 25),
  gender = sample(c("Male", "Female"), 25, replace = TRUE, prob = c(0.55, 0.45)),
  race = sample(c("White", "Black or African American", "Asian", "American Indian or Alaska Native"), 
                25, replace = TRUE, prob = c(0.7, 0.15, 0.12, 0.03)),
  ethnicity = sample(c("Hispanic or Latino", "Not Hispanic or Latino"), 
                     25, replace = TRUE, prob = c(0.2, 0.8)),
  consent_date = STUDY_RFSTDTC + sample(-30:30, 25, replace = TRUE),
  randomization_date = NA,  # Will be filled based on treatment assignment
  stringsAsFactors = FALSE
)

print("Raw Demographics Data Structure:")
[1] "Raw Demographics Data Structure:"
glimpse(raw_demographics)
Rows: 25
Columns: 8
$ subject_id         <chr> "001", "002", "003", "004", "005", "006", "007", "0…
$ site_number        <chr> "002", "001", "001", "003", "001", "001", "001", "0…
$ birth_date         <date> 1979-09-29, 1975-01-12, 1954-03-05, 1988-12-11, 19…
$ gender             <chr> "Female", "Male", "Female", "Male", "Male", "Male",…
$ race               <chr> "White", "Black or African American", "Black or Afr…
$ ethnicity          <chr> "Hispanic or Latino", "Hispanic or Latino", "Not Hi…
$ consent_date       <date> 2024-01-11, 2024-02-05, 2024-01-25, 2024-02-08, 20…
$ randomization_date <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

Raw Vital Signs Data

# Raw Vital Signs Data
# Generate multiple visits per subject
set.seed(123)
visits <- c("Screening", "Baseline", "Week 2", "Week 4", "Week 8", "Week 12", "End of Study")

raw_vitals_list <- lapply(raw_demographics$subject_id, function(subj) {
  n_visits <- sample(4:7, 1)  # Random number of visits per subject
  selected_visits <- sample(visits, n_visits)
  
  data.frame(
    subject_id = subj,
    visit = selected_visits,
    visit_date = STUDY_RFSTDTC + cumsum(sample(7:28, n_visits, replace = TRUE)),
    systolic_bp = round(rnorm(n_visits, 125, 15)),
    diastolic_bp = round(rnorm(n_visits, 80, 10)),
    heart_rate = round(rnorm(n_visits, 72, 12)),
    weight_kg = round(rnorm(n_visits, 75, 15), 1),
    height_cm = round(rnorm(n_visits, 170, 10), 1),
    temperature_c = round(rnorm(n_visits, 36.5, 0.5), 1)
  )
})

raw_vitals <- do.call(rbind, raw_vitals_list)

print("Raw Vital Signs Data Structure:")
[1] "Raw Vital Signs Data Structure:"
glimpse(raw_vitals)
Rows: 139
Columns: 9
$ subject_id    <chr> "001", "001", "001", "001", "001", "001", "002", "002", …
$ visit         <chr> "End of Study", "Week 2", "Week 12", "Baseline", "Week 4…
$ visit_date    <date> 2024-02-01, 2024-02-12, 2024-03-09, 2024-03-29, 2024-04…
$ systolic_bp   <dbl> 118, 143, 130, 131, 127, 117, 150, 143, 129, 109, 117, 1…
$ diastolic_bp  <dbl> 98, 85, 60, 87, 75, 69, 96, 69, 97, 78, 75, 93, 75, 83, …
$ heart_rate    <dbl> 69, 60, 63, 64, 52, 82, 63, 98, 56, 78, 87, 74, 93, 72, …
$ weight_kg     <dbl> 77.3, 57.9, 93.8, 81.4, 70.6, 88.4, 84.5, 81.2, 86.9, 72…
$ height_cm     <dbl> 178.8, 178.2, 176.9, 175.5, 169.4, 166.9, 161.0, 162.6, …
$ temperature_c <dbl> 36.3, 36.2, 36.4, 35.9, 37.6, 37.1, 36.2, 36.4, 36.1, 36…

SDTM Domain Creation

Demographics (DM) Domain

The DM domain contains demographics and administrative information for each subject.

# Create SDTM DM (Demographics) Domain
dm_domain <- raw_demographics %>%
  mutate(
    # Standard SDTM variables
    STUDYID = STUDY_ID,
    DOMAIN = "DM",
    USUBJID = paste(STUDY_ID, subject_id, sep = "-"),
    SUBJID = subject_id,
    SITEID = site_number,
    
    # Demographics
    AGE = as.numeric(floor(difftime(STUDY_RFSTDTC, birth_date, units = "days") / 365.25)),
    AGEU = "YEARS",
    SEX = case_when(
      gender == "Male" ~ "M",
      gender == "Female" ~ "F",
      TRUE ~ ""
    ),
    RACE = case_when(
      race == "White" ~ "WHITE",
      race == "Black or African American" ~ "BLACK OR AFRICAN AMERICAN", 
      race == "Asian" ~ "ASIAN",
      race == "American Indian or Alaska Native" ~ "AMERICAN INDIAN OR ALASKA NATIVE",
      TRUE ~ "UNKNOWN"
    ),
    ETHNIC = case_when(
      ethnicity == "Hispanic or Latino" ~ "HISPANIC OR LATINO",
      ethnicity == "Not Hispanic or Latino" ~ "NOT HISPANIC OR LATINO",
      TRUE ~ "UNKNOWN"
    ),
    
    # Study dates
    RFSTDTC = format(STUDY_RFSTDTC, "%Y-%m-%d"),
    RFENDTC = "",  # To be populated when study ends
    DTHDTC = "",   # Death date if applicable
    DTHFL = ""     # Death flag
  ) %>%
  
  # Assign treatment arms (simplified randomization)
  mutate(
    ARM = sample(c("Treatment A", "Treatment B", "Placebo"), nrow(.), replace = TRUE),
    ARMCD = case_when(
      ARM == "Treatment A" ~ "TRT_A",
      ARM == "Treatment B" ~ "TRT_B", 
      ARM == "Placebo" ~ "PBO",
      TRUE ~ ""
    ),
    ACTARM = ARM,
    ACTARMCD = ARMCD
  ) %>%
  
  # Select final SDTM variables in proper order
  select(STUDYID, DOMAIN, USUBJID, SUBJID, SITEID, AGE, AGEU, SEX, RACE, ETHNIC,
         ARM, ARMCD, ACTARM, ACTARMCD, RFSTDTC, RFENDTC, DTHDTC, DTHFL)

# Display sample of DM domain
print("SDTM DM Domain (first 10 subjects):")
[1] "SDTM DM Domain (first 10 subjects):"
print(dm_domain %>% slice(1:10))
   STUDYID DOMAIN     USUBJID SUBJID SITEID AGE  AGEU SEX
1  ABC-123     DM ABC-123-001    001    002  44 YEARS   F
2  ABC-123     DM ABC-123-002    002    001  49 YEARS   M
3  ABC-123     DM ABC-123-003    003    001  69 YEARS   F
4  ABC-123     DM ABC-123-004    004    003  35 YEARS   M
5  ABC-123     DM ABC-123-005    005    001  72 YEARS   M
6  ABC-123     DM ABC-123-006    006    001  35 YEARS   M
7  ABC-123     DM ABC-123-007    007    001  63 YEARS   M
8  ABC-123     DM ABC-123-008    008    002  51 YEARS   M
9  ABC-123     DM ABC-123-009    009    001  49 YEARS   M
10 ABC-123     DM ABC-123-010    010    002  47 YEARS   F
                        RACE                 ETHNIC         ARM ARMCD
1                      WHITE     HISPANIC OR LATINO Treatment A TRT_A
2  BLACK OR AFRICAN AMERICAN     HISPANIC OR LATINO     Placebo   PBO
3  BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO     Placebo   PBO
4                      WHITE     HISPANIC OR LATINO Treatment A TRT_A
5                      WHITE NOT HISPANIC OR LATINO Treatment B TRT_B
6                      WHITE     HISPANIC OR LATINO     Placebo   PBO
7  BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO Treatment B TRT_B
8  BLACK OR AFRICAN AMERICAN     HISPANIC OR LATINO Treatment A TRT_A
9                      ASIAN NOT HISPANIC OR LATINO Treatment A TRT_A
10                     ASIAN NOT HISPANIC OR LATINO     Placebo   PBO
        ACTARM ACTARMCD    RFSTDTC RFENDTC DTHDTC DTHFL
1  Treatment A    TRT_A 2024-01-15
2      Placebo      PBO 2024-01-15
3      Placebo      PBO 2024-01-15
4  Treatment A    TRT_A 2024-01-15
5  Treatment B    TRT_B 2024-01-15
6      Placebo      PBO 2024-01-15
7  Treatment B    TRT_B 2024-01-15
8  Treatment A    TRT_A 2024-01-15
9  Treatment A    TRT_A 2024-01-15
10     Placebo      PBO 2024-01-15                     
# Verify SDTM compliance
cat("\n=== DM Domain Validation ===\n")

=== DM Domain Validation ===
cat("Number of subjects:", nrow(dm_domain), "\n")
Number of subjects: 25 
cat("Required variables present:", all(c("STUDYID", "DOMAIN", "USUBJID") %in% names(dm_domain)), "\n")
Required variables present: TRUE 
cat("Unique subjects:", length(unique(dm_domain$USUBJID)), "\n")
Unique subjects: 25 

Vital Signs (VS) Domain

The VS domain contains vital signs measurements.

# Create SDTM VS (Vital Signs) Domain
vs_domain <- raw_vitals %>%
  
  # Create individual records for each vital sign measurement
  pivot_longer(
    cols = c(systolic_bp, diastolic_bp, heart_rate, weight_kg, height_cm, temperature_c),
    names_to = "vital_sign",
    values_to = "measurement"
  ) %>%
  
  # Map to SDTM variables
  mutate(
    STUDYID = STUDY_ID,
    DOMAIN = "VS", 
    USUBJID = paste(STUDY_ID, subject_id, sep = "-"),
    
    # Test details
    VSTESTCD = case_when(
      vital_sign == "systolic_bp" ~ "SYSBP",
      vital_sign == "diastolic_bp" ~ "DIABP", 
      vital_sign == "heart_rate" ~ "HR",
      vital_sign == "weight_kg" ~ "WEIGHT",
      vital_sign == "height_cm" ~ "HEIGHT",
      vital_sign == "temperature_c" ~ "TEMP",
      TRUE ~ ""
    ),
    
    VSTEST = case_when(
      VSTESTCD == "SYSBP" ~ "Systolic Blood Pressure",
      VSTESTCD == "DIABP" ~ "Diastolic Blood Pressure",
      VSTESTCD == "HR" ~ "Heart Rate",
      VSTESTCD == "WEIGHT" ~ "Weight",
      VSTESTCD == "HEIGHT" ~ "Height", 
      VSTESTCD == "TEMP" ~ "Temperature",
      TRUE ~ ""
    ),
    
    VSORRES = as.character(measurement),
    VSORRESU = case_when(
      VSTESTCD %in% c("SYSBP", "DIABP") ~ "mmHg",
      VSTESTCD == "HR" ~ "beats/min",
      VSTESTCD == "WEIGHT" ~ "kg",
      VSTESTCD == "HEIGHT" ~ "cm",
      VSTESTCD == "TEMP" ~ "C",
      TRUE ~ ""
    ),
    
    VSSTRESC = VSORRES,
    VSSTRESN = as.numeric(measurement),
    VSSTRESU = VSORRESU,
    
    # Visit information
    VISIT = visit,
    VISITNUM = case_when(
      visit == "Screening" ~ 1,
      visit == "Baseline" ~ 2,
      visit == "Week 2" ~ 3,
      visit == "Week 4" ~ 4,
      visit == "Week 8" ~ 5,
      visit == "Week 12" ~ 6,
      visit == "End of Study" ~ 7,
      TRUE ~ 99
    ),
    
    VSDTC = format(visit_date, "%Y-%m-%d"),
    VSDY = as.numeric(visit_date - STUDY_RFSTDTC) + 1,
    
    # Additional variables  
    VSPOS = "SITTING",
    VSLOC = case_when(
      VSTESTCD %in% c("SYSBP", "DIABP") ~ "ARM",
      TRUE ~ ""
    )
  ) %>%
  
  # Filter out missing measurements
  filter(!is.na(measurement)) %>%
  
  # Generate sequence numbers
  group_by(USUBJID, VISIT, VSTESTCD) %>%
  mutate(VSSEQ = row_number()) %>%
  ungroup() %>%
  
  # Select final SDTM variables
  select(STUDYID, DOMAIN, USUBJID, VSSEQ, VSTESTCD, VSTEST, VSORRES, VSORRESU,
         VSSTRESC, VSSTRESN, VSSTRESU, VISIT, VISITNUM, VSDTC, VSDY, VSPOS, VSLOC)

# Display sample of VS domain
print("SDTM VS Domain (first 15 records):")
[1] "SDTM VS Domain (first 15 records):"
print(vs_domain %>% slice(1:15))
# A tibble: 15 × 17
   STUDYID DOMAIN USUBJID     VSSEQ VSTESTCD VSTEST    VSORRES VSORRESU VSSTRESC
   <chr>   <chr>  <chr>       <int> <chr>    <chr>     <chr>   <chr>    <chr>
 1 ABC-123 VS     ABC-123-001     1 SYSBP    Systolic… 118     mmHg     118
 2 ABC-123 VS     ABC-123-001     1 DIABP    Diastoli… 98      mmHg     98
 3 ABC-123 VS     ABC-123-001     1 HR       Heart Ra… 69      beats/m… 69
 4 ABC-123 VS     ABC-123-001     1 WEIGHT   Weight    77.3    kg       77.3
 5 ABC-123 VS     ABC-123-001     1 HEIGHT   Height    178.8   cm       178.8
 6 ABC-123 VS     ABC-123-001     1 TEMP     Temperat… 36.3    C        36.3
 7 ABC-123 VS     ABC-123-001     1 SYSBP    Systolic… 143     mmHg     143
 8 ABC-123 VS     ABC-123-001     1 DIABP    Diastoli… 85      mmHg     85
 9 ABC-123 VS     ABC-123-001     1 HR       Heart Ra… 60      beats/m… 60
10 ABC-123 VS     ABC-123-001     1 WEIGHT   Weight    57.9    kg       57.9
11 ABC-123 VS     ABC-123-001     1 HEIGHT   Height    178.2   cm       178.2
12 ABC-123 VS     ABC-123-001     1 TEMP     Temperat… 36.2    C        36.2
13 ABC-123 VS     ABC-123-001     1 SYSBP    Systolic… 130     mmHg     130
14 ABC-123 VS     ABC-123-001     1 DIABP    Diastoli… 60      mmHg     60
15 ABC-123 VS     ABC-123-001     1 HR       Heart Ra… 63      beats/m… 63
# ℹ 8 more variables: VSSTRESN <dbl>, VSSTRESU <chr>, VISIT <chr>,
#   VISITNUM <dbl>, VSDTC <chr>, VSDY <dbl>, VSPOS <chr>, VSLOC <chr>
# Validation
cat("\n=== VS Domain Validation ===\n")

=== VS Domain Validation ===
cat("Total vital signs records:", nrow(vs_domain), "\n")
Total vital signs records: 834 
cat("Unique test codes:", length(unique(vs_domain$VSTESTCD)), "\n") 
Unique test codes: 6 
cat("Test codes present:", paste(unique(vs_domain$VSTESTCD), collapse = ", "), "\n")
Test codes present: SYSBP, DIABP, HR, WEIGHT, HEIGHT, TEMP 
cat("Date range:", min(vs_domain$VSDTC), "to", max(vs_domain$VSDTC), "\n")
Date range: 2024-01-22 to 2024-05-22 

Quality Control Checks

Data Quality Summary

cat("=== SDTM Data Quality Summary ===\n\n")
=== SDTM Data Quality Summary ===
# DM Domain QC
cat("DM Domain Checks:\n")
DM Domain Checks:
cat("- Complete USUBJID:", all(!is.na(dm_domain$USUBJID) & dm_domain$USUBJID != ""), "\n")
- Complete USUBJID: TRUE 
cat("- Valid AGE range:", all(dm_domain$AGE >= 18 & dm_domain$AGE <= 90), "\n")
- Valid AGE range: TRUE 
cat("- Sex values M/F only:", all(dm_domain$SEX %in% c("M", "F")), "\n")
- Sex values M/F only: TRUE 
cat("- ARM assigned to all:", all(!is.na(dm_domain$ARM) & dm_domain$ARM != ""), "\n")
- ARM assigned to all: TRUE 
# VS Domain QC  
cat("\nVS Domain Checks:\n")

VS Domain Checks:
cat("- All records have USUBJID:", all(!is.na(vs_domain$USUBJID)), "\n")
- All records have USUBJID: TRUE 
cat("- Numeric results present:", all(!is.na(vs_domain$VSSTRESN)), "\n")
- Numeric results present: TRUE 
cat("- Units consistent per test:", 
    vs_domain %>% 
      group_by(VSTESTCD) %>% 
      summarise(unit_count = n_distinct(VSSTRESU)) %>% 
      summarise(all_single_unit = all(unit_count == 1)) %>% 
      pull(all_single_unit), "\n")
- Units consistent per test: TRUE 
# Cross-domain checks
cat("\nCross-Domain Checks:\n") 

Cross-Domain Checks:
common_subjects_dm <- unique(dm_domain$USUBJID)
common_subjects_vs <- unique(vs_domain$USUBJID)
cat("- Subjects in both DM and VS:", length(intersect(common_subjects_dm, common_subjects_vs)), "\n")
- Subjects in both DM and VS: 25 
cat("- Subjects only in DM:", length(setdiff(common_subjects_dm, common_subjects_vs)), "\n")
- Subjects only in DM: 0 
cat("- Subjects only in VS:", length(setdiff(common_subjects_vs, common_subjects_dm)), "\n")
- Subjects only in VS: 0 

Export SDTM Datasets

# Export to SAS transport files (XPT format)
# haven::write_xpt(list(dm = dm_domain), "dm.xpt")
# haven::write_xpt(list(vs = vs_domain), "vs.xpt") 

# Or export to CSV for review
# write.csv(dm_domain, "dm.csv", row.names = FALSE)
# write.csv(vs_domain, "vs.csv", row.names = FALSE)

cat("SDTM datasets ready for export\n")
cat("- DM domain: ", nrow(dm_domain), " records\n")
cat("- VS domain: ", nrow(vs_domain), " records\n")

Key Programming Notes

R vs SAS Equivalencies

  • Data manipulation: dplyr replaces DATA steps
  • Conditional logic: case_when() replaces IF-THEN-ELSE
  • Date calculations: lubridate functions replace SAS date functions
  • Reshaping data: pivot_longer() replaces PROC TRANSPOSE
  • Grouping operations: group_by() %>% summarise() replaces PROC MEANS

CDISC Compliance

  • All required SDTM variables are included
  • Controlled terminology is properly applied
  • Cross-domain relationships are maintained
  • Data types follow SDTM standards

Validation Strategy

  1. Metadata-driven programming using sdtm.oak (when available)
  2. Automated quality checks for data completeness
  3. Cross-domain validation for referential integrity
  4. Controlled terminology validation against CDISC CT

This guide demonstrates production-ready SDTM programming techniques for clinical trials data processing using R.