写在前面
- 使用的是Rstudio
- 其实R已经有生成sdtm相关的package,以下代码仅作为练习R语言的语法,不是高效生成sdtm的方法
- 代码中没有解决的问题包括:EPOCH相关的逻辑没有考虑partial date的情况;在使用arrange() function做-SEQ排序时,关于大小写英文字母的排序机制似乎与SAS语言的sort function有所不同,导致使用相同的排序变量,通过R和SAS排序后,record的顺序会有不同;输出xpt结果是乱码,暂时输出到csv文件中。
- 还没有写生成SUPPAE的代码
- 代码参考了以下材料
Generating .xpt files with SAS, R and Python
https://www.pharmasug.org/proceedings/2021/EP/PharmaSUG-2021-EP-057.pdf
Yotube @mycsg
mycsg TASKS-SDTMGEN
以下是R代码
setwd('C://R_software')
 library(haven)
 library(dplyr)
 library(tidyverse)
 library(sas7bdat)
 library(SASxport)
 library(Hmisc)
# import source data
 raw_ae_001 <- read_sas('C://rawdata/ae_001.sas7bdat')
 raw_meddrathsaurus <- read_sas('C://rawdata/meddrathesaurus.sas7bdat')
 sdtm_dm <- read_sas('C://sdtmdata/SDTM/DM.sas7bdat')
 sdtm_se <- read_sas('C://rawdata/SE.sas7bdat')
# Update the variable name to uppercase, because var name is case sensitive in R
 names(raw_ae_001) <- toupper(names(raw_ae_001))
 names(raw_meddrathsaurus) <- toupper(names(raw_meddrathsaurus))
# Filter ae raw data with AETERM not missing, and keep necessary variables
 ae <- raw_ae_001 %>%
   select(SUBJECT,RECORDPOSITION,AETERM,AESTDAT_RAW,AESTTIM,AEENDAT_RAW,AEENTIM,
          AESEV_STD,AESER_STD,AEACN_STD,AEREL_STD,AEREL_WD_STD,AEPATT_STD,AEOUT_STD,AESCONG_STD,
          AESDISAB_STD,AESDTH_STD,AESHOSP_STD,AESLIFE_STD,AESMIE_STD,AEONGO) %>% 
   filter(AETERM != "") 
 # Update AETERM value to uppercase in order to merge with source MedDRA coding data
 ae$AETERM <- toupper(ae$AETERM)
# Filter MedDRA source data with AE pannel only
 meddra <- raw_meddrathsaurus  %>%
   filter(PANEL=="AE")
# Merge AE and MedDRA data (left join) by AETERM, create AESTDTC/AEENDTC 
 ae1 <- merge (ae, meddra, by.x = c("AETERM"), by.y = c("VERBATIM"), all.x = T) %>%
   # create AESTDTC
   mutate(
     stdayn = suppressWarnings(as.numeric(word(AESTDAT_RAW,1))), ### as.numeric>>input, word>>scan
     stday = if_else(!is.na(stdayn), str_pad(stdayn, width = 2, pad = "0"), "-"), ### is.na>>not missing, !>>not, str_pad>>put xx.
     stmonthc = str_to_upper(word(AESTDAT_RAW, 2)), ### str_to_upper>>uppercase
     stmonth = case_when(
       stmonthc == "JAN" ~ "01",
       stmonthc == "FEB" ~ "02",
       stmonthc == "MAR" ~ "03",
       stmonthc == "APR" ~ "04",
       stmonthc == "MAY" ~ "05",
       stmonthc == "JUN" ~ "06",
       stmonthc == "JUL" ~ "07",
       stmonthc == "AUG" ~ "08",
       stmonthc == "SEP" ~ "09",
       stmonthc == "OCT" ~ "10",
       stmonthc == "NOV" ~ "11",
       stmonthc == "DEC" ~ "12",
       TRUE ~ "-"
     ),
     styear = word(AESTDAT_RAW,3),
     styear1 = if_else((styear == "UNK") | (is.na(styear)), "-", styear), ### | >> or
     aestdate = str_c(styear1, stmonth, stday, sep = "-"), ### str_c >> catx
     AESTDTC = if_else(AESTTIM != "", str_c(aestdate, str_pad(AESTTIM, width = 5, pad = "0"), sep = "T"), aestdate),
     
     AESTDTC = if_else(str_sub(AESTDTC, -5) == "-----", "", AESTDTC),
     AESTDTC = if_else(str_sub(AESTDTC, -4) == "----", str_sub(AESTDTC,end=-5), AESTDTC),
     AESTDTC = if_else(str_sub(AESTDTC, -2) == "--", str_sub(AESTDTC,end=-3), AESTDTC)
   ) %>%
   # create AEENDTC
   mutate(
     endayn = suppressWarnings(as.numeric(word(AEENDAT_RAW,1))), ### as.numeric>>input, word>>scan
     enday = if_else(!is.na(endayn), str_pad(endayn, width = 2, pad = "0"), "-"), ### is.na>>not missing, !>>not, str_pad>>put xx.
     enmonthc = str_to_upper(word(AEENDAT_RAW, 2)), ### str_to_upper>>uppercase
     enmonth = case_when(
       enmonthc == "JAN" ~ "01",
       enmonthc == "FEB" ~ "02",
       enmonthc == "MAR" ~ "03",
       enmonthc == "APR" ~ "04",
       enmonthc == "MAY" ~ "05",
       enmonthc == "JUN" ~ "06",
       enmonthc == "JUL" ~ "07",
       enmonthc == "AUG" ~ "08",
       enmonthc == "SEP" ~ "09",
       enmonthc == "OCT" ~ "10",
       enmonthc == "NOV" ~ "11",
       enmonthc == "DEC" ~ "12",
       TRUE ~ "-"
     ),
     enyear = word(AEENDAT_RAW,3),
     enyear1 = if_else((enyear == "UNK") | (is.na(enyear)), "-", enyear), ### | >> or
     aeendate = str_c(enyear1, enmonth, enday, sep = "-"), ### str_c >> catx
     AEENDTC = if_else(AEENTIM != "", str_c(aeendate, str_pad(AEENTIM, width = 5, pad = "0"), sep = "T"), aeendate),
     
     AEENDTC = if_else(str_sub(AEENDTC, -5) == "-----", "", AEENDTC),
     AEENDTC = if_else(str_sub(AEENDTC, -4) == "----", str_sub(AEENDTC,end=-5), AEENDTC),
     AEENDTC = if_else(str_sub(AEENDTC, -2) == "--", str_sub(AEENDTC,end=-3), AEENDTC)
   )
 # Create AE domain vars
 ae2 <- ae1 %>%
   cbind(
     STUDYID=c("PROTOCOLID"), 
     DOMAIN=c("AE"), 
     USUBJID=str_c(c("PROTOCOLID-0"),substr(ae1$SUBJECT,4,6),c("-00"),substr(ae1$SUBJECT,7,9)), # str_c() is catx() in SAS
     SUBJID=ae1$SUBJECT,
     AESPID=str_c(c("AE_001-"),ae1$RECORDPOSITION),
     AELLT=ae1$LLT_NAME,
     AELLTCD=ae1$LLT_CODE,
     AEDECOD=ae1$PT_NAME,
     AEPTCD=ae1$PT_CODE,
     AEHLT=ae1$HLT_NAME,
     AEHLTCD=ae1$HLT_CODE,
     AEHLGT=ae1$HGT_NAME,
     AEHLGTCD=ae1$HGT_CODE,
     AEBODSYS=ae1$SOC_NAME,
     AEBDSYCD=ae1$SOC_CODE,
     AESOC=ae1$SOC_NAME,
     AESOCCD=ae1$SOC_CODE,
     AESEV=ae1$AESEV_STD,
     AESER=ae1$AESER_STD,
     AEACN=ae1$AEACN_STD,
     AEREL=ae1$AEREL_STD,
     AERELNST=ae1$AEREL_WD_STD,
     AEPATT=ae1$AEPATT_STD,
     AEOUT=ae1$AEOUT_STD,
     AESCONG=ae1$AESCONG_STD,
     AESDISAB=ae1$AESDISAB_STD,
     AESDTH=ae1$AESDTH_STD,
     AESHOSP=ae1$AESHOSP_STD,
     AESLIFE=ae1$AESLIFE_STD,
     AESMIE=ae1$AESMIE_STD
   )  
# Merge AE and SDTM.DM by USUBJID, create AESTDY/AEENDY
 sdtm_dm <- select(sdtm_dm,USUBJID,RFSTDTC,RFENDTC)
ae3 <- merge (ae2, sdtm_dm, by = c("USUBJID"), all.x = T) %>%
   mutate(
     aestdt=as.Date(AESTDTC),
     rfstdt=as.Date(RFSTDTC),
     rfstdate=str_sub(RFSTDTC,1,10),
     rfst_year=str_sub(RFSTDTC,1,4),
     rfst_month=str_sub(RFSTDTC,6,7),
     rfst_day=str_sub(RFSTDTC,9,10),
     AESTDY=ifelse(!is.na(aestdt) & !is.na(rfstdt),
               ifelse((aestdt>=rfstdt),aestdt-rfstdt+1,aestdt-rfstdt), ""
                   )
   ) %>%
   mutate(
     aeendt=as.Date(AEENDTC),
     rfstdt=as.Date(RFSTDTC),
     AEENDY=ifelse(!is.na(aeendt) & !is.na(rfstdt),
                   ifelse((aeendt>=rfstdt),aeendt-rfstdt+1,aeendt-rfstdt), ""
             )
   ) %>%
   # create AEENRTPT, AEENTPT
   mutate(
     AEENRTPT=ifelse(AEONGO==1,"ONGOING",""),
     AEENTPT=ifelse(AEONGO==1,
                    ifelse(is.na(rfstdt)==T,"SCREENING","END OF STUDY"),""
                   )
   )
# prepare SE dataset for creating EPOCH
 sdtm_se <- select(sdtm_se,USUBJID,ETCD,SESTDTC,SEENDTC)
sest <- sdtm_se %>%
   select(USUBJID,ETCD,SESTDTC) %>%
   pivot_wider(names_from=ETCD, values_from=SESTDTC)
colnames(sest) <- c("USUBJID","st1","st2","st3")
 seen <- sdtm_se %>%
   select(USUBJID,ETCD,SEENDTC) %>%
   pivot_wider(names_from=ETCD, values_from=SEENDTC)
colnames(seen) <- c("USUBJID","en1","en2","en3")
sesten <- merge (sest, seen, by = c("USUBJID"))
ae4 <- merge (ae3, sesten, by = c("USUBJID"), all.x = T)
ae5 <- ae4 %>%
   mutate(EPOCH=NA) %>%
   mutate(
     EPOCH=ifelse((st1<=aestdt & aestdt<en1) | (aestdt<=en1 & is.na(st2)==T), "SCREENING",ifelse(st2<=aestdt & aestdt<=en2, "TREATMENT", "FOLLOW-UP"))
   ) %>%
   #mutate(
   #  EPOCH=ifelse(!is.na(EPOCH)==T & !is.na(stday)==T, EPOCH, ifelse())
   #) %>%
   arrange(STUDYID,USUBJID,AEDECOD,AESTDTC,AEENDTC,AESPID) %>%
   group_by(USUBJID) %>%
   mutate(AESEQ=row_number())
# select target vars in AE
 sdtm_ae <- select(ae5,STUDYID,DOMAIN,USUBJID,SUBJID,AESEQ,AESPID,
             AETERM,AELLT,AELLTCD,AEDECOD,AEPTCD,AEHLT,AEHLTCD,AEHLGT,AEHLGTCD,AEBODSYS,AEBDSYCD,AESOC,AESOCCD,
             AESEV,AESER,AEACN,AEREL,AERELNST,AEPATT,AEOUT,AESCONG,AESDISAB,AESDTH,AESHOSP,AESLIFE,AESMIE,
             EPOCH,AESTDTC,AEENDTC,AESTDY,AEENDY,AEENRTPT,AEENTPT)
# convert following vars to numeric per CDSIC definition
 sdtm_ae$AELLTCD <- as.numeric(sdtm_ae$AELLTCD)
 sdtm_ae$AEPTCD <- as.numeric(sdtm_ae$AEPTCD)
 sdtm_ae$AEHLTCD <- as.numeric(sdtm_ae$AEHLTCD)
 sdtm_ae$AEHLGTCD <- as.numeric(sdtm_ae$AEHLGTCD)
 sdtm_ae$AEBDSYCD <- as.numeric(sdtm_ae$AEBDSYCD)
 sdtm_ae$AESOCCD <- as.numeric(sdtm_ae$AESOCCD)
 sdtm_ae$AESTDY <- as.numeric(sdtm_ae$AESTDY)
 sdtm_ae$AEENDY <- as.numeric(sdtm_ae$AEENDY)
# convert NA to null
 sdtm_ae$AESTDY[is.na(sdtm_ae$AESTDY)] <- ""
 sdtm_ae$AEENDY[is.na(sdtm_ae$AEENDY)] <- ""
# add label
 label(sdtm_ae) <- "Adverse Events"
 label(sdtm_ae$STUDYID)   <- "Study Identifier"                       
 label(sdtm_ae$DOMAIN)    <- "Domain Abbreviation"                    
 label(sdtm_ae$USUBJID)   <- "Unique Subject Identifier"              
 label(sdtm_ae$SUBJID)    <- "Subject Identifier for the Study"       
 label(sdtm_ae$AESEQ)     <- "Sequence Number"                      
 label(sdtm_ae$AESPID)    <- "Sponsor-Defined Identifier"             
 label(sdtm_ae$AETERM)    <- "Reported Term for the Adverse Event"    
 label(sdtm_ae$AELLT)     <- "Lowest Level Term"                      
 label(sdtm_ae$AELLTCD)   <- "Lowest Level Term Code"                 
 label(sdtm_ae$AEDECOD)   <- "Dictionary-Derived Term"                
 label(sdtm_ae$AEPTCD)    <- "Preferred Term Code"                    
 label(sdtm_ae$AEHLT)     <- "High Level Term"                        
 label(sdtm_ae$AEHLTCD)   <- "High Level Term Code"                   
 label(sdtm_ae$AEHLGT)    <- "High Level Group Term"                  
 label(sdtm_ae$AEHLGTCD)  <- "High Level Group Term Code"             
 label(sdtm_ae$AEBODSYS)  <- "Body System or Organ Class"             
 label(sdtm_ae$AEBDSYCD)  <- "Body System or Organ Class Code"        
 label(sdtm_ae$AESOC)     <- "Primary System Organ Class"             
 label(sdtm_ae$AESOCCD)   <- "Primary System Organ Class Code"        
 label(sdtm_ae$AESEV)     <- "Severity/Intensity"                     
 label(sdtm_ae$AESER)     <- "Serious Event"                          
 label(sdtm_ae$AEACN)     <- "Action Taken with Study Treatment"      
 label(sdtm_ae$AEREL)     <- "Causality"                              
 label(sdtm_ae$AERELNST)  <- "Relationship to Non-Study Treatment"    
 label(sdtm_ae$AEPATT)    <- "Pattern of Adverse Event"               
 label(sdtm_ae$AEOUT)     <- "Outcome of Adverse Event"               
 label(sdtm_ae$AESCONG)   <- "Congenital Anomaly or Birth Defect"     
 label(sdtm_ae$AESDISAB)  <- "Persist or Signif Disability/Incapacity"
 label(sdtm_ae$AESDTH)    <- "Results in Death"                       
 label(sdtm_ae$AESHOSP)   <- "Requires or Prolongs Hospitalization"   
 label(sdtm_ae$AESLIFE)   <- "Is Life Threatening"                    
 label(sdtm_ae$AESMIE)    <- "Other Medically Important Serious Event"
 label(sdtm_ae$EPOCH)     <- "Epoch"                                 
 label(sdtm_ae$AESTDTC)   <- "Start Date/Time of Adverse Event"       
 label(sdtm_ae$AEENDTC)   <- "End Date/Time of Adverse Event"         
 label(sdtm_ae$AESTDY)    <- "Study Day of Start of Adverse Event"    
 label(sdtm_ae$AEENDY)    <- "Study Day of End of Adverse Event"      
 label(sdtm_ae$AEENRTPT)  <- "End Relative to Reference Time Point"   
 label(sdtm_ae$AEENTPT)   <- "End Reference Time Point"               
  
 #export to xpt  
 write.xport(sdtm_ae, file="C://R_software/ae_R.xpt")
 # export to CSV
 write.csv(sdtm_ae, file="C://R_software/ae.csv")