使用R语言生成CDISC SDTM.AE domain

写在前面

  • 使用的是Rstudio

  • 其实R已经有生成sdtm相关的package,以下代码仅作为练习R语言的语法,不是高效生成sdtm的方法

  • 代码中没有解决的问题包括:EPOCH相关的逻辑没有考虑partial date的情况;在使用arrange() function做-SEQ排序时,关于大小写英文字母的排序机制似乎与SAS语言的sort function有所不同,导致使用相同的排序变量,通过R和SAS排序后,record的顺序会有不同;输出xpt结果是乱码,暂时输出到csv文件中。

  • 还没有写生成SUPPAE的代码

  • 代码参考了以下材料

Generating .xpt files with SAS, R and Python

https://www.pharmasug.org/proceedings/2021/EP/PharmaSUG-2021-EP-057.pdf

Yotube @mycsg

mycsg TASKS-SDTMGEN

以下是R代码

setwd('C://R_software')

library(haven)

library(dplyr)

library(tidyverse)

library(sas7bdat)

library(SASxport)

library(Hmisc)

import source data

raw_ae_001 <- read_sas('C://rawdata/ae_001.sas7bdat')

raw_meddrathsaurus <- read_sas('C://rawdata/meddrathesaurus.sas7bdat')

sdtm_dm <- read_sas('C://sdtmdata/SDTM/DM.sas7bdat')

sdtm_se <- read_sas('C://rawdata/SE.sas7bdat')

Update the variable name to uppercase, because var name is case sensitive in R

names(raw_ae_001) <- toupper(names(raw_ae_001))

names(raw_meddrathsaurus) <- toupper(names(raw_meddrathsaurus))

Filter ae raw data with AETERM not missing, and keep necessary variables

ae <- raw_ae_001 %>%

select(SUBJECT,RECORDPOSITION,AETERM,AESTDAT_RAW,AESTTIM,AEENDAT_RAW,AEENTIM,

AESEV_STD,AESER_STD,AEACN_STD,AEREL_STD,AEREL_WD_STD,AEPATT_STD,AEOUT_STD,AESCONG_STD,

AESDISAB_STD,AESDTH_STD,AESHOSP_STD,AESLIFE_STD,AESMIE_STD,AEONGO) %>%

filter(AETERM != "")

Update AETERM value to uppercase in order to merge with source MedDRA coding data

aeAETERM \<- toupper(aeAETERM)

Filter MedDRA source data with AE pannel only

meddra <- raw_meddrathsaurus %>%

filter(PANEL=="AE")

Merge AE and MedDRA data (left join) by AETERM, create AESTDTC/AEENDTC

ae1 <- merge (ae, meddra, by.x = c("AETERM"), by.y = c("VERBATIM"), all.x = T) %>%

create AESTDTC

mutate(

stdayn = suppressWarnings(as.numeric(word(AESTDAT_RAW,1))), ### as.numeric>>input, word>>scan

stday = if_else(!is.na(stdayn), str_pad(stdayn, width = 2, pad = "0"), "-"), ### is.na>>not missing, !>>not, str_pad>>put xx.

stmonthc = str_to_upper(word(AESTDAT_RAW, 2)), ### str_to_upper>>uppercase

stmonth = case_when(

stmonthc == "JAN" ~ "01",

stmonthc == "FEB" ~ "02",

stmonthc == "MAR" ~ "03",

stmonthc == "APR" ~ "04",

stmonthc == "MAY" ~ "05",

stmonthc == "JUN" ~ "06",

stmonthc == "JUL" ~ "07",

stmonthc == "AUG" ~ "08",

stmonthc == "SEP" ~ "09",

stmonthc == "OCT" ~ "10",

stmonthc == "NOV" ~ "11",

stmonthc == "DEC" ~ "12",

TRUE ~ "-"

),

styear = word(AESTDAT_RAW,3),

styear1 = if_else((styear == "UNK") | (is.na(styear)), "-", styear), ### | >> or

aestdate = str_c(styear1, stmonth, stday, sep = "-"), ### str_c >> catx

AESTDTC = if_else(AESTTIM != "", str_c(aestdate, str_pad(AESTTIM, width = 5, pad = "0"), sep = "T"), aestdate),

AESTDTC = if_else(str_sub(AESTDTC, -5) == "-----", "", AESTDTC),

AESTDTC = if_else(str_sub(AESTDTC, -4) == "----", str_sub(AESTDTC,end=-5), AESTDTC),

AESTDTC = if_else(str_sub(AESTDTC, -2) == "--", str_sub(AESTDTC,end=-3), AESTDTC)

) %>%

create AEENDTC

mutate(

endayn = suppressWarnings(as.numeric(word(AEENDAT_RAW,1))), ### as.numeric>>input, word>>scan

enday = if_else(!is.na(endayn), str_pad(endayn, width = 2, pad = "0"), "-"), ### is.na>>not missing, !>>not, str_pad>>put xx.

enmonthc = str_to_upper(word(AEENDAT_RAW, 2)), ### str_to_upper>>uppercase

enmonth = case_when(

enmonthc == "JAN" ~ "01",

enmonthc == "FEB" ~ "02",

enmonthc == "MAR" ~ "03",

enmonthc == "APR" ~ "04",

enmonthc == "MAY" ~ "05",

enmonthc == "JUN" ~ "06",

enmonthc == "JUL" ~ "07",

enmonthc == "AUG" ~ "08",

enmonthc == "SEP" ~ "09",

enmonthc == "OCT" ~ "10",

enmonthc == "NOV" ~ "11",

enmonthc == "DEC" ~ "12",

TRUE ~ "-"

),

enyear = word(AEENDAT_RAW,3),

enyear1 = if_else((enyear == "UNK") | (is.na(enyear)), "-", enyear), ### | >> or

aeendate = str_c(enyear1, enmonth, enday, sep = "-"), ### str_c >> catx

AEENDTC = if_else(AEENTIM != "", str_c(aeendate, str_pad(AEENTIM, width = 5, pad = "0"), sep = "T"), aeendate),

AEENDTC = if_else(str_sub(AEENDTC, -5) == "-----", "", AEENDTC),

AEENDTC = if_else(str_sub(AEENDTC, -4) == "----", str_sub(AEENDTC,end=-5), AEENDTC),

AEENDTC = if_else(str_sub(AEENDTC, -2) == "--", str_sub(AEENDTC,end=-3), AEENDTC)

)

Create AE domain vars

ae2 <- ae1 %>%

cbind(

STUDYID=c("PROTOCOLID"),

DOMAIN=c("AE"),

USUBJID=str_c(c("PROTOCOLID-0"),substr(ae1SUBJECT,4,6),c("-00"),substr(ae1SUBJECT,7,9)), # str_c() is catx() in SAS

SUBJID=ae1$SUBJECT,

AESPID=str_c(c("AE_001-"),ae1$RECORDPOSITION),

AELLT=ae1$LLT_NAME,

AELLTCD=ae1$LLT_CODE,

AEDECOD=ae1$PT_NAME,

AEPTCD=ae1$PT_CODE,

AEHLT=ae1$HLT_NAME,

AEHLTCD=ae1$HLT_CODE,

AEHLGT=ae1$HGT_NAME,

AEHLGTCD=ae1$HGT_CODE,

AEBODSYS=ae1$SOC_NAME,

AEBDSYCD=ae1$SOC_CODE,

AESOC=ae1$SOC_NAME,

AESOCCD=ae1$SOC_CODE,

AESEV=ae1$AESEV_STD,

AESER=ae1$AESER_STD,

AEACN=ae1$AEACN_STD,

AEREL=ae1$AEREL_STD,

AERELNST=ae1$AEREL_WD_STD,

AEPATT=ae1$AEPATT_STD,

AEOUT=ae1$AEOUT_STD,

AESCONG=ae1$AESCONG_STD,

AESDISAB=ae1$AESDISAB_STD,

AESDTH=ae1$AESDTH_STD,

AESHOSP=ae1$AESHOSP_STD,

AESLIFE=ae1$AESLIFE_STD,

AESMIE=ae1$AESMIE_STD

)

Merge AE and SDTM.DM by USUBJID, create AESTDY/AEENDY

sdtm_dm <- select(sdtm_dm,USUBJID,RFSTDTC,RFENDTC)

ae3 <- merge (ae2, sdtm_dm, by = c("USUBJID"), all.x = T) %>%

mutate(

aestdt=as.Date(AESTDTC),

rfstdt=as.Date(RFSTDTC),

rfstdate=str_sub(RFSTDTC,1,10),

rfst_year=str_sub(RFSTDTC,1,4),

rfst_month=str_sub(RFSTDTC,6,7),

rfst_day=str_sub(RFSTDTC,9,10),

AESTDY=ifelse(!is.na(aestdt) & !is.na(rfstdt),

ifelse((aestdt>=rfstdt),aestdt-rfstdt+1,aestdt-rfstdt), ""

)

) %>%

mutate(

aeendt=as.Date(AEENDTC),

rfstdt=as.Date(RFSTDTC),

AEENDY=ifelse(!is.na(aeendt) & !is.na(rfstdt),

ifelse((aeendt>=rfstdt),aeendt-rfstdt+1,aeendt-rfstdt), ""

)

) %>%

create AEENRTPT, AEENTPT

mutate(

AEENRTPT=ifelse(AEONGO==1,"ONGOING",""),

AEENTPT=ifelse(AEONGO==1,

ifelse(is.na(rfstdt)==T,"SCREENING","END OF STUDY"),""

)

)

prepare SE dataset for creating EPOCH

sdtm_se <- select(sdtm_se,USUBJID,ETCD,SESTDTC,SEENDTC)

sest <- sdtm_se %>%

select(USUBJID,ETCD,SESTDTC) %>%

pivot_wider(names_from=ETCD, values_from=SESTDTC)

colnames(sest) <- c("USUBJID","st1","st2","st3")

seen <- sdtm_se %>%

select(USUBJID,ETCD,SEENDTC) %>%

pivot_wider(names_from=ETCD, values_from=SEENDTC)

colnames(seen) <- c("USUBJID","en1","en2","en3")

sesten <- merge (sest, seen, by = c("USUBJID"))

ae4 <- merge (ae3, sesten, by = c("USUBJID"), all.x = T)

ae5 <- ae4 %>%

mutate(EPOCH=NA) %>%

mutate(

EPOCH=ifelse((st1<=aestdt & aestdt<en1) | (aestdt<=en1 & is.na(st2)==T), "SCREENING",ifelse(st2<=aestdt & aestdt<=en2, "TREATMENT", "FOLLOW-UP"))

) %>%

#mutate(

EPOCH=ifelse(!is.na(EPOCH)==T & !is.na(stday)==T, EPOCH, ifelse())

#) %>%

arrange(STUDYID,USUBJID,AEDECOD,AESTDTC,AEENDTC,AESPID) %>%

group_by(USUBJID) %>%

mutate(AESEQ=row_number())

select target vars in AE

sdtm_ae <- select(ae5,STUDYID,DOMAIN,USUBJID,SUBJID,AESEQ,AESPID,

AETERM,AELLT,AELLTCD,AEDECOD,AEPTCD,AEHLT,AEHLTCD,AEHLGT,AEHLGTCD,AEBODSYS,AEBDSYCD,AESOC,AESOCCD,

AESEV,AESER,AEACN,AEREL,AERELNST,AEPATT,AEOUT,AESCONG,AESDISAB,AESDTH,AESHOSP,AESLIFE,AESMIE,

EPOCH,AESTDTC,AEENDTC,AESTDY,AEENDY,AEENRTPT,AEENTPT)

convert following vars to numeric per CDSIC definition

sdtm_aeAELLTCD \<- as.numeric(sdtm_aeAELLTCD)

sdtm_aeAEPTCD \<- as.numeric(sdtm_aeAEPTCD)

sdtm_aeAEHLTCD \<- as.numeric(sdtm_aeAEHLTCD)

sdtm_aeAEHLGTCD \<- as.numeric(sdtm_aeAEHLGTCD)

sdtm_aeAEBDSYCD \<- as.numeric(sdtm_aeAEBDSYCD)

sdtm_aeAESOCCD \<- as.numeric(sdtm_aeAESOCCD)

sdtm_aeAESTDY \<- as.numeric(sdtm_aeAESTDY)

sdtm_aeAEENDY \<- as.numeric(sdtm_aeAEENDY)

convert NA to null

sdtm_aeAESTDY\[is.na(sdtm_aeAESTDY)] <- ""

sdtm_aeAEENDY\[is.na(sdtm_aeAEENDY)] <- ""

add label

label(sdtm_ae) <- "Adverse Events"

label(sdtm_ae$STUDYID) <- "Study Identifier"

label(sdtm_ae$DOMAIN) <- "Domain Abbreviation"

label(sdtm_ae$USUBJID) <- "Unique Subject Identifier"

label(sdtm_ae$SUBJID) <- "Subject Identifier for the Study"

label(sdtm_ae$AESEQ) <- "Sequence Number"

label(sdtm_ae$AESPID) <- "Sponsor-Defined Identifier"

label(sdtm_ae$AETERM) <- "Reported Term for the Adverse Event"

label(sdtm_ae$AELLT) <- "Lowest Level Term"

label(sdtm_ae$AELLTCD) <- "Lowest Level Term Code"

label(sdtm_ae$AEDECOD) <- "Dictionary-Derived Term"

label(sdtm_ae$AEPTCD) <- "Preferred Term Code"

label(sdtm_ae$AEHLT) <- "High Level Term"

label(sdtm_ae$AEHLTCD) <- "High Level Term Code"

label(sdtm_ae$AEHLGT) <- "High Level Group Term"

label(sdtm_ae$AEHLGTCD) <- "High Level Group Term Code"

label(sdtm_ae$AEBODSYS) <- "Body System or Organ Class"

label(sdtm_ae$AEBDSYCD) <- "Body System or Organ Class Code"

label(sdtm_ae$AESOC) <- "Primary System Organ Class"

label(sdtm_ae$AESOCCD) <- "Primary System Organ Class Code"

label(sdtm_ae$AESEV) <- "Severity/Intensity"

label(sdtm_ae$AESER) <- "Serious Event"

label(sdtm_ae$AEACN) <- "Action Taken with Study Treatment"

label(sdtm_ae$AEREL) <- "Causality"

label(sdtm_ae$AERELNST) <- "Relationship to Non-Study Treatment"

label(sdtm_ae$AEPATT) <- "Pattern of Adverse Event"

label(sdtm_ae$AEOUT) <- "Outcome of Adverse Event"

label(sdtm_ae$AESCONG) <- "Congenital Anomaly or Birth Defect"

label(sdtm_ae$AESDISAB) <- "Persist or Signif Disability/Incapacity"

label(sdtm_ae$AESDTH) <- "Results in Death"

label(sdtm_ae$AESHOSP) <- "Requires or Prolongs Hospitalization"

label(sdtm_ae$AESLIFE) <- "Is Life Threatening"

label(sdtm_ae$AESMIE) <- "Other Medically Important Serious Event"

label(sdtm_ae$EPOCH) <- "Epoch"

label(sdtm_ae$AESTDTC) <- "Start Date/Time of Adverse Event"

label(sdtm_ae$AEENDTC) <- "End Date/Time of Adverse Event"

label(sdtm_ae$AESTDY) <- "Study Day of Start of Adverse Event"

label(sdtm_ae$AEENDY) <- "Study Day of End of Adverse Event"

label(sdtm_ae$AEENRTPT) <- "End Relative to Reference Time Point"

label(sdtm_ae$AEENTPT) <- "End Reference Time Point"

#export to xpt

write.xport(sdtm_ae, file="C://R_software/ae_R.xpt")

export to CSV

write.csv(sdtm_ae, file="C://R_software/ae.csv")

相关推荐
让学习成为一种生活方式5 分钟前
R包下载太慢安装中止的解决策略-R语言003
java·数据库·r语言
有梦想的Frank博士1 天前
R语言*号标识显著性差异判断组间差异是否具有统计意义
开发语言·信息可视化·r语言
逆风远航4 天前
R语言贝叶斯:INLA下的贝叶斯回归、生存分析、随机游走、广义可加模型、极端数据的贝叶斯分析
开发语言·r语言·贝叶斯·生态学·结构方程·环境科学·混合效应
Faxxtty4 天前
【R语言】解决package ‘qvalue’ is not available (for R version 3.6.1)
开发语言·r语言
Cachel wood5 天前
Vue前端框架:Vue前端项目文件目录
java·前端·vue.js·python·算法·r语言·前端框架
邢博士谈科教6 天前
OmicsTools软件和R语言分析环境安装配置答疑汇总最新版
r语言
纪伊路上盛名在6 天前
vscode中提升效率的插件扩展——待更新
linux·服务器·ide·vscode·python·r语言·编辑器
拓端研究室TRL6 天前
银行信贷风控专题:Python、R 语言机器学习数据挖掘应用实例合集:xgboost、决策树、随机森林、贝叶斯等...
python·决策树·机器学习·数据挖掘·r语言
在在进步6 天前
R数据结构&向量基础
数据结构·r语言
琼火hu8 天前
R语言笔记(五):Apply函数
开发语言·笔记·r语言·apply