跳转至

01-数据读入&数据处理模块开发指南

作者:王靖雅
发布日期:2025-03-25
版本:v1.0

Content


Part1: 数据读入

对于MediSum模块开发项目: - 不涉及到对数据读入模块功能的修改,GitLab中会自动配置对应程序,**禁止**修改项目中该部分代码。 - 如需增加其他用户自定义文件的适配,请提供上传的自定义文件EXCEL模板并通知MediSum负责人(王靖雅,jingya.wang@hengrui.com)。 - 可使用该网页上传百奥知项目数据并下载转换完毕的RData格式EDC数据文件,用于数据处理模块部分的本地编码和测试:https://shinyapp.hengrui.com/test_app/readEDCtxt/

对于其他ShinyAPP开发项目: - 涉及到数据读入模块功能修改的项目,可参考百奥知EDC数据读入代码:fct_readEDCtxt.R - 在shiny中调用readEDCtxt函数代码块参考:

  ### Part1: data check and upload ----
  #### check errors:
  ## error1: cannot read the input folder name
  ## error2: folder with no sas file detect
  ## error3: wrong folder address: cannot detect txt file
  error <- ""
  info <- ""
  if (length(input$edcupload) == 0) {
    error <- paste0(error, "ERROR:未检测到EDC文件上传")
    message(error)
  } else {
    ## extract dir and rename file
    tempdir <- dirname(input$edcupload$datapath[1])
    file.rename(input$edcupload$datapath, paste0(tempdir, "/", input$edcupload$name))

    if (min(max(grepl("sas", tools::file_ext(list.files(tempdir)))), max(grepl("txt", tools::file_ext(list.files(tempdir))))) == 0) {
      if (max(grepl("sas", tools::file_ext(list.files(tempdir))))==0) {
        error <- paste0(error, ifelse(error == "", "", "<br/>"), "ERROR:上传的EDC文件中找不到用于转换的sas文件")
      }
      if (max(grepl("txt", tools::file_ext(list.files(tempdir))))==0) {
        error <- paste0(error, ifelse(error == "", "", "<br/>"), "ERROR:上传的EDC文件中找不到txt文件")
      }
      message(error)
      #### check errors end ----
    } else {
      ### uploading part ----
      ## info: no upload ae or cm or subject level file detect
      ## info: cutoff date notice
      #### AE upload ----
      if (length(input$aeupload) == 0) {
        info <- paste0(info, ifelse(info == "", "", "<br/>"), "INFO:检测到未上传不良事件编码文件")
      } else {
        aedata <- reactive({
          req(input$aeupload)
          ext <- tools::file_ext(input$aeupload$name)
          switch(ext,
                 csv = readr::read_csv(input$aeupload$datapath, show_col_types = FALSE),
                 xls = readxl::read_excel(input$aeupload$datapath),
                 xlsx = readxl::read_excel(input$aeupload$datapath),
                 validate("Invalid file; Please upload a .csv, .xls or .xlsx file")
          )
        })
        message(paste0("File ", input$aeupload$name, " loaded."))
      }

      #### EDC txt upload ----
      # Create a Progress object -- progress1 for upload step
      progress1 <- shiny::Progress$new()
      progress1$set(message = "Uploading data...", value = 0)
      # Close the progress when this reactive exits (even if there's an error)
      on.exit(progress1$close())
      updateProgress1 <- function(value = NULL, detail = NULL) {
        if (is.null(value)) {
          value <- progress1$getValue()
          value <- value + (progress1$getMax() - value) / 80
        }
        progress1$set(value = value, detail = detail)
      }
      ## use readEDCtxt to read txt files
      edc_data <- reactive({
        req(input$edcupload)
        data1 <- readEDCtxt(tempdir, updateProgress=updateProgress1)
        if (class(data1) != "character") {
          if (length(input$aeupload) != 0) {data1$AE_CODING <- aedata()}
        }
        return(data1)
      })

      if (class(edc_data()) == "character") {
        ## add success note and shinyalert
        error <- paste0("ERROR: 无法读入数据,请检查上传的EDC数据文件。")
        shinyalert::shinyalert(title= "Upload Fail", text = "无法成功读入数据,请检查上传的EDC数据文件",
                               type = "error",
                               confirmButtonText = "OK", confirmButtonCol = "#93c54b"
                               #confirmButtonCol = "#43C6DB"
        )
        message(error)
      } else {
        ## add success note and shinyalert
        note1 <- paste0("NOTE: ", length(names(edc_data())), "个数据文件载入成功。")
        # 包含:",
        # paste(names(edc_data()), collapse = ", "), "")
        shinyalert::shinyalert(title= "Upload Finish", text = paste0(length(names(edc_data())), "个数据文件已载入"),
                               type = "success", inputId = "edc_upload_done",
                               confirmButtonText = "Next", confirmButtonCol = "#93c54b"
                               #confirmButtonCol = "#43C6DB"
        )
      }
    }
  }


Part2: 数据处理

该部分涉及到ADaM-like格式的中间数据生成,需要依赖于spec进行编写,spec文件将以adxx.json储存,并参照【00-使用{Golem}搭建Rshiny project指南-Part3:添加内置数据】板块内容,转化为RData格式文件及内置数据进行调用,每个独立的adam使用独立的gen_adxx.R函数进行生成。

1. adxx.json格式要求

  • 参考GitLab库中提供的文件:adsl.JSON,adae.JSON
  • 案例如下:
    {
    "datasetName": "ADSL",
    "variables": {
        "SUBJID": { #变量名
          "name": "SUBJID",
          "description": "Subject Identifier for the Study", #变量label
          "type": "Character", #变量类型
          "codelist": [],
          "required": "Y", #是否必须生成,如=Y则会检查rawDataDependency提到的所有变量required=Y的变量在原始数据中是否存在,若不存在会跳出adam生成步骤并提示缺失xx变量无法生成
          "rawDataDependency": [
              {
                  "datasetName": "SUBJECT",
                  "variableName": "SUBJID",
                  "required": "Y" #一般建议在对于能1-1直接对应的数据,均填写为required=Y
              }
          ],
          "adamDataDependency": [],
          "derivation": "=SUBJECT.SUBJID",
          "comment": ""
        },
        "TRTSDT": {
          "name": "TRTSDT",
          "description": "Date of First Exposure to Treatment",
          "type": "Date",
          "codelist": [],
          "required": "Y",
          "rawDataDependency": [ 
              {
                  "datasetName": "EX",
                  "variableName": "EXDSTXT",
                  "required": "N" #这里填写为required=N,是因为会从所有EX开头的数据中抓取对应变量,无法固定数据集名称,无法进行准确核查,故跳过
              },
              {
                  "datasetName": "EX",
                  "variableName": "EXTRT",
                  "required": "N"
              },
              {
                  "datasetName": "EX",
                  "variableName": "EXSTDAT",
                  "required": "N"
              },
              {
                  "datasetName": "EX",
                  "variableName": "EXENDAT",
                  "required": "N"
              }
          ],
          "adamDataDependency": [],
          "derivation": "=min(of EX.EXSTDAT, EX.EXENDAT) with EX.EXDSTXT>0",
          "comment": "extract from all datasets start with EX"
        }
      }
    }
    
  • json需转换为可调用RData格式,参考:read_adam_json.R
  • 该代码会将文件输出为3个文件:
  • adamlike-spec.RData为在内置数据以及gen_adxx函数中直接调用格式。
  • adamlike-spec-df.RData和final-adamlike-spec-v1.xlsx为转化为类似常用spec的格式, 便于开发过程中自行查阅。

2. adxx生成函数代码及相关宏

  • 参考GitLab库中提供的文件:fct_gen_adsl.R,fct_gen_adae.R
  • 在生成前需要调用utils_var_dependency.R中check_dependency_exist函数进行对原始数据dependency check,参考代码:
    check_note <- check_dependency_exist(data = edc_data, spec = domain_spec, genadam = i)
    if (check_note == paste0(toupper(i), ": Pass dependency check")) {
      gen_data <- gen_adae(xxxx)
    }
    

模板及思路解读

  • 错误处理部分:如发生错误返回error并识别
    gen_adxx <- function(data, spec, cutoffdate = Sys.Date()) {
      tryCatch(
        { message("ADXX gen start")
          ... ##中间部分为生成数据集的相关代码
          message("ADXX gen success.")
          return(ADXX)
        },
        error=function(e) {
          message('Error: Fail to generate ADSL (code issue)')
          return("error")
        }
      )
    }
    
  • 第一部分:​数据提取
  • 从 spec 中提取变量依赖关系(数据集名、变量名)。
  • 从原始数据中按依赖关系提取变量。
  • 简单对应关系参考:
    ## STEP1: extract related data dependency information to use ----
    raw <- extract_dependency(spec = spec)
    
    ## STEP2: extract MHC start and gen ----
    ## extract raw data name startwith MHC
    need_ds_mh <- names(data)[startsWith(names(data), unique(raw$datasetName))]
    need_ds <- need_ds_mh[1]
    admhc1 <- data[[need_ds]] %>%
      select(any_of(raw$variableName))
    ## rename
    for (i in nrow(raw):1) {
      names(admhc1)[names(admhc1) == raw$variableName[i]] = raw$name[i]
    }
    
  • ​特殊处理: 对多选项变量(如 AEACNx/AERELx),或基于多个数据集想同变量,需进行额外处理提取。

  • 第二部分:​数据合并

  • 可以尽量避免数据合并,比如ADPR可以根据EDC收录分为ADPRLT,ADPRRT,ADPRTSURG分别生成,在数据处理阶段合并。
  • 对于较大的数据集,比如LB的处理,也可以仅提取出部分需要的PARAM使用,根据变量命名Domain,如ADLB_HBA1C。
  • ​横向合并: 对同一受试者的多源数据使用full_join/left_join等进行合并(如 ADSL 合并人口学、筛选、治疗分组等)
  • 纵向合并: 对同一类数据跨数据集合并,如EXx,需处理变量类型一致性,必要时可提前进行格式转换:mutate(across(everything(), as.character))。

  • 第三部分:​变量衍生

  • ​直接转换: 日期格式处理(lubridate::ymd)、逻辑标志生成(如 SCRNFFL = ifelse(...))
  • ​复杂逻辑: > 对于不确定原始变量是否收集的衍生变量, 需提前做好识别,并提供存在时的处理方法以及不存在时的处理方法
    • 类型转换: 按照spec中的type对变量类型进行处理,同时注意中英文,以及多种收集规范的存在。如字符型/数值型互转(如 AETOXGR)、Yes是/No否 统一为 Y/N。
    • ​日期填补: 缺失日期用参考日期填补(如 AE 开始日期用治疗开始日期补全)
      ## STEP3: AESTDT / missing date imputation [SAP rule]----
      adae2 <- adae1 %>% left_join(adsl %>% select(SUBJID, TRTSDT, TRTEDT, DTHDT) %>% clear_labels(), by="SUBJID") %>%
        mutate(
          AESTDT_Y = suppressWarnings(as.numeric(str_split_i(AESTDTC, "-", i=1))),
          AESTDT_M = suppressWarnings(as.numeric(str_split_i(AESTDTC, "-", i=2))),
          AESTDT_D = suppressWarnings(as.numeric(str_split_i(AESTDTC, "-", i=3))),
          AESTDT = case_when(
            ## full missing or missing part = TRTSDT
            !is.na(AESTDTC) & is.na(AESTDT_Y) ~ TRTSDT,
            AESTDT_Y == lubridate::year(TRTSDT) & is.na(AESTDT_M) & is.na(AESTDT_D) ~ TRTSDT,
            AESTDT_Y == lubridate::year(TRTSDT) & AESTDT_M == lubridate::month(TRTSDT) & is.na(AESTDT_D) ~ TRTSDT,
            ## others, use 1
            !is.na(AESTDTC) & is.na(AESTDT_M) ~ lubridate::make_date(year = AESTDT_Y, month = 1, day = 1),
            !is.na(AESTDTC) & is.na(AESTDT_D) ~ lubridate::make_date(year = AESTDT_Y, month = AESTDT_M, day = 1),
            !is.na(AESTDTC) ~ lubridate::make_date(year = AESTDT_Y, month = AESTDT_M, day = AESTDT_D)
          )
        )
      
    • ​多选项合并1: 如 RELGR1 合并同一个数据集多个相关性变量,注意需要同时考虑中英文匹配,以及多种收集规范的存在。
      ## STEP5: RELGR1 RELGR1N ----
        relat_pattern <- c('Related', 'Possibly Related', 'Unassessable',
                          "肯定有关", "可能有关",  "无法判定",
                          ## new rules new text
                          "Definitely Related", "Probably Related", "很可能有关")
        adae4 <- adae3 %>%
          tidyr::unite(AEREL_all, starts_with("AEREL"), remove = F, sep = ",", na.rm = T) %>%
          mutate(
            AEREL_all = paste0(",", AEREL_all, ","),
            RELGR1 = case_when(
              as.numeric(str_detect(AEREL_all, ",Related,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",Possibly Related,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",Unassessable,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",肯定有关,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",可能有关,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",无法判定,")) == 1 ~ "RELATED",
              ## new rules new text
              as.numeric(str_detect(AEREL_all, ",Definitely Related,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",Probably Related,")) == 1 ~ "RELATED",
              as.numeric(str_detect(AEREL_all, ",很可能有关,")) == 1 ~ "RELATED",
              .default = "UNRELATED"
            ),
            RELGR1N = ifelse(RELGR1 == "RELATED", 1, 0)
          )
      
    • ​多选项合并2: 合并多个数据集中同一个变量进行衍生如TRTSDT、TRTEDT,如需区分,建议保留label或页面等信息作为label。在处理中需要注意是否提前进行数据截取操作。
      #### TRTSDT TRTEDT -- from EXs ----
      ex_raw <-  raw %>% filter(name == "TRTSDT")
      ex_ds <- data[grep("EX", names(data))]
      need_ex_var <- c(subjid, ex_raw$variableName) #"SUBJID" "EXDSTXT" "EXTRT"   "EXSTDAT" "EXENDAT"
      ex_all <- select(ex_ds[[1]], any_of(need_ex_var)) %>% clear_labels() %>% mutate(across(everything(), as.character))
      if (length(ex_ds) > 1){
        for (i in 2:length(ex_ds)) {
          ex_all <- bind_rows(ex_all,
                              select(ex_ds[[i]], any_of(need_ex_var)) %>% clear_labels() %>% mutate(across(everything(), as.character)))
        }
      }
      ### variable rename -- cannot use the original name, thus needed the rename step
      for (n in 1:length(ex_all)) {
        if (n==1) names(ex_all)[names(ex_all) == need_ex_var[n]] <- "id"
        if (n==2) names(ex_all)[names(ex_all) == need_ex_var[n]] <- "EXDOSE"
        if (n==3) names(ex_all)[names(ex_all) == need_ex_var[n]] <- "EXTRT"
        if (n==4) names(ex_all)[names(ex_all) == need_ex_var[n]] <- "EXSTDAT"
        if (n==5) names(ex_all)[names(ex_all) == need_ex_var[n]] <- "EXENDAT"
      }
      attr(ex_all$id,"label") <- NULL
      
      ex_all1 <- ex_all %>% filter(EXDOSE != 0 & EXSTDAT <= cutoffdate) ##### CUTOFF2 - EX ----
      ex_all2 <- ex_all1
      if ("EXENDAT" %in% names(ex_all)) {
        ex_all2 <- ex_all1 %>%
          mutate(EXENDAT = ifelse(EXENDAT>cutoffdate, cutoffdate, EXENDAT))
      } else {
        ex_all2 <- ex_all1 %>%
          mutate(EXENDAT = EXSTDAT)
      }
      dm5_1 <- ex_all2 %>%
        group_by(id) %>%
        summarise(TRTSDT = min(lubridate::ymd(EXSTDAT), lubridate::ymd(EXENDAT), na.rm = T),
                  TRTEDT = max(lubridate::ymd(EXSTDAT), lubridate::ymd(EXENDAT), na.rm = T)) %>%
        select(id, TRTSDT, TRTEDT)
      
  • 第四部分:截断
  • ​时间截断: 基于部门数据截取规范。cutoffdate来源于平台交互的用户输入信息,需注意在写function时留出。
    gen_adxx <- function(data, spec, cutoffdate = Sys.Date()) {
      ...
      ## cutoff ----
      suppressWarnings(
        adxx <- adxx %>%
          filter(ymd(XXSTDTC) <= ymd(cutoffdate) | is.na(ymd(XXSTDTC))) %>%
          mutate(XXENDTC = ifelse(XXENDTC>as.character(cutoffdate), NA, XXENDTC))
      )
    }
    
  • 第五部分:标签处理&输出
  • ​标签处理: 使用 Hmisc::label 添加变量描述,继承spec中标签
  • ​变量选择: 按spec保留所需变量
  • ​排序输出: 按指定顺序进行排序
    ## FINAL OUTPUT & LABEL ----
    dm_final <- dm6_5 %>%
      select(any_of(all_var2)) %>%
      arrange(SUBJID)
    ### label and add eot label
    specdf <- transjson_todf(spec)
    dm_label <- c(specdf$description)
    names(dm_label) <- specdf$name
    final_lable <- c(dm_label, eot_label2, rfs_label)
    Hmisc::label(dm_final) <- as.list(final_lable[match(names(dm_final), names(final_lable))])
    
    注意事项:
  • label会影响数据合并等操作,在label不重要时可清除label属性。
  • 在变量衍生时,需注意同时满足中英文的匹配。
  • 在本地基于多个项目的EDC数据进行生成测试,保证adam数据生成质量以及不同项目兼容性。
  • 当存在一些可无视的warning输出时,可选择用suppressWarnings()。
  • 所有数值型变量衍生时,需注意是否要处理Inf/-Inf值。