04-常见Rpackage介绍¶
主要介绍SHR-A1811-206项目中推荐使用到的package及相关function
安装R包¶
## 常用方法&在本地电脑方法
install.packages("dplyr")
## 远程桌面方法(无法联网,只能从指定路径安装)
options(repos = c(CRAN = "https://nexus.hengrui.com/repository/r-proxy-cran/"))
install.packages("dplyr")
## 本地下载包安装方式
install.packages("C:/Users/wangjy35/Downloads/renv_1.1.8.tar.gz")
spec相关:metacore, metatools¶
{metacore}设置并读取spec和codelist¶
{metacore}:https://atorus-research.github.io/metacore/
{metatools}调用spec¶
{metatools}:https://pharmaverse.github.io/metatools/
本项目中,{metatools}中大部分相关内容已打包至相关function中,详细参考03-内置funcion介绍。
以下是一些常见的function,比如sort_by_key(),可以直接通过读取spec的ds_vars的排序变量顺序,对当前数据进行排序,避免在代码中手动赋值。
add_labels() ## Apply labels to multiple variables on a data frame
add_variables() ## Add Missing Variables
build_from_derived() ## Build a dataset from derived
build_qnam() ## Build the observations for a single QNAM
check_ct_col() ## Check Control Terminology for a Single Column
check_ct_data() ## Check Control Terminology for a Dataset
check_unique_keys() ## Check Uniqueness of Records by Key
check_variables() ## Check Variable Names
combine_supp() ## Combine the Domain and Supplemental Qualifier
convert_var_to_fct() ## Convert Variable to Factor with Levels Set by Control Terms
create_cat_var() ## Create Categorical Variable from Codelist
create_subgrps() ## Create Subgroups
create_var_from_codelist() ## Create Variable from Codelist
drop_unspec_vars() ## Drop Unspecified Variables
get_bad_ct() ## Gets vector of control terminology which should be there
make_supp_qual() ## Make Supplemental Qualifier
metatools_example() ## Get path to pkg example
order_cols() ## Sort Columns by Order
remove_labels() ## Remove labels to multiple variables on a data frame
set_variable_labels() ## Apply labels to a data frame using a metacore object
sort_by_key() ## Sort Rows by Key Sequence
数据处理:dplyr, tidyr, stringr, lubridate¶
{dplyr}:https://dplyr.tidyverse.org/
{tidyr}:https://tidyr.tidyverse.org/
{stringr}:https://stringr.tidyverse.org/
{lubridate}:https://lubridate.tidyverse.org/
1. 数据处理¶
1.1 基础操作¶
SAS
data new_data;
set mydata;
new_var = old_var * 2;
new_var2 = ifc(new_var > 10, "大", "小");
if var1 > 100 and var2 = 'A' then output;
/* 使用if-then-else */
length status $10;
if score >= 90 then status = 'Excellent';
else if score >= 80 then status = 'Good';
else if score >= 60 then status = 'Pass';
else status = 'Fail';
keep var1 var2 new_var;
drop var3 var4;
run;
proc sort data=mydata;
by descending var1 var2;
run;
R
new_data <- mydata %>%
mutate(new_var = old_var * 2, # 创建新变量
new_var2 = ifelse(new_var > 10, "大", "小"), # ifc和ifn在R中可统一用ifelse
status = case_when( # 多个条件判断可用case_when
score >= 90 ~ "Excellent",
score >= 80 ~ "Good",
score >= 60 ~ "Pass",
TRUE ~ "Fail"
)
) %>%
filter(var1 > 100, var2 == "A") %>% # 条件筛选
select(var1, var2, new_var) %>% # 选择/保留变量
select(-var3, -var4) %>% # 删除变量
arrange(desc(var1), var2) # 降序排列用desc()
1.2 数据类型:缺失值,重复值,字符数值转换¶
SAS
R
## 缺失值判断
is.na(var) # 注意na是数据为空,但有些情况中字符型数据为""时,不属于NA,判断时需特别注意
## 处理重复值
deduped <- mydata %>%
distinct() # 删除完全重复的行
deduped_by_id <- mydata %>%
distinct(id_var) # 删除基于关键变量的重复行,当提供具体变量时,数据集中仅保留指定变量,类似于select(xxx) %>% distinct()
unique(mydata$vars) # 查看变量中不重复值
## 数值转换
converted <- mydata %>%
mutate(
num_var = as.numeric(char_var), # 字符转数值
char_var2 = as.character(num_var), # 数值转字符
factor_var = as.factor(category_var) # 因子转换
)
2. 分组汇总 - group_by & summarise¶
group_summary <- mydata %>%
group_by(group_var) %>%
summarise(
n = n(),
mean_var1 = mean(var1, na.rm = TRUE),
sd_var1 = sd(var1, na.rm = TRUE),
median_var1 = median(var1, na.rm = TRUE),
min_var1 = min(var1, na.rm = TRUE),
max_var1 = max(var1, na.rm = TRUE)
) %>%
ungroup() ## 在使用完group_by后一定要注意使用ungroup,避免影响后续其他计算
mydata %>%
group_by(group_var) %>%
mutate(
lag_value = lag(value_var), # 前一行
lead_value = lead(value_var) # 后一行
) %>%
ungroup()
3. 数据连接与转置¶
## 横向合并
merged <- inner_join(data1, data2, by = "key_var")
left_joined <- left_join(data1, data2, by = "key_var")
right_joined <- right_join(data1, data2, by = "key_var")
full_joined <- full_join(data1, data2, by = "key_var")
## 纵向合并,类似set操作
appended <- bind_rows(data1, data2, data3)
## 数据转置(用到tidyr包)
long <- wide %>%
tidyr::pivot_longer(
cols = c(var1, var2, var3),
names_to = "variable",
values_to = "value"
)
wide_data <- long %>%
tidyr::pivot_wider(
names_from = variable,
values_from = value
)
4. 字符串处理 - stringr包¶
SAS
data string_ops;
set mydata;
len = length(string_var); /* 字符长度 */
first_3 = substr(string_var, 1, 3); /* 提取前3个字符 */
pos = index(string_var, "search_text"); /* 查找位置 */
new_str = tranwrd(string_var, "old", "new"); /* 替换 */
new_str2 = scan(string_var, 1, ","); /* 提取逗号前内容 */
upper_str = upcase(string_var); /* 转大写 */
lower_str = lowcase(string_var); /* 转小写 */
proper_str = propcase(string_var); /* 首字母大写 */
left_trim = left(string_var); /* 去除左侧空格 */
right_trim = right(string_var); /* 去除右侧空格 */
both_trim = strip(string_var); /* 去除两侧空格 */
full_name = cat(first_name, " ", last_name); /* 连接字符串 */
full_name2 = catx(" ", first_name, last_name); /* 自动处理分隔符 */
/* 分割字符串 */
length part1 part2 part3 $50;
part1 = scan(string_var, 1, ",");
part2 = scan(string_var, 2, ",");
part3 = scan(string_var, 3, ",");
run;
R
mydata %>%
mutate(len = str_length(string_var),# 字符串长度
first_3 = str_sub(string_var, 1, 3), # 提取前3个字符
last_3 = str_sub(string_var, -3) # 提取最后3个字符
pos = str_locate(string_var, "search_text")[, "start"], # 查找位置
has_pattern = str_detect(string_var, "pattern"), # 检测是否包含模式
extracted = str_extract(string_var, "[0-9]+"), # 提取匹配内容
new_str = str_split_i(string_var, ",", i = 1), # 提取逗号前内容
new_str = str_replace_all(string_var, "old", "new"), # 替换(所有出现)
new_str_first = str_replace(string_var, "old", "new"), # 替换(只针对第一次出现)
upper_str = str_to_upper(string_var), # 转大写
lower_str = str_to_lower(string_var), # 转小写
title_str = str_to_title(string_var), # 标题格式
trim_both = str_trim(string_var), # 去除两侧空格
trim_left = str_trim(string_var, "left"), # 去除左侧空格
trim_right = str_trim(string_var, "right"), # 去除右侧空格
full_name = str_c(first_name, " ", last_name), # 连接字符串
full_name = paste0(first_name, " ", last_name) # 连接字符串
) %>%
tidyr::separate( # 分割为多列(用到tidyr包)
string_var,
into = c("part1", "part2", "part3"),
sep = ",",
extra = "merge", # 处理多余部分
fill = "right" # 处理不足部分
)
5. 日期处理¶
5.1 日期转换¶
SAS
5.2 日期提取¶
SAS
5.3 日期计算¶
SAS
data date_calc;
set mydata;
/ 加减天数 */
next_day = date_var + 1;
prev_day = date_var - 1;
/* 加减月份 */
next_month = intnx('month', date_var, 1);
prev_month = intnx('month', date_var, -1);
/* 日期差 */
days_diff = date2 - date1;
months_diff = intck('month', date1, date2);
years_diff = intck('year', date1, date2);
run;
R
mydata <- mydata %>%
mutate(
# 加减时间
next_day = date_var + days(1),
prev_day = date_var - days(1),
next_week = date_var + weeks(1),
next_month = date_var + months(1),
next_year = date_var + years(1),
# 到月末/月初
month_start = floor_date(date_var, "month"),
month_end = ceiling_date(date_var, "month") - days(1),
# 日期差
days_diff = as.numeric(date2 - date1), ## 这里的date2和date1需为Date格式,而非字符型
weeks_diff = interval(date1, date2) / weeks(1)
)
数据读入和输出¶
1. 读入SAS数据:haven包¶
{haven}:https://haven.tidyverse.org/
2. 输出SAS数据:xportr包¶
3. EXCEL数据¶
{readxl}:https://readxl.tidyverse.org/ {openxlsx}:https://ycphs.github.io/openxlsx/ 注意:readxl包只能读入数据,无法写出数据,如果需要写出数据到excel文件,可以使用openxlsx包
mydata <- readxl::read_excel("/path/to/input.xlsx", sheet = "Sheet1", # 指定工作表
col_names = TRUE,
na = c("", "NA"))
mydata <- readxl::read_excel("/path/to/input.xlsx", col_types = "text") # 直接将所有列读入为字符型,避免readxl自动判断数据类型导致的错误
mydata <- readxl::read_excel("/path/to/input.xlsx", guess_max = Inf) # 增加guess_max参数,默认值为1000,增加后可以让readxl查看更多行数据来判断数据类型,避免前面几行数据类型单一导致的错误判断
mydata <- metacore::read_all_sheets("/path/to/input.xlsx") # 批量读入,内部调用的也是read_excel(., col_types = "text")
mydata <- openxlsx::read.xlsx("/path/to/input.xlsx", sheet = "Sheet1",
colNames = TRUE, na.strings = c("", "NA")) # openxlsx的读入方法,功能和readxl类似,速度更快
openxlsx::write.xlsx(mydata, "/path/to/output.xlsx")
4. CSV数据¶
{readr}:https://readr.tidyverse.org/
mydata <- read.csv("/path/to/input.csv", header = TRUE, na.strings = c("", "NA")) # base R的读入方法 mydata <- readr::read_csv("/path/to/input.csv", col_names = TRUE, na = c("", "NA")) # readr包的读入方法 write.csv(mydata, "/path/to/output.csv", row.names = FALSE, na = "") # base R的写出方法,na=""可以将R中的NA写出为空字符串,避免在csv中出现NA字符 readr::write_csv(mydata, "/path/to/output.csv", na = "") # readr包的写出方法,功能和write.csv类似,速度更快
5. r数据格式¶
# 保存单个对象
saveRDS(mydata, file = "/path/to/output.rds") # 保存为rds格式
mydata <- readRDS("/path/to/output.rds") # 读取rds格式
# 保存多个对象
save(mydata, file = "/path/to/output.RData") # 保存为RData格式
load("/path/to/output.RData") # 读取RData格式
数据对比:diffdf¶
diffdf::diffdf(df1, df2)
## 显示数据集中比对差异
diff <- diffdf::diffdf(df1, df2, suppress_warnings = TRUE)
diffdf::diffdf_issuerows(df1, diff)
diffdf::diffdf_issuerows(df2, diff)
一些其他用法¶
1. 循环¶
# FOR循环
for(i in 1:10) {
squared <- i^2
print(paste(i, "squared is", squared))
}
# WHILE循环
i <- 1
while(i <= 10) {
squared <- i^2
print(paste(i, "squared is", squared))
i <- i + 1
}
2. 条件执行¶
# if语句
x <- 10
if(x > 5) {
print("x is greater than 5")
} else if(x == 5) {
print("x is equal to 5")
} else {
print("x is less than 5")
}
# switch语句
day <- "Monday"
result <- switch(day,
"Monday" = "Start of the week",
"Friday" = "End of the week",
"Weekend")
print(result)
进阶推荐¶
apply, lapply, sapply等函数,适合对数据框的每一列或每一行进行操作,替代使用for循环¶
purrr: map系列函数,适合对列表或数据框的每一行进行操作,替代使用for循环¶
admiral: CDISC数据处理包,包含很多针对CDISC数据处理的function¶
https://pharmaverse.github.io/admiral/cran-release/index.html