1)str_length() 统计字符串长度
str_length("hello world")
[1] 11 #这个字符串的长度(共11个字符),空格和符号也计算在内
2)str_split() 拆分字符串
str_split("hello world"," ")
[[1]]
[1] "hello" "world"
⚠️注意:str_split的第二个参数,写你想用来分割的符号。上面代码中"hello world"的分隔符是空格,因此输入" ",同样也可以是其他符号。
> y = c("jimmy 150","nicker 140","tony 152")
> str_split(y," ")
#按照空格进行拆分,会变成list格式
[[1]]
[1] "jimmy" "150"
[[2]]
[1] "nicker" "140"
[[3]]
[1] "tony" "152"
> str_split(y,",")
#按照逗号进行拆分
[[1]]
[1] "jimmy 150"
[[2]]
[1] "nicker 140"
[[3]]
[1] "tony 152"
> z<-str_split(y," ",simplify = T)
#加上simplify参数后,输出的结果变成矩阵
> z
[,1] [,2]
[1,] "jimmy" "150"
[2,] "nicker" "140"
[3,] "tony" "152"
> class(z)
[1] "matrix" "array"
3)str_sub() 按位置取字符串
x <- "The birch canoe slid on the smooth planks."
str_sub(x,5,9) #取x字符串第五到第九位
[1] "birch"
4)str_detect() 查找字符
x2 = str_split(x," ")[[1]];x2
[1] "The" "birch" "canoe" "slid"
[5] "on" "the" "smooth" "planks."
str_detect(x2,"h")
[1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE
[8] FALSE
根据搜索的内容会返回true or false的值
5)str_replace() / str_replace_all() 字符替换
x2
[1] "The" "birch" "canoe" "slid"
[5] "on" "the" "smooth" "planks."
str_replace(x2,"o","A") #同一个字符串只替换了一次
[1] "The" "birch" "canAe" "slid"
[5] "An" "the" "smAoth" "planks."
str_replace_all(x2,"o","A") #全部替换
[1] "The" "birch" "canAe" "slid"
[5] "An" "the" "smAAth" "planks."
6) str_remove() / str_remove_all () 字符删除
> x
[1] "The birch canoe slid on the smooth planks."
> str_remove(x,"o")
#只删一个字符
[1] "The birch cane slid on the smooth planks."
> str_remove_all(x,"o")
[1] "The birch cane slid n the smth planks."
#数据处理
test <- iris[c(1:2,51:52,101:102),]
rownames(test) =NULL # 去掉行名,NULL是“什么都没有”
test
1)arrange() 排序
library(dplyr)
arrange(test, Sepal.Length) #从小到大
arrange(test, desc(Sepal.Length)) #从大到小
2)distinct()去重复
distinct(test,Species,.keep_all = T) #把Species列的重复去掉
3)数据框新增一列
mutate(test, new = Sepal.Length * Sepal.Width)
test$new<-test$Sepal.Length*test$Sepal.Width
‼️‼️管道符的妙用-----%>%
x = iris %>%
filter(Sepal.Width>3) %>%
select(Sepal.Length,Sepal.Width)%>%
arrange(Sepal.Length)
对iris数据集依次进行:筛选出Sepal.Width大于3的行——选取Sepal.Length和Sepal.Width两列——按Sepal.Length从小到大排序,最后把结果赋值给x
1)if 条件语句
基本格式:
if (i>1) {
  print("+")
} else {
  print("-")
}
2)‼️重点函数:ifelse()
ifelse(x,yes,no)
x:逻辑值或逻辑向量
yes:逻辑值为true时的返回值
no:逻辑值是false时的返回值
x = rnorm(3)
x
[1] 0.9616716 -0.1292150 1.7251983
ifelse(x>0,"+","-")
[1] "+" "-" "+"
⚠️ifelse()和str_detect()函数连用的超牛用途
samples = c("tumor1","tumor2","tumor3","normal1","normal2","normal3")
k1 = str_detect(samples,"tumor");k1
[1] TRUE TRUE TRUE FALSE FALSE FALSE
ifelse(k1,"tumor","normal")
[1] "tumor" "tumor" "tumor" "normal" "normal"
[6] "normal"
(以后可以方便对数据进行分组)
3)多个条件
i=0
if (i>0) {
  print("+")
} else if (i==0) {
  print("0")
} else if (i<0) {
  print("-")
}
[1] "0"
#当然也可以使用ifelse函数
ifelse(i>0,'+',ifelse(i<0,'-','0'))
[1] "0"
4)for循环
#元素循环
x <- c(5,6,0,3)
s=0
for (i in x)
{
s=s+i
print(c(i,s))
}
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
#下标循环
x <- c(5,6,0,3)
s = 0
for (i in 1:length(x)){
s=s+x[[i]]
print(c(x[[i]],s))
}
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
#储存结果
s = 0
(上面s已经变成14,重新设置一下)
result = list()
for(i in 1:length(x)){
s=s+x[[i]]
result[[i]] = c(x[[i]],s)
}
result
[[1]]
[1] 5 5
[[2]]
[1] 6 11
[[3]]
[1] 0 11
[[4]]
[1] 3 14
1)apply(x,margin,function)
其中x代表数据框或矩阵,margin=1代表行;margin=2代表列;function表示对行或列采取的函数
test<- iris[1:6,1:4]
apply(test, 2, mean)
#计算出每列的平均值
Sepal.Length Sepal.Width Petal.Length
Petal.Width
4.9500000 3.3833333 1.4500000
0.2333333
2)对列表中的元素进行操作的 lapply()
test <- list(x = 36:33,y = 32:35,z = 30:27);test
$x
[1] 36 35 34 33
$y
[1] 32 33 34 35
$z
[1] 30 29 28 27
lapply(test,mean)
$x
[1] 34.5
$y
[1] 33.5
$z
[1] 28.5
3)简化的隐式循环
(由于lapply输出的格式也是列表不便于观看,因此可以使用sapply函数)
sapply(test,mean)
#输出形式是带名字的向量(每个元素的结果长度相同且大于1时才会简化成矩阵)
x y z
34.5 33.5 28.5
数据设置
test1 <- data.frame(name = c('jimmy','nicker','Damon','Sophie'), blood_type = c("A","B","O","AB"))
test1
name blood_type
1 jimmy A
2 nicker B
3 Damon O
4 Sophie AB
test2 <- data.frame(name = c('Damon','jimmy','nicker','tony'),
group = c("group1","group1","group2","group2"),
vision = c(4.2,4.3,4.9,4.5))
test2
name group vision
1 Damon group1 4.2
2 jimmy group1 4.3
3 nicker group2 4.9
4 tony group2 4.5
> library(dplyr)
1)inner_join() 根据共同的列名取交集再合并
inner_join(test1,test2,by="name")
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
2)left_join() 左连接(保留左边数据所有,相同的会被合并,空的数据为NA)
left_join(test1,test2,by="name")
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
4 Sophie AB <NA> NA
3)right_join( )右连接 (保留右边数据所有,相同的会被合并,空的数据为NA)
right_join(test1,test2,by="name")
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
4 tony <NA> group2 4.5
4)full_join() 全连接(保留两个数据的所有行,按共同列连接,空的数据为NA)
full_join(test1,test2,by="name")
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
4 Sophie AB <NA> NA
5 tony <NA> group2 4.5
5)semi_join ()半连接 (从前一个数据中选出在后一个数据里有匹配的行,只保留前一个数据的列)
semi_join(test1,test2,by="name")
name blood_type
1 jimmy A
2 nicker B
3 Damon O
6)anti_join( ) 反连接 (输出前一个数据中在后一个数据里没有匹配的行)
anti_join(test1,test2,by="name")
name blood_type
1 Sophie AB
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。