我有如下所示的数据框架
df<- structure(list(s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"
), class = "factor"), s2 = structure(1:3, .Label = c("2-4", "3-15",
"7-16"), class = "factor")), .Names = c("s1", "s2"), row.names = c(NA,
-3L), class = "data.frame")
如下所示
> df
# s1 s2
#1 3-4 2-4
#2 4-1 3-15
#3 5-4 7-16
我想要做的是首先搜索并找到那些类似的值,例如,这里的4位于s1的第一行、第一行s2和第三行s1中。
-The第二列显示找到这些值的次数。 -The第三列显示其中多少来自df的第一列。 -The第四列显示其中多少来自df的第二列。 -The第五条是来自第一列的字符串 -The第六行是来自第二列的字符串。
输出如下所示
Value repeated s1N s1N ss1 ss2
4 3 2 1 3,5 2
1 1 1 - 4 -
15 1 - 1 - 3
16 1 - 1 - 7
发布于 2016-03-01 17:46:22
令人惊讶的棘手问题。将其分解为几个逻辑步骤是很好的:
## 1: split into (val,ss) pairs, and capture ci (column index) association
res <- setNames(do.call(rbind,lapply(seq_along(df),function(ci)
do.call(rbind,lapply(strsplit(as.character(df[[ci]]),'-'),function(x)
data.frame(x[2L],x[1L],ci,stringsAsFactors=F)
))
)),c('val','ss','ci'));
res;
## val ss ci
## 1 4 3 1
## 2 1 4 1
## 3 4 5 1
## 4 4 2 2
## 5 15 3 2
## 6 16 7 2
## 2: aggregate ss (joining on comma) by (val,ci), and capture record count as n
res <- do.call(rbind,by(res,res[c('val','ci')],function(x)
data.frame(val=x$val[1L],ci=x$ci[1L],n=nrow(x),ss=paste(x$ss,collapse=','),stringsAsFactors=F)
));
res;
## val ci n ss
## 1 1 1 1 4
## 2 4 1 2 3,5
## 3 15 2 1 3
## 4 16 2 1 7
## 5 4 2 1 2
## 3: reshape to wide format
res <- reshape(res,idvar='val',timevar='ci',dir='w');
res;
## val n.1 ss.1 n.2 ss.2
## 1 1 1 4 NA <NA>
## 2 4 2 3,5 1 2
## 3 15 NA <NA> 1 3
## 4 16 NA <NA> 1 7
## 4: add repeated column; can be calculated by summing all n.* columns
## note: leveraging psum() from <http://stackoverflow.com/questions/12139431/add-variables-whilst-ignoring-nas-using-transform-function>
psum <- function(...,na.rm=F) { x <- list(...); rowSums(matrix(unlist(x),ncol=length(x)),na.rm=na.rm); };
res$repeated <- do.call(psum,c(res[grep('^n\\.[0-9]+$',names(res))],na.rm=T));
res;
## val n.1 ss.1 n.2 ss.2 repeated
## 1 1 1 4 NA <NA> 1
## 2 4 2 3,5 1 2 3
## 3 15 NA <NA> 1 3 1
## 4 16 NA <NA> 1 7 1
关于NAs,如果您愿意,可以在以后修复它们。但是,我建议正确的n.*
列类型是整数,因为它们表示计数,因此使用'-'
(如您的示例输出)来表示空单元格是不合适的。我建议用零代替。对于ss.*
列来说,破折号很好,因为它们是字符串。以下是你如何做到的:
n.cis <- grep('^n\\.[0-9]+$',names(res));
ss.cis <- grep('^ss\\.[0-9]+$',names(res));
res[n.cis][is.na(res[n.cis])] <- 0L;
res[ss.cis][is.na(res[ss.cis])] <- '-';
res;
## val n.1 ss.1 n.2 ss.2 repeated
## 1 1 1 4 0 - 1
## 2 4 2 3,5 1 2 3
## 3 15 0 - 1 3 1
## 4 16 0 - 1 7 1
发布于 2016-03-01 16:40:13
您需要做的第一件事是从字符串中提取数字。跑步:
newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
在第一行中拆分字符串,并在第二行将它们转换为数值。结果是一个三维矩阵,你可以用它来提取你的值.
首先创建一个新的dataframe:
#length of the columns in the new frame = number of unique values
dflength <- length(unique(array(newdf[2,,])))
dfout <- data.frame(Value=rep(0,dflength),repeated=rep(0,dflength),s1N=rep(0,dflength),s2N=rep(0,dflength),ss1=rep(0,dflength),ss2=rep(0,dflength))
最明显的方法(但可能不是最有效的)是循环并匹配您需要的任何东西:
dfout$Value <- unique(array(newdf[2,,]))
for(i in 1:dflength){
getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout$Value[i])
dfout$repeated[i] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
dfout$s1N[i] <- as.data.frame(table(newdf[2,,1]))$Freq[getID]
if(is.na(dfout$s1N[i])){
dfout$s1N[i] <- 0
}
dfout$s2N[i] <- as.data.frame(table(newdf[2,,2]))$Freq[getID]
if(is.na(dfout$s2N[i])){
dfout$s2N[i] <- 0
}
getID <- which(newdf[2,,1]==dfout$Value[i])
if(length(getID)>0){
dfout$ss1[i] <- toString(newdf[1,,1][getID])
} else {
dfout$ss1[i] <- 0
}
getID <- which(newdf[2,,2]==dfout$Value[i])
if(length(getID)>0){
dfout$ss2[i] <- toString(newdf[1,,2][getID])
} else {
dfout$ss2[i] <- 0
}
}
dfout
# Value repeated s1N s2N ss1 ss2
#1 4 3 2 1 3, 5 2
#2 1 1 1 1 4 0
#3 15 1 0 1 0 3
#4 16 1 0 0 0 7
编辑以循环n个s值
newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
dflength <- length(unique(array(newdf[2,,])))
#find the number of s variables
slength <- length(newdf[1,1,])
#create a matrix of appropriate size
dfout <- matrix(data=NA,nrow=dflength,ncol=(2+2*slength))
#create a (near)-empty names array, we will fill it in later
names <- c("Value","repeated",rep("",2*slength))
#fill in the Values column
dfout[,1] <- unique(array(newdf[2,,]))
#loop for every s variable
for(j in 1:slength){
#get their names, paste N or s and add them to the names array
names[2+j] <- paste(names(df)[j],"N",sep="")
names[2+j+slength] <- paste("s",names(df)[j],sep="")
#loop to get the other values
for(i in 1:dflength){
getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout[i,1])
dfout[i,2] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
dfout[i,2+j] <- as.data.frame(table(newdf[2,,j]))$Freq[getID]
if(is.na(dfout[i,2+j])){
dfout[i,2+j] <- 0
}
getID <- which(newdf[2,,j]==dfout[i,1])
if(length(getID)>0){
dfout[i,2+j+slength] <- toString(newdf[1,,j][getID])
} else {
dfout[i,2+j+slength] <- 0
}
}
}
colnames(dfout)<-names
as.data.frame(dfout)
# Value repeated s1N s2N ss1 ss2
#1 4 3 2 1 3, 5 2
#2 1 1 1 1 4 0
#3 15 1 0 1 0 3
#4 16 1 0 0 0 7
发布于 2016-03-01 18:51:52
我已经根据你的真实数据完全重写了所有的代码,并在我的机器上进行了测试。因为它是一个相当大的数据,它需要一些时间来运行,在我看来,循环是不可避免的。
# function to split the strings
myfun<-function(x){
x<-strsplit(as.character(x), '-')
x1<-unlist(x)
x.new<-as.data.frame(matrix(x1, byrow = T, length(x)))
return(x.new)
}
# this returns a list of dataframes
list.v<-lapply(df[1:dim(df)[2]], myfun)
# like this
head(list.v[[17]])
# try to combine all the dfs, produced an error of mismatching # of columns
df2<-do.call(rbind, list.v)
# some of the dfs in list.v are all NA's, they should be dropped
sum<-summary(list.v)
list.v<-list.v[-which(sum[,1] != "2")] # this excludes those all-NA datafrmes in list.v
# now combine all dfs for indexing purposes
df2<-do.call(rbind, list.v)
# create "value", "repeated" column in the desired result df.
# These codes are same as my previous answer
value<-names(table(df2[,2]))
repeated<-as.vector(table(df2[,2]))
# create an empty list to store the counts columns
list.count<-vector("list", length = length(list.v))
# every df in list.v has same number of rows, get the row number
rownum<-nrow(list.v[[1]])
# use a for loop to fill out list.count
for(i in 0:(length(list.count)-1)){
row.start<-i*rownum+1 # it is kind of tricky here
row.end<-(i+1)*rownum # same as above
list.count[[i + 1]]<-as.vector(table(df2[,2][row.start:row.end]))
}
# combine the vectors in list.count and assing names
count.df<-do.call(cbind, list.count)
count.df<-as.data.frame(count.df)
# create & assign colum names in the format of "s_n", and "_" is filled with corresponding original column name
names.cnt<-character()
for(i in 1:length(names(list.v))){
names.cnt[i]<-paste("s", names(list.v)[i], "n", sep="")
}
names(count.df)<-names.cnt
# this is a very long loop to concatenate the strings and store them into a matrix, but it gets the job done here.
ss.store<-matrix(,nrow = length(value), ncol = length(list.v), byrow = FALSE)
for(i in 1:length(list.v)){
for(j in 1:length(value)){
ss.store[j,i]<-paste(list.v[[i]][,1][which(list.v[[i]][,2] == value[j])], collapse =",")
}
}
# create a df for strings
string.df<-as.data.frame(ss.store, stringsAsFactors = FALSE)
# create & assign names to the df
names.str<-character()
for(i in 1:length(names(list.v))){
names.str[i]<-paste("s", "s", names(list.v)[i], sep="")
}
names(string.df)<-names.str
# combine everything and form the new data frame
new.df<-cbind(value, repeated, count.df, string.df, stringAsFactors = FALSE)
new.df[1:10, 1:15]
value repeated sAn sF1n sF2n sF3n sF4n sF5n sF6n sF7n sF8n sF9n sF10n sF11n sF12n
1 100 155 3 0 0 0 0 0 0 0 0 0 0 0 0
2 1005 14 1 0 0 0 0 0 0 0 0 0 0 0 0
3 1006 50 1 0 0 0 0 0 0 0 0 0 0 0 0
4 1023 1 1 0 0 0 0 0 0 0 0 0 0 0 0
5 1025 38 1 0 0 0 0 0 0 0 0 0 0 0 0
6 1030 624 1 0 1 2 0 0 0 0 0 0 1 0 0
7 1035 1 1 0 0 0 0 0 0 0 0 0 0 0 0
8 104 165 2 0 0 0 0 0 0 0 0 0 0 0 0
9 1076 186 2 0 0 0 0 0 0 0 0 0 0 0 0
10 1078 333 3 0 0 0 0 0 0 0 0 0 0 0 0
https://stackoverflow.com/questions/35726892
复制相似问题