文章/答案/技术大牛

发布

问从数据帧中提取信息
EN

Stack Overflow用户

提问于 2016-03-01 15:30:02

回答 4查看 931关注 0票数 1

我有如下所示的数据框架

# Example input: two factor columns whose cells are "<prefix>-<value>" strings.
# Levels are given explicitly so that their order never depends on the locale.
df <- data.frame(
  s1 = factor(c("3-4", "4-1", "5-4"), levels = c("3-4", "4-1", "5-4")),
  s2 = factor(c("2-4", "3-15", "7-16"), levels = c("2-4", "3-15", "7-16"))
)

如下所示

> df
#   s1   s2
#1 3-4  2-4
#2 4-1 3-15
#3 5-4 7-16

我想要做的是首先搜索并找出那些相同的值，例如，这里的 4 出现在 s1 的第一行、s2 的第一行以及 s1 的第三行中。

- 第二列显示找到这些值的次数。
- 第三列显示其中多少来自 df 的第一列。
- 第四列显示其中多少来自 df 的第二列。
- 第五列是来自第一列的字符串。
- 第六列是来自第二列的字符串。

输出如下所示

Value    repeated     s1N   s2N   ss1    ss2
4           3         2      1    3,5     2
1           1         1      -     4      -
15          1         -      1     -      3
16          1         -      1     -      7

dataframe

回答 4

Stack Overflow用户

回答已采纳

发布于 2016-03-01 17:46:22

令人惊讶的棘手问题。将其分解为几个逻辑步骤是很好的：

## 1: split into (val,ss) pairs, and capture ci (column index) association
## For every column of df, each "ss-val" cell is split on "-"; the part after
## the dash becomes val, the part before it becomes ss, and ci records the
## index of the column the cell came from. One data frame is built per column
## (not per cell) and the per-column frames are stacked with rbind().
res <- do.call(rbind, lapply(seq_along(df), function(ci) {
  parts <- strsplit(as.character(df[[ci]]), "-")
  data.frame(
    val = vapply(parts, `[`, character(1L), 2L),
    ss = vapply(parts, `[`, character(1L), 1L),
    # rep() keeps the frame valid even when the column has no rows
    ci = rep(ci, length(parts)),
    stringsAsFactors = FALSE
  )
}))
res
##   val ss ci
## 1   4  3  1
## 2   1  4  1
## 3   4  5  1
## 4   4  2  2
## 5  15  3  2
## 6  16  7  2

## 2: aggregate ss (joining on comma) by (val,ci), and capture record count as n
## Each (val, ci) group collapses to a single row: n is the number of records
## in the group and ss is the comma-joined list of their ss strings.
res <- do.call(rbind, by(res, res[c("val", "ci")], function(grp) {
  data.frame(
    val = grp$val[1L],
    ci = grp$ci[1L],
    n = nrow(grp),
    ss = paste(grp$ss, collapse = ","),
    stringsAsFactors = FALSE
  )
}))
res
##   val ci n  ss
## 1   1  1 1   4
## 2   4  1 2 3,5
## 3  15  2 1   3
## 4  16  2 1   7
## 5   4  2 1   2

## 3: reshape to wide format
## One row per val; the n and ss columns are spread into n.<ci> and ss.<ci>,
## with NA wherever a val does not occur for that ci.
## Argument names and values are spelled out in full instead of relying on
## partial matching.
res <- reshape(res, idvar = "val", timevar = "ci", direction = "wide")
res
##   val n.1 ss.1 n.2 ss.2
## 1   1   1    4  NA <NA>
## 2   4   2  3,5   1    2
## 3  15  NA <NA>   1    3
## 4  16  NA <NA>   1    7

## 4: add repeated column; can be calculated by summing all n.* columns
## note: leveraging psum() from <http://stackoverflow.com/questions/12139431/add-variables-whilst-ignoring-nas-using-transform-function>
## psum(): element-wise ("parallel") sum of the vectors passed in `...`.
##   ...   : vectors to add up position by position; they are laid out as the
##           columns of a matrix, so they are expected to share one length
##   na.rm : handed to rowSums(); TRUE leaves NAs out of each position's sum
## Returns a numeric vector of per-position sums, or numeric(0) when no
## vectors are supplied.
psum <- function(..., na.rm = FALSE) {
  x <- list(...)
  if (length(x) == 0L) {
    # nothing to add up; matrix() cannot be built from zero inputs
    return(numeric(0L))
  }
  rowSums(matrix(unlist(x), ncol = length(x)), na.rm = na.rm)
}
## repeated = row-wise sum of every n.<ci> count column; NA entries (val absent
## for that ci) are skipped through na.rm. The selected columns are passed to
## psum() as separate arguments via do.call().
res$repeated <- do.call(
  psum,
  c(res[grep("^n\\.[0-9]+$", names(res))], na.rm = TRUE)
)
res
##   val n.1 ss.1 n.2 ss.2 repeated
## 1   1   1    4  NA <NA>        1
## 2   4   2  3,5   1    2        3
## 3  15  NA <NA>   1    3        1
## 4  16  NA <NA>   1    7        1

关于NAs，如果您愿意，可以在以后修复它们。但是，我建议正确的n.*列类型是整数，因为它们表示计数，因此使用'-' (如您的示例输出)来表示空单元格是不合适的。我建议用零代替。对于ss.*列来说，破折号很好，因为它们是字符串。以下是你如何做到的：

# Positions of the count (n.<ci>) and string (ss.<ci>) columns in res.
n.cis <- grep("^n\\.[0-9]+$", names(res))
ss.cis <- grep("^ss\\.[0-9]+$", names(res))
# Counts: a missing entry becomes integer zero.
res[n.cis] <- lapply(res[n.cis], function(col) replace(col, is.na(col), 0L))
# Strings: a missing entry becomes a dash.
res[ss.cis] <- lapply(res[ss.cis], function(col) replace(col, is.na(col), "-"))
res
##   val n.1 ss.1 n.2 ss.2 repeated
## 1   1   1    4   0    -        1
## 2   4   2  3,5   1    2        3
## 3  15   0    -   1    3        1
## 4  16   0    -   1    7        1

票数 1

Stack Overflow用户

发布于 2016-03-01 16:40:13

您需要做的第一件事是从字符串中提取数字。运行：

# Split every "a-b" cell on "-". With two parts per cell the result is a
# character array indexed as [part, row, column].
newdfstring <- apply(
  as.matrix(df),
  c(1, 2),
  function(cell) strsplit(cell, "-")[[1L]]
)
# Same array with its contents converted from character to numeric.
newdf <- newdfstring
storage.mode(newdf) <- "double"

在第一行中拆分字符串，并在第二行将它们转换为数值。结果是一个三维矩阵，你可以用它来提取你的值.

首先创建一个新的dataframe：

# Number of rows of the result = number of distinct values found in the
# second part of the cells.
dflength <- length(unique(as.vector(newdf[2, , ])))
# Zero-filled result frame with one row per distinct value.
dfout <- data.frame(
  Value = numeric(dflength),
  repeated = numeric(dflength),
  s1N = numeric(dflength),
  s2N = numeric(dflength),
  ss1 = numeric(dflength),
  ss2 = numeric(dflength)
)

最明显的方法(但可能不是最有效的)是循环并匹配您需要的任何东西：

# Fill dfout, one row per distinct value.
dfout$Value <- unique(as.vector(newdf[2, , ]))
for (i in seq_len(dflength)) {
  # Count occurrences directly in each slice of newdf. Looking the value up in
  # the table of ALL values and reusing that row number on a per-column table
  # is wrong: the per-column tables contain different rows.
  dfout$repeated[i] <- sum(newdf[2, , ] == dfout$Value[i], na.rm = TRUE)
  dfout$s1N[i] <- sum(newdf[2, , 1] == dfout$Value[i], na.rm = TRUE)
  dfout$s2N[i] <- sum(newdf[2, , 2] == dfout$Value[i], na.rm = TRUE)
  # ss1: first parts of the cells in column 1 whose second part is this value
  getID <- which(newdf[2, , 1] == dfout$Value[i])
  if (length(getID) > 0) {
    dfout$ss1[i] <- toString(newdf[1, , 1][getID])
  } else {
    dfout$ss1[i] <- 0
  }
  # ss2: same for column 2
  getID <- which(newdf[2, , 2] == dfout$Value[i])
  if (length(getID) > 0) {
    dfout$ss2[i] <- toString(newdf[1, , 2][getID])
  } else {
    dfout$ss2[i] <- 0
  }
}
dfout
#  Value repeated s1N s2N  ss1 ss2
#1     4        3   2   1 3, 5   2
#2     1        1   1   0    4   0
#3    15        1   0   1    0   3
#4    16        1   0   1    0   7

编辑：改为循环处理任意数量（n 个）的 s 列

# Generalised version: works for any number of s columns in df.
# newdf is indexed as [part, row, column]; part 1 is the text before the dash,
# part 2 the value after it.
newdfstring <- apply(df, 1:2, function(v) unlist(strsplit(v, "-")))
newdf <- apply(newdfstring, 1:3, as.numeric)
dflength <- length(unique(array(newdf[2, , ])))
# find the number of s variables
slength <- length(newdf[1, 1, ])
# create a matrix of appropriate size
dfout <- matrix(data = NA, nrow = dflength, ncol = (2 + 2 * slength))
# create a (near)-empty names array, we will fill it in later
names <- c("Value", "repeated", rep("", 2 * slength))
# fill in the Values column
dfout[, 1] <- unique(array(newdf[2, , ]))
# the total count does not depend on the s variable, so it is computed once,
# directly from the data
for (i in seq_len(dflength)) {
  dfout[i, 2] <- sum(newdf[2, , ] == dfout[i, 1], na.rm = TRUE)
}
# loop for every s variable
for (j in seq_len(slength)) {
  # get their names, paste N or s and add them to the names array
  names[2 + j] <- paste0(names(df)[j], "N")
  names[2 + j + slength] <- paste0("s", names(df)[j])
  # loop to get the other values
  for (i in seq_len(dflength)) {
    # rows of column j holding this value; their number is the per-column
    # count. Indexing a per-column table with the row number taken from the
    # table of all values would pick the wrong row.
    getID <- which(newdf[2, , j] == dfout[i, 1])
    dfout[i, 2 + j] <- length(getID)
    if (length(getID) > 0) {
      # storing a string turns the whole matrix into character
      dfout[i, 2 + j + slength] <- toString(newdf[1, , j][getID])
    } else {
      dfout[i, 2 + j + slength] <- 0
    }
  }
}
colnames(dfout) <- names
as.data.frame(dfout)
#  Value repeated s1N s2N  ss1 ss2
#1     4        3   2   1 3, 5   2
#2     1        1   1   0    4   0
#3    15        1   0   1    0   3
#4    16        1   0   1    0   7

票数 1

Stack Overflow用户

发布于 2016-03-01 18:51:52

我已经根据你的真实数据完全重写了所有的代码，并在我的机器上进行了测试。因为它是一个相当大的数据，它需要一些时间来运行，在我看来，循环是不可避免的。

# function to split the strings
#   x : vector (character or factor) of strings delimited by "-"
# Returns a data frame with one row per element of x and one column per part.
# The parts are filled in row by row, so every element is expected to split
# into the same number of parts.
myfun <- function(x) {
  parts <- strsplit(as.character(x), "-")
  as.data.frame(matrix(unlist(parts), nrow = length(parts), byrow = TRUE))
}

# this returns a list of dataframes, one per column of df
list.v <- lapply(df[seq_len(ncol(df))], myfun)
# like this
head(list.v[[17]])

# Combining all the dfs at this point produced an error of mismatching # of
# columns, so the statement is kept for reference only:
# df2 <- do.call(rbind, list.v)

# some of the dfs in list.v are all NA's, they should be dropped
sum <- summary(list.v)
# Keep only the two-column dfs. A logical mask is used on purpose:
# list.v[-which(cond)] would drop EVERYTHING when no element matches, because
# -integer(0) selects nothing.
list.v <- list.v[sum[, 1] == "2"]

# now combine all dfs for indexing purposes
df2 <- do.call(rbind, list.v)

# create "value", "repeated" column in the desired result df.
# These codes are same as my previous answer
value <- names(table(df2[, 2]))
repeated <- as.vector(table(df2[, 2]))

# create an empty list to store the counts columns
list.count <- vector("list", length = length(list.v))

# every df in list.v has same number of rows, get the row number
rownum <- nrow(list.v[[1]])

# use a for loop to fill out list.count: df2 stacks the dfs of list.v, so the
# i-th df occupies rows (i - 1) * rownum + 1 through i * rownum
for (i in seq_along(list.count)) {
  row.start <- (i - 1) * rownum + 1
  row.end <- i * rownum
  # pin the levels to `value` so every count vector has one entry per value,
  # in the same order, even when a value is absent from this slice
  list.count[[i]] <- as.vector(
    table(factor(df2[, 2][row.start:row.end], levels = value))
  )
}

# combine the vectors in list.count and assign names
count.df <- do.call(cbind, list.count)
count.df <- as.data.frame(count.df)

# create & assign column names in the format of "s_n", and "_" is filled with
# corresponding original column name
names.cnt <- paste0("s", names(list.v), "n")
names(count.df) <- names.cnt

# concatenate the strings and store them into a matrix: cell [j, i] holds the
# comma-joined first parts of list.v[[i]] whose second part equals value[j]
# (an empty string when nothing matches)
ss.store <- matrix(NA_character_, nrow = length(value), ncol = length(list.v))
for (i in seq_along(list.v)) {
  ss.store[, i] <- vapply(
    value,
    function(v) {
      paste(list.v[[i]][, 1][which(list.v[[i]][, 2] == v)], collapse = ",")
    },
    character(1L),
    USE.NAMES = FALSE
  )
}

# create a df for strings
string.df <- as.data.frame(ss.store, stringsAsFactors = FALSE)

# create & assign names to the df: "ss" followed by the original column name
names.str <- paste0("ss", names(list.v))
names(string.df) <- names.str

# combine everything and form the new data frame
# (the option is spelled stringsAsFactors; a misspelled name is not treated as
# an option but is silently added as an extra column)
new.df <- cbind(value, repeated, count.df, string.df, stringsAsFactors = FALSE)

new.df[1:10, 1:15]
   value repeated sAn sF1n sF2n sF3n sF4n sF5n sF6n sF7n sF8n sF9n sF10n sF11n sF12n
1    100      155   3    0    0    0    0    0    0    0    0    0     0     0     0
2   1005       14   1    0    0    0    0    0    0    0    0    0     0     0     0
3   1006       50   1    0    0    0    0    0    0    0    0    0     0     0     0
4   1023        1   1    0    0    0    0    0    0    0    0    0     0     0     0
5   1025       38   1    0    0    0    0    0    0    0    0    0     0     0     0
6   1030      624   1    0    1    2    0    0    0    0    0    0     1     0     0
7   1035        1   1    0    0    0    0    0    0    0    0    0     0     0     0
8    104      165   2    0    0    0    0    0    0    0    0    0     0     0     0
9   1076      186   2    0    0    0    0    0    0    0    0    0     0     0     0
10  1078      333   3    0    0    0    0    0    0    0    0    0     0     0     0

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/35726892

复制

相似问题

问从数据帧中提取信息
EN

回答 4

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问从数据帧中提取信息EN

回答 4

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问从数据帧中提取信息
EN