



####zhaoyunfei
####20251115
library(Seurat)
library(NMF)
library(cluster)
library(dplyr)
library(ggplot2)
#' @param seurat_obj Seurat对象
#' @param k_values k值范围,默认2:18
#' @param n_top_genes 每个程序保留的top基因数,默认50
#' @return 包含所有NMF程序结果的列表
run_multiple_k_nmf <- function(seurat_obj, k_values = 2:18, n_top_genes = 50) {
# 获取归一化的表达矩阵
norm_data <- GetAssayData(seurat_obj, slot = "data")
norm_data <- as.matrix(norm_data)
# 移除零表达的基因
norm_data <- norm_data[rowSums(norm_data) > 0, ]
nmf_results <- list()
program_counter <- 1
for(k in k_values) {
cat("Running NMF with k =", k, "\n")
tryCatch({
# 运行NMF
nmf_fit <- nmf(norm_data, rank = k, method = "brunet", nrun = 10)
# 提取特征基因矩阵
feature_matrix <- basis(nmf_fit)
# 为每个程序提取top基因
for(i in 1:k) {
program_genes <- rownames(feature_matrix)[order(feature_matrix[, i], decreasing = TRUE)[1:n_top_genes]]
nmf_results[[program_counter]] <- list(
k_value = k,
program_id = paste0("k", k, "_program", i),
genes = program_genes,
scores = feature_matrix[program_genes, i]
)
program_counter <- program_counter + 1
}
}, error = function(e) {
cat("Error with k =", k, ":", e$message, "\n")
})
}
return(nmf_results)
}
#' @param setA 基因集合A
#' @param setB 基因集合B
#' @return Jaccard index
jaccard_index <- function(setA, setB) {
intersection <- length(intersect(setA, setB))
union <- length(union(setA, setB))
return(intersection / union)
}
#' 筛选稳健的NMF程序
#' @param nmf_results NMF结果列表
#' @param min_k_overlap 不同k值间最小基因重叠数,默认35
#' @param redundancy_threshold 冗余阈值,默认0.1
#' @param cross_sample_threshold 跨样本一致性阈值,默认0.1
#' @return 筛选后的NMF程序列表
filter_robust_programs <- function(nmf_results, min_k_overlap = 35,
redundancy_threshold = 0.1,
cross_sample_threshold = 0.1) {
# 按样本分组(这里假设所有程序来自同一个样本,如需多样本需调整)
programs_by_k <- split(nmf_results, sapply(nmf_results, function(x) x$k_value))
robust_programs <- list()
retained_programs <- c()
# 第一步:样本内筛选 - 不同k值间的重叠
for(program in nmf_results) {
program_genes <- program$genes
# 检查与其他k值的程序的重叠
overlap_count <- 0
for(other_program in nmf_results) {
if(other_program$k_value != program$k_value) {
overlap <- length(intersect(program_genes, other_program$genes))
if(overlap >= min_k_overlap) {
overlap_count <- overlap_count + 1
break
}
}
}
if(overlap_count > 0) {
robust_programs[[length(robust_programs) + 1]] <- program
retained_programs <- c(retained_programs, program$program_id)
}
}
# 第二步:去除冗余程序
non_redundant_programs <- list()
processed_pairs <- character()
for(i in 1:length(robust_programs)) {
redundant <- FALSE
prog_i <- robust_programs[[i]]
for(j in 1:length(robust_programs)) {
if(i != j) {
prog_j <- robust_programs[[j]]
pair_id <- paste(sort(c(prog_i$program_id, prog_j$program_id)), collapse = "_")
if(!pair_id %in% processed_pairs) {
jaccard <- jaccard_index(prog_i$genes, prog_j$genes)
if(jaccard >= redundancy_threshold) {
redundant <- TRUE
# 保留基因数较多或得分较高的程序
if(length(prog_i$genes) < length(prog_j$genes) ||
mean(prog_i$scores) < mean(prog_j$scores)) {
break
}
}
processed_pairs <- c(processed_pairs, pair_id)
}
}
}
if(!redundant) {
non_redundant_programs[[length(non_redundant_programs) + 1]] <- prog_i
}
}
return(non_redundant_programs)
}
#' 聚类NMF程序生成共识MPs
#' @param filtered_programs 筛选后的NMF程序
#' @param min_overlap 最小重叠阈值,默认0.1
#' @return 共识MPs列表
generate_consensus_mps <- function(filtered_programs, min_overlap = 0.1) {
if(length(filtered_programs) == 0) {
stop("No programs to cluster")
}
# 创建基因集合列表
gene_sets <- lapply(filtered_programs, function(x) x$genes)
names(gene_sets) <- sapply(filtered_programs, function(x) x$program_id)
# 计算杰卡德距离矩阵
n_programs <- length(gene_sets)
dist_matrix <- matrix(0, nrow = n_programs, ncol = n_programs)
for(i in 1:n_programs) {
for(j in 1:n_programs) {
if(i != j) {
jaccard <- jaccard_index(gene_sets[[i]], gene_sets[[j]])
dist_matrix[i, j] <- 1 - jaccard # 转换为距离
}
}
}
# 使用层次聚类
hc <- hclust(as.dist(dist_matrix), method = "average")
# 自动确定聚类数量(这里使用简单的启发式方法)
# 您可以根据需要调整聚类切割高度
clusters <- cutree(hc, h = 0.8) # 调整h参数以获得合适的聚类数
# 生成共识MPs
consensus_mps <- list()
unique_clusters <- unique(clusters)
for(cluster_id in unique_clusters) {
cluster_programs <- which(clusters == cluster_id)
cluster_genes <- unique(unlist(lapply(filtered_programs[cluster_programs], function(x) x$genes)))
# 计算基因频率
gene_freq <- table(unlist(lapply(filtered_programs[cluster_programs], function(x) x$genes)))
consensus_mps[[paste0("MP", cluster_id)]] <- list(
genes = names(sort(gene_freq, decreasing = TRUE)),
gene_frequency = sort(gene_freq, decreasing = TRUE),
n_programs = length(cluster_programs)
)
}
return(consensus_mps)
}
#' 计算MPs的GSVA分数
#' @param seurat_obj Seurat对象
#' @param consensus_mps 共识MPs
#' @return 包含MP分数的矩阵
calculate_mp_scores <- function(seurat_obj, consensus_mps) {
# 简化版的MP评分计算(实际GSVA需要更复杂的实现)
# 这里使用简单的平均表达量作为示例
norm_data <- GetAssayData(seurat_obj, slot = "data")
mp_scores <- matrix(0, nrow = length(consensus_mps), ncol = ncol(norm_data))
rownames(mp_scores) <- names(consensus_mps)
colnames(mp_scores) <- colnames(norm_data)
for(mp_name in names(consensus_mps)) {
mp_genes <- consensus_mps[[mp_name]]$genes[1:50] # 取前50个基因
available_genes <- mp_genes[mp_genes %in% rownames(norm_data)]
if(length(available_genes) > 0) {
mp_scores[mp_name, ] <- colMeans(norm_data[available_genes, , drop = FALSE])
}
}
return(mp_scores)
}
# 主分析流程
main_nmf_analysis <- function(seurat_obj) {
cat("Step 1: Running multiple k NMF\n")
nmf_results <- run_multiple_k_nmf(seurat_obj)
cat("Step 2: Filtering robust programs\n")
filtered_programs <- filter_robust_programs(nmf_results)
cat("Step 3: Generating consensus MPs\n")
consensus_mps <- generate_consensus_mps(filtered_programs)
cat("Step 4: Calculating MP scores\n")
mp_scores <- calculate_mp_scores(seurat_obj, consensus_mps)
cat("Step 5: Assigning MP labels to spots\n")
# 为每个spot分配MP标签
mp_assignments <- apply(mp_scores, 2, function(x) {
if(all(is.na(x)) || max(x) == 0) return(NA)
rownames(mp_scores)[which.max(x)]
})
# 将结果添加到Seurat对象
seurat_obj$MP_assignment <- mp_assignments
# 计算每个MP在每个样本中的比例
sample_mp_proportions <- table(seurat_obj$MP_assignment, seurat_obj$orig.ident)
sample_mp_proportions <- prop.table(sample_mp_proportions, margin = 2)
# 计算MP间的相关性
mp_correlations <- cor(t(sample_mp_proportions), method = "spearman")
return(list(
seurat_obj = seurat_obj,
nmf_results = nmf_results,
filtered_programs = filtered_programs,
consensus_mps = consensus_mps,
mp_scores = mp_scores,
sample_mp_proportions = sample_mp_proportions,
mp_correlations = mp_correlations
))
}
# 使用示例
# Seurat对象名为 seurat_obj
# results <- main_nmf_analysis(seurat_obj)
# 保存结果
# saveRDS(results, "nmf_analysis_results.rds")
# 可视化MP相关性
plot_mp_correlations <- function(mp_correlations) {
corr_melt <- reshape2::melt(mp_correlations)
ggplot(corr_melt, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limits = c(-1, 1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Meta-Programs", y = "Meta-Programs", fill = "Correlation")
}
# 可视化MP在样本中的分布
plot_mp_distribution <- function(sample_mp_proportions) {
dist_melt <- reshape2::melt(sample_mp_proportions)
ggplot(dist_melt, aes(x = Var2, y = value, fill = Var1)) +
geom_bar(stat = "identity", position = "fill") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Samples", y = "Proportion", fill = "Meta-Programs")
}
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。