from pathlib import Path
import warnings
import numpy as np
import pandas as pd
import scanpy as sc
import snapatac2 as snap
import bioquest as bq
import sckit as sk
基因组注释文件
gff_file="~/DataHub/Genomics/GENCODE/hg38.v43.chr_patch_hapl_scaff.basic.annotation.gff3.gz"
h5ad_files = []
for i in Path("data").glob("**/fragments.tsv.gz"):
_s = str(i).split('/')[1]
print(_s)
_a = snap.pp.import_data(fragment_file=i, gff_file=gff_file,
chrom_size=sk.utils.Chrom_size.hg38, min_tsse=7, min_num_fragments=1000)
# create a cell by bin matrix containing insertion
# counts across genome-wide 500-bp bins.
snap.pp.add_tile_matrix(_a)
# feature selection
snap.pp.select_features(_a)
# Doublet removal
snap.pp.scrublet(_a)
snap.pp.call_doublets(_a)
h5ad_filename = f"{_s}.h5ad"
_a.write(h5ad_filename)
h5ad_files.append((_s, h5ad_filename))
# pbmc_5k
# 2023-03-27 20:01:46 - INFO - Simulating doublets...
# 2023-03-27 20:01:47 - INFO - Spectral embedding ...
# 2023-03-27 20:01:54 - INFO - Calculating doublet scores...
# pbmc_10k
# 2023-03-27 20:10:42 - INFO - Simulating doublets...
# 2023-03-27 20:10:43 - INFO - Spectral embedding ...
# 2023-03-27 20:11:21 - INFO - Calculating doublet scores...
AnnDataSet不同于anndata的是,AnnDataSet关联了两个本地文件pbmc_5k.h5ad和pbmc_10k.h5ad,因此在不用这个对象的时候需要手动关闭adata.close()
,类似open函数。
dataset = snap.AnnDataSet(adatas=h5ad_files, filename="pbmc.h5ads")
dataset.obs_names = dataset.obs['sample'].to_numpy() + ":" + np.array(dataset.obs_names)
dataset
# AnnDataSet object with n_obs x n_vars = 15957 x 6176550 backed at 'pbmc.h5ads'
# contains 2 AnnData objects with keys: 'pbmc_5k', 'pbmc_10k'
# obs: 'sample'
# var: 'selected'
# uns: 'AnnDataSet'
snap.pp.select_features(adata)
降维
randomly selects 10,000 cells(sample_size) as the landmarks to get an initial embedding, and then uses the Nystrom method to compute the embedding for the rest of cells.
snap.tl.spectral(adata, sample_size = 10000)
snap.pl.spectral_eigenvalues(adata, interactive=False)
snap.tl.umap(adata, use_dims = 12)
snap.pl.umap(adata, color="sample", interactive=False, width = 800)
使用harmony去除批次效应
snap.pp.harmony(adata, "sample", use_dims = 12, max_iter_harmony = 20)
snap.tl.umap(adata, use_rep="X_spectral_harmony")
snap.pl.umap(adata, color="sample", interactive=False, width=800)
snap.pp.harmony(adata, "sample", use_dims = 12, max_iter_harmony = 20)
snap.tl.umap(adata, use_rep="X_spectral_harmony")
snap.pl.umap(adata, color="sample", interactive=False, width=800)
使用leiden对细胞聚类
snap.pp.knn(adata, use_rep="X_spectral_harmony", use_dims = 12)
snap.tl.leiden(adata, resolution = 1.0)
snap.pl.umap(adata, color="leiden", interactive=False, width=600)
adata.close()