一个简单的功能,将adata中存放的稀疏的表达量矩阵保存为可以用 Seurat::Read10X_h5
读取的H5文件,代码如下
import h5py
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path
def write_10X_h5(adata, file):
"""Writes an AnnData object to a 10X Genomics formatted HDF5 file.
This function creates a file compatible with Seurat's Read10X_h5 function.
It writes the sparse matrix data and associated metadata while ensuring correct data types and attributes.
Args:
adata: AnnData object containing the matrix and metadata.
file: Output file path. Appends '.h5' if not present.
Raises:
FileExistsError: If the output file already exists.
"""
# Ensure file extension is .h5
file = file if file.endswith('.h5') else f'{file}.h5'
# Check if the file already exists
if Path(file).exists():
raise FileExistsError(f"There already is a file `{file}`.")
# Helper function to calculate max integer size
def int_max(x):
return int(max(np.floor(np.log10(max(x)+1)), 1) * 4)
# Helper function to calculate max string size
def str_max(x):
return max(len(str(i)) for i in x) + 1 # +1 for null termination
# must transpose
X = adata.X.T
# Create file and write data
with h5py.File(file, 'w') as w:
grp = w.create_group("matrix")
#
grp.create_dataset("data", data=X.data, dtype=np.float32)
grp.create_dataset("indices", data=X.indices, dtype=np.int32)
grp.create_dataset("indptr", data=X.indptr, dtype=np.int32)
grp.create_dataset("shape", data=np.array(X.shape, dtype=np.int32))
# Handling barcodes and features
grp.create_dataset("barcodes", data=np.array(adata.obs_names, dtype=f'S{str_max(adata.obs_names)}'))
ftrs = grp.create_group("features")
ftrs.create_dataset("id", data=np.array(adata.var_names, dtype=f'S{str_max(adata.var_names)}'))
ftrs.create_dataset("name", data=np.array(adata.var_names, dtype=f'S{str_max(adata.var_names)}'))
# Optionally add more metadata fields if needed
if 'genome' in adata.var.columns:
ftrs.create_dataset("genome", data=np.array(adata.var['genome'], dtype=f'S{str_max(adata.var["genome"])}'))
# set feature_type
if 'feature_type' in adata.var.columns:
ftrs.create_dataset("feature_type", data=np.array(adata.var['feature_type'], dtype=f'S{str_max(adata.var["feature_type"])}'))
else:
adata.var['feature_type'] = 'Gene Expression'
ftrs.create_dataset("feature_type", data=np.array(adata.var['feature_type'], dtype=f'S{str_max(adata.var["feature_type"])}'))
使用方法如下,
write_10X_h5(adata, "adata.h5")
使用要求:adata必须没有经过特别的数据预处理,也就是说adata.X存放的就是稀疏矩阵
使用场景:当只有adata对象,没有10x输出的h5文件时,可以使用。