实现anndata的read_10x_h5的逆操作write_10X_h5

一个简单的功能,将adata中存放的稀疏的表达量矩阵保存为可以用 Seurat::Read10X_h5读取的H5文件,代码如下

import h5py
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path

def write_10X_h5(adata, file):
    """Writes an AnnData object to a 10X Genomics formatted HDF5 file.
    
    This function creates a file compatible with Seurat's Read10X_h5 function. 
    It writes the sparse matrix data and associated metadata while ensuring correct data types and attributes.
    
    Args:
        adata: AnnData object containing the matrix and metadata.
        file: Output file path. Appends '.h5' if not present.
        
    Raises:
        FileExistsError: If the output file already exists.
    """
    
    # Ensure file extension is .h5
    file = file if file.endswith('.h5') else f'{file}.h5'
    
    # Check if the file already exists
    if Path(file).exists():
        raise FileExistsError(f"There already is a file `{file}`.")
    
    # Helper function to calculate max integer size
    def int_max(x):
        return int(max(np.floor(np.log10(max(x)+1)), 1) * 4)
    
    # Helper function to calculate max string size
    def str_max(x):
        return max(len(str(i)) for i in x) + 1  # +1 for null termination
    
    # must transpose
    X = adata.X.T
    # Create file and write data
    with h5py.File(file, 'w') as w:
        grp = w.create_group("matrix")
        #
        grp.create_dataset("data", data=X.data, dtype=np.float32)
        grp.create_dataset("indices", data=X.indices, dtype=np.int32)
        grp.create_dataset("indptr", data=X.indptr, dtype=np.int32)
        grp.create_dataset("shape", data=np.array(X.shape, dtype=np.int32))

        # Handling barcodes and features
        grp.create_dataset("barcodes", data=np.array(adata.obs_names, dtype=f'S{str_max(adata.obs_names)}'))
        
        ftrs = grp.create_group("features")
        ftrs.create_dataset("id", data=np.array(adata.var_names, dtype=f'S{str_max(adata.var_names)}'))
        ftrs.create_dataset("name", data=np.array(adata.var_names, dtype=f'S{str_max(adata.var_names)}'))

        # Optionally add more metadata fields if needed
        if 'genome' in adata.var.columns:
            ftrs.create_dataset("genome", data=np.array(adata.var['genome'], dtype=f'S{str_max(adata.var["genome"])}'))
        # set feature_type
        if 'feature_type' in adata.var.columns:
            ftrs.create_dataset("feature_type", data=np.array(adata.var['feature_type'], dtype=f'S{str_max(adata.var["feature_type"])}'))
        else:
            adata.var['feature_type'] = 'Gene Expression'
            ftrs.create_dataset("feature_type", data=np.array(adata.var['feature_type'], dtype=f'S{str_max(adata.var["feature_type"])}'))
            

使用方法如下,

write_10X_h5(adata, "adata.h5")

使用要求:adata必须没有经过特别的数据预处理,也就是说adata.X存放的就是稀疏矩阵

使用场景:当只有adata对象,没有10x输出的h5文件时,可以使用。

# 单细胞 

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×