anndata by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill anndata
AnnData is a Python package for handling annotated data matrices, storing experimental measurements (X) alongside observation metadata (obs), variable metadata (var), and multi-dimensional annotations (obsm, varm, obsp, varp, uns). Originally designed for single-cell genomics through Scanpy, it now serves as a general-purpose framework for any annotated data requiring efficient storage, manipulation, and analysis.
Use this skill when you need to create, read, write, combine, or subset annotated data matrices, such as single-cell datasets.
uv pip install anndata
# With optional dependencies
uv pip install anndata[dev,test,doc]
import anndata as ad
import numpy as np
import pandas as pd
# Minimal creation
X = np.random.rand(100, 2000) # 100 cells × 2000 genes
adata = ad.AnnData(X)
# With metadata
obs = pd.DataFrame({
'cell_type': ['T cell', 'B cell'] * 50,
'sample': ['A', 'B'] * 50
}, index=[f'cell_{i}' for i in range(100)])
var = pd.DataFrame({
'gene_name': [f'Gene_{i}' for i in range(2000)]
}, index=[f'ENSG{i:05d}' for i in range(2000)])
adata = ad.AnnData(X=X, obs=obs, var=var)
# Read h5ad file
adata = ad.read_h5ad('data.h5ad')
# Read with backed mode (for large files)
adata = ad.read_h5ad('large_data.h5ad', backed='r')
# Read other formats
adata = ad.read_csv('data.csv')
adata = ad.read_loom('data.loom')
# 10X HDF5 files are read via scanpy rather than anndata
import scanpy as sc
adata = sc.read_10x_h5('filtered_feature_bc_matrix.h5')
# Write h5ad file
adata.write_h5ad('output.h5ad')
# Write with compression
adata.write_h5ad('output.h5ad', compression='gzip')
# Write other formats
adata.write_zarr('output.zarr')
adata.write_csvs('output_dir/')
# Subset by conditions
t_cells = adata[adata.obs['cell_type'] == 'T cell']
# Subset by indices
subset = adata[0:50, 0:100]
# Add metadata
adata.obs['quality_score'] = np.random.rand(adata.n_obs)
adata.var['highly_variable'] = np.random.rand(adata.n_vars) > 0.8
# Access dimensions
print(f"{adata.n_obs} observations × {adata.n_vars} variables")
Understand the AnnData object structure including X, obs, var, layers, obsm, varm, obsp, varp, uns, and raw components.
See references/data_structure.md for comprehensive information on these components.
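The remaining slots are dict-like containers keyed by name; a minimal sketch, continuing from the adata object built above (the key names and array shapes are illustrative, not required):
# Alternative matrices with the same shape as X
adata.layers['counts'] = adata.X.copy()
# Multi-dimensional per-observation annotations (e.g. embeddings)
adata.obsm['X_pca'] = np.random.rand(adata.n_obs, 50)
# Multi-dimensional per-variable annotations
adata.varm['loadings'] = np.random.rand(adata.n_vars, 50)
# Pairwise observation annotations (e.g. graphs), stored as sparse matrices
from scipy.sparse import random as sparse_random
adata.obsp['distances'] = sparse_random(adata.n_obs, adata.n_obs, density=0.01, format='csr')
# Unstructured metadata
adata.uns['experiment'] = {'protocol': '10x', 'date': '2024-01-01'}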
Read and write data in various formats with support for compression, backed mode, and cloud storage.
See references/io_operations.md for details on supported formats, compression, backed mode, and cloud storage.
Common commands:
# Read/write h5ad
adata = ad.read_h5ad('data.h5ad', backed='r')
adata.write_h5ad('output.h5ad', compression='gzip')
# Read 10X data (reader lives in scanpy: import scanpy as sc)
adata = sc.read_10x_h5('filtered_feature_bc_matrix.h5')
# Read MTX format
adata = ad.read_mtx('matrix.mtx').T
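Zarr stores, mentioned above for cloud storage, round-trip the same way (a brief sketch):
# Read/write Zarr stores
adata = ad.read_zarr('data.zarr')
adata.write_zarr('output.zarr')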
Combine multiple AnnData objects along observations or variables with flexible join strategies.
See references/concatenation.md for comprehensive coverage of the available join strategies.
Common commands:
# Concatenate observations (combine samples)
adata = ad.concat(
[adata1, adata2, adata3],
axis=0,
join='inner',
label='batch',
keys=['batch1', 'batch2', 'batch3']
)
# Concatenate variables (combine modalities)
adata = ad.concat([adata_rna, adata_protein], axis=1)
# Lazy concatenation
from anndata.experimental import AnnCollection
adatas = [ad.read_h5ad(f, backed='r') for f in ['data1.h5ad', 'data2.h5ad']]
collection = AnnCollection(
    adatas,  # expects AnnData objects (backed mode works), not file paths
    join_obs='outer',
    label='dataset'
)
Transform, subset, filter, and reorganize data efficiently.
See references/manipulation.md for detailed guidance on these operations.
Common commands:
# Subset by metadata
filtered = adata[adata.obs['quality_score'] > 0.8]
hv_genes = adata[:, adata.var['highly_variable']]
# Transpose
adata_T = adata.T
# Copy vs view
view = adata[0:100, :] # View (lightweight reference)
copy = adata[0:100, :].copy() # Independent copy
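# Note: modifying a view (e.g. writing to its .obs) materializes it into a copy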
# Convert strings to categoricals
adata.strings_to_categoricals()
Follow recommended patterns for memory efficiency, performance, and reproducibility.
See references/best_practices.md for guidelines on memory efficiency, performance, and reproducibility.
Key recommendations:
# Use sparse matrices for sparse data
from scipy.sparse import csr_matrix
adata.X = csr_matrix(adata.X)
# Convert strings to categoricals
adata.strings_to_categoricals()
# Use backed mode for large files
adata = ad.read_h5ad('large.h5ad', backed='r')
# Store raw before filtering
adata.raw = adata.copy()
adata = adata[:, adata.var['highly_variable']]
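The snapshot kept in .raw retains the full variable set and can be pulled back out later; a minimal sketch:
# Recover the pre-filtering values stored in .raw
full_matrix = adata.raw.X            # matrix over all original variables
adata_full = adata.raw.to_adata()    # rebuild a complete AnnData from the snapshot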
AnnData serves as the foundational data structure for the scverse ecosystem:
import scanpy as sc
# Preprocessing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
# Dimensionality reduction
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.umap(adata)
sc.tl.leiden(adata)
# Visualization
sc.pl.umap(adata, color=['cell_type', 'leiden'])
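Each step above writes its results back into AnnData slots, which is why the same object flows through the whole pipeline; roughly:
adata.obsm['X_pca']              # PCA coordinates from sc.pp.pca
adata.obsm['X_umap']             # UMAP embedding from sc.tl.umap
adata.obs['leiden']              # cluster labels from sc.tl.leiden
adata.obsp['connectivities']     # neighbor graph from sc.pp.neighbors
adata.uns['neighbors']           # neighbor-graph parameters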
import muon as mu
# Combine RNA and protein data
mdata = mu.MuData({'rna': adata_rna, 'protein': adata_protein})
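Each modality stays a plain AnnData inside the container (a brief sketch, using the keys from the constructor above):
mdata['rna']        # the RNA AnnData
mdata['protein']    # the protein AnnData
mdata.obs           # global observation metadata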
from anndata.experimental import AnnLoader
# Create DataLoader for deep learning
dataloader = AnnLoader(adata, batch_size=128, shuffle=True)
for batch in dataloader:
    X = batch.X
    # Train the model on this mini-batch
import anndata as ad
import numpy as np
import scanpy as sc
# 1. Load data
adata = sc.read_10x_h5('filtered_feature_bc_matrix.h5')
# 2. Quality control
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()   # sparse sums return (n, 1) matrices; flatten to 1-D
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
adata = adata[adata.obs['n_genes'] > 200]
adata = adata[adata.obs['n_counts'] < 50000].copy()  # materialize the filtered view before modifying it below
# 3. Store raw
adata.raw = adata.copy()
# 4. Normalize and filter
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
adata = adata[:, adata.var['highly_variable']]
# 5. Save processed data
adata.write_h5ad('processed.h5ad')
# Load multiple batches
adata1 = ad.read_h5ad('batch1.h5ad')
adata2 = ad.read_h5ad('batch2.h5ad')
adata3 = ad.read_h5ad('batch3.h5ad')
# Concatenate with batch labels
adata = ad.concat(
[adata1, adata2, adata3],
label='batch',
keys=['batch1', 'batch2', 'batch3'],
join='inner'
)
# Apply batch correction
import scanpy as sc
sc.pp.combat(adata, key='batch')
# Continue analysis
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# Open in backed mode
adata = ad.read_h5ad('100GB_dataset.h5ad', backed='r')
# Filter based on metadata (no data loading)
high_quality = adata[adata.obs['quality_score'] > 0.8]
# Load filtered subset
adata_subset = high_quality.to_memory()
# Process subset
process(adata_subset)
# Or process in chunks
chunk_size = 1000
for i in range(0, adata.n_obs, chunk_size):
    chunk = adata[i:i+chunk_size, :].to_memory()
    process(chunk)
Use backed mode or convert to sparse matrices:
# Backed mode
adata = ad.read_h5ad('file.h5ad', backed='r')
# Sparse matrices
from scipy.sparse import csr_matrix
adata.X = csr_matrix(adata.X)
Use compression and appropriate formats:
# Optimize for storage
adata.strings_to_categoricals()
adata.write_h5ad('file.h5ad', compression='gzip')
# Use Zarr for cloud storage
adata.write_zarr('file.zarr', chunks=(1000, 1000))
Always align external data on index:
# Wrong
adata.obs['new_col'] = external_data['values']
# Correct
adata.obs['new_col'] = external_data.set_index('cell_id').loc[adata.obs_names, 'values']
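If some observations may be missing from the external table, reindex fills the gaps with NaN instead of raising (a sketch using the same hypothetical external_data frame):
aligned = external_data.set_index('cell_id')['values'].reindex(adata.obs_names)
adata.obs['new_col'] = aligned.values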