cellxgene-census by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill cellxgene-census
CZ CELLxGENE 普查提供了对来自 CZ CELLxGENE Discover 的标准化单细胞基因组学数据的全面、版本化集合的程序化访问。此技能支持高效查询和分析数千个数据集中的数百万个细胞。
普查包含:
此技能应在以下情况下使用:
安装 Census API:
uv pip install cellxgene-census
对于机器学习工作流,安装额外的依赖项:
uv pip install cellxgene-census[experimental]
始终使用上下文管理器以确保正确清理资源:
import cellxgene_census
# 打开最新的稳定版本
with cellxgene_census.open_soma() as census:
# 使用普查数据
# 为可重复性打开特定版本
with cellxgene_census.open_soma(census_version="2023-07-25") as census:
# 使用普查数据
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
关键点:
- 始终使用上下文管理器(with 语句)进行自动清理
- 固定 census_version 以实现可重复分析
在查询表达数据之前,先探索可用的数据集和元数据。
访问摘要信息:
# 获取摘要统计信息
summary = census["census_info"]["summary"].read().concat().to_pandas()
print(f"总细胞数:{summary['total_cell_count'][0]}")
# 获取所有数据集
datasets = census["census_info"]["datasets"].read().concat().to_pandas()
# 按条件筛选数据集
covid_datasets = datasets[datasets["disease"].str.contains("COVID", na=False)]
查询细胞元数据以了解可用数据:
# 获取组织中唯一的细胞类型
cell_metadata = cellxgene_census.get_obs(
census,
"homo_sapiens",
value_filter="tissue_general == 'brain' and is_primary_data == True",
column_names=["cell_type"]
)
unique_cell_types = cell_metadata["cell_type"].unique()
print(f"在大脑中发现了 {len(unique_cell_types)} 种细胞类型")
# 按组织统计细胞数
tissue_counts = cell_metadata.groupby("tissue_general").size()
重要提示: 除非专门分析重复细胞,否则始终筛选 is_primary_data == True 以避免重复计数细胞。
对于返回 < 100k 个细胞且适合内存的查询,使用 get_anndata():
# 带有细胞类型和组织筛选器的基本查询
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens", # 或 "Mus musculus"
obs_value_filter="cell_type == 'B cell' and tissue_general == 'lung' and is_primary_data == True",
obs_column_names=["assay", "disease", "sex", "donor_id"],
)
# 使用多个筛选器查询特定基因
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
var_value_filter="feature_name in ['CD4', 'CD8A', 'CD19', 'FOXP3']",
obs_value_filter="cell_type == 'T cell' and disease == 'COVID-19' and is_primary_data == True",
obs_column_names=["cell_type", "tissue_general", "donor_id"],
)
筛选语法:
- 使用 obs_value_filter 进行细胞筛选
- 使用 var_value_filter 进行基因筛选
- 使用 and、or 组合条件
- 使用 in 处理多个值:tissue in ['lung', 'liver']
- 通过 obs_column_names 仅选择需要的列
单独获取元数据:
# 查询细胞元数据
cell_metadata = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="disease == 'COVID-19' and is_primary_data == True",
column_names=["cell_type", "tissue_general", "donor_id"]
)
# 查询基因元数据
gene_metadata = cellxgene_census.get_var(
census, "homo_sapiens",
value_filter="feature_name in ['CD4', 'CD8A']",
column_names=["feature_id", "feature_name", "feature_length"]
)
对于超过可用 RAM 的查询,使用 axis_query() 进行迭代处理:
import tiledbsoma as soma
# 创建轴查询
query = census["census_data"]["homo_sapiens"].axis_query(
measurement_name="RNA",
obs_query=soma.AxisQuery(
value_filter="tissue_general == 'brain' and is_primary_data == True"
),
var_query=soma.AxisQuery(
value_filter="feature_name in ['FOXP2', 'TBR1', 'SATB2']"
)
)
# 分块迭代表达矩阵
iterator = query.X("raw").tables()
for batch in iterator:
# batch 是一个 pyarrow.Table,包含列:
# - soma_data:表达值
# - soma_dim_0:细胞(obs)坐标
# - soma_dim_1:基因(var)坐标
process_batch(batch)
计算增量统计信息:
# 示例:计算平均表达量
n_observations = 0
sum_values = 0.0
iterator = query.X("raw").tables()
for batch in iterator:
values = batch["soma_data"].to_numpy()
n_observations += len(values)
sum_values += values.sum()
mean_expression = sum_values / n_observations
对于训练模型,使用实验性的 PyTorch 集成:
from cellxgene_census.experimental.ml import experiment_dataloader
with cellxgene_census.open_soma() as census:
# 创建数据加载器
dataloader = experiment_dataloader(
census["census_data"]["homo_sapiens"],
measurement_name="RNA",
X_name="raw",
obs_value_filter="tissue_general == 'liver' and is_primary_data == True",
obs_column_names=["cell_type"],
batch_size=128,
shuffle=True,
)
# 训练循环
for epoch in range(num_epochs):
for batch in dataloader:
X = batch["X"] # 基因表达张量
labels = batch["obs"]["cell_type"] # 细胞类型标签
# 前向传播
outputs = model(X)
loss = criterion(outputs, labels)
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
训练/测试分割:
from cellxgene_census.experimental.ml import ExperimentDataset
# 从实验创建数据集
dataset = ExperimentDataset(
experiment_axis_query,
layer_name="raw",
obs_column_names=["cell_type"],
batch_size=128,
)
# 分割为训练集和测试集
train_dataset, test_dataset = dataset.random_split(
split=[0.8, 0.2],
seed=42
)
将普查数据无缝集成到 scanpy 工作流中:
import scanpy as sc
# 从普查加载数据
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="cell_type == 'neuron' and tissue_general == 'cortex' and is_primary_data == True",
)
# 标准 scanpy 工作流
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
# 降维
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# 可视化
sc.pl.umap(adata, color=["cell_type", "tissue", "disease"])
查询并集成多个数据集:
# 策略 1:分别查询多个组织
tissues = ["lung", "liver", "kidney"]
adatas = []
for tissue in tissues:
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter=f"tissue_general == '{tissue}' and is_primary_data == True",
)
adata.obs["tissue"] = tissue
adatas.append(adata)
# 连接
combined = adatas[0].concatenate(adatas[1:])
# 策略 2:直接查询多个数据集
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="tissue_general in ['lung', 'liver', 'kidney'] and is_primary_data == True",
)
除非分析重复数据,否则始终在查询中包含 is_primary_data == True 以避免多次计数细胞:
obs_value_filter="cell_type == 'B cell' and is_primary_data == True"
在生产分析中始终指定普查版本:
census = cellxgene_census.open_soma(census_version="2023-07-25")
对于大型查询,首先检查细胞数量以避免内存问题:
# 获取细胞计数
metadata = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="tissue_general == 'brain' and is_primary_data == True",
column_names=["soma_joinid"]
)
n_cells = len(metadata)
print(f"查询将返回 {n_cells:,} 个细胞")
# 如果太大(>100k),使用核外处理
tissue_general 字段提供了比 tissue 更粗略的类别,适用于跨组织分析:
# 更广泛的分组
obs_value_filter="tissue_general == 'immune system'"
# 特定组织
obs_value_filter="tissue == 'peripheral blood mononuclear cell'"
通过指定仅需要的元数据列来最小化数据传输:
obs_column_names=["cell_type", "tissue_general", "disease"] # 不是所有列
分析特定基因时,验证哪些数据集测量了它们:
presence = cellxgene_census.get_presence_matrix(
census,
"homo_sapiens",
var_value_filter="feature_name in ['CD4', 'CD8A']"
)
首先探索元数据以了解可用数据,然后查询表达:
# 步骤 1:探索可用的内容
metadata = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="disease == 'COVID-19' and is_primary_data == True",
column_names=["cell_type", "tissue_general"]
)
print(metadata.value_counts())
# 步骤 2:基于发现进行查询
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="disease == 'COVID-19' and cell_type == 'T cell' and is_primary_data == True",
)
用于筛选的关键字段:
- cell_type, cell_type_ontology_term_id
- tissue, tissue_general, tissue_ontology_term_id
- disease, disease_ontology_term_id
- assay, assay_ontology_term_id
- donor_id, sex, self_reported_ethnicity
- development_stage, development_stage_ontology_term_id
- dataset_id
- is_primary_data(布尔值:True = 唯一细胞)
- feature_id(Ensembl 基因 ID,例如 "ENSG00000161798")
- feature_name(基因符号,例如 "FOXP2")
- feature_length(基因长度,以碱基对为单位)
此技能包含详细的参考文档:
全面的文档,涵盖:
何时阅读: 当您需要详细的模式信息、完整的元数据字段列表或复杂的筛选器语法时。
示例和模式,涵盖:
何时阅读: 当实现特定查询模式、寻找代码示例或排查常见问题时。
with cellxgene_census.open_soma() as census:
cells = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="tissue_general == 'lung' and is_primary_data == True",
column_names=["cell_type"]
)
print(cells["cell_type"].value_counts())
with cellxgene_census.open_soma() as census:
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
var_value_filter="feature_name in ['CD4', 'CD8A', 'CD19']",
obs_value_filter="cell_type in ['T cell', 'B cell'] and is_primary_data == True",
)
from cellxgene_census.experimental.ml import experiment_dataloader
with cellxgene_census.open_soma() as census:
dataloader = experiment_dataloader(
census["census_data"]["homo_sapiens"],
measurement_name="RNA",
X_name="raw",
obs_value_filter="is_primary_data == True",
obs_column_names=["cell_type"],
batch_size=128,
shuffle=True,
)
# 训练模型
for epoch in range(epochs):
for batch in dataloader:
# 训练逻辑
pass
with cellxgene_census.open_soma() as census:
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="cell_type == 'macrophage' and tissue_general in ['lung', 'liver', 'brain'] and is_primary_data == True",
)
# 分析跨组织的巨噬细胞差异
sc.tl.rank_genes_groups(adata, groupby="tissue_general")
- 使用 tissue 而不是 tissue_general 以获得更细的粒度
- 如已知,按 dataset_id 筛选
- 使用 var_value_filter 选择更少的基因
- 使用 axis_query() 进行核外处理
- 在筛选器中包含 is_primary_data == True
- 使用 feature_id(Ensembl ID)而不是 feature_name
- 显式指定 census_version
每周安装
121
仓库
GitHub Stars
22.6K
首次出现
Jan 21, 2026
安全审计
安装于
claude-code101
opencode95
gemini-cli89
cursor88
antigravity83
codex78
The CZ CELLxGENE Census provides programmatic access to a comprehensive, versioned collection of standardized single-cell genomics data from CZ CELLxGENE Discover. This skill enables efficient querying and analysis of millions of cells across thousands of datasets.
The Census includes:
This skill should be used when:
Install the Census API:
uv pip install cellxgene-census
For machine learning workflows, install additional dependencies:
uv pip install cellxgene-census[experimental]
Always use the context manager to ensure proper resource cleanup:
import cellxgene_census
# Open latest stable version
with cellxgene_census.open_soma() as census:
# Work with census data
# Open specific version for reproducibility
with cellxgene_census.open_soma(census_version="2023-07-25") as census:
# Work with census data
Key points:
- Always use the context manager (with statement) for automatic cleanup
- Pin census_version for reproducible analyses
Before querying expression data, explore available datasets and metadata.
Access summary information:
# Get summary statistics
summary = census["census_info"]["summary"].read().concat().to_pandas()
print(f"Total cells: {summary['total_cell_count'][0]}")
# Get all datasets
datasets = census["census_info"]["datasets"].read().concat().to_pandas()
# Filter datasets by criteria
covid_datasets = datasets[datasets["disease"].str.contains("COVID", na=False)]
Query cell metadata to understand available data:
# Get unique cell types in a tissue
cell_metadata = cellxgene_census.get_obs(
census,
"homo_sapiens",
value_filter="tissue_general == 'brain' and is_primary_data == True",
column_names=["cell_type"]
)
unique_cell_types = cell_metadata["cell_type"].unique()
print(f"Found {len(unique_cell_types)} cell types in brain")
# Count cells by tissue
tissue_counts = cell_metadata.groupby("tissue_general").size()
Important: Always filter for is_primary_data == True to avoid counting duplicate cells unless specifically analyzing duplicates.
For queries returning < 100k cells that fit in memory, use get_anndata():
# Basic query with cell type and tissue filters
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens", # or "Mus musculus"
obs_value_filter="cell_type == 'B cell' and tissue_general == 'lung' and is_primary_data == True",
obs_column_names=["assay", "disease", "sex", "donor_id"],
)
# Query specific genes with multiple filters
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
var_value_filter="feature_name in ['CD4', 'CD8A', 'CD19', 'FOXP3']",
obs_value_filter="cell_type == 'T cell' and disease == 'COVID-19' and is_primary_data == True",
obs_column_names=["cell_type", "tissue_general", "donor_id"],
)
Filter syntax:
- Use obs_value_filter for cell filtering
- Use var_value_filter for gene filtering
- Combine conditions with and, or
- Use in for multiple values: tissue in ['lung', 'liver']
- Select only the columns you need with obs_column_names
Getting metadata separately:
# Query cell metadata
cell_metadata = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="disease == 'COVID-19' and is_primary_data == True",
column_names=["cell_type", "tissue_general", "donor_id"]
)
# Query gene metadata
gene_metadata = cellxgene_census.get_var(
census, "homo_sapiens",
value_filter="feature_name in ['CD4', 'CD8A']",
column_names=["feature_id", "feature_name", "feature_length"]
)
For queries exceeding available RAM, use axis_query() with iterative processing:
import tiledbsoma as soma
# Create axis query
query = census["census_data"]["homo_sapiens"].axis_query(
measurement_name="RNA",
obs_query=soma.AxisQuery(
value_filter="tissue_general == 'brain' and is_primary_data == True"
),
var_query=soma.AxisQuery(
value_filter="feature_name in ['FOXP2', 'TBR1', 'SATB2']"
)
)
# Iterate through expression matrix in chunks
iterator = query.X("raw").tables()
for batch in iterator:
# batch is a pyarrow.Table with columns:
# - soma_data: expression value
# - soma_dim_0: cell (obs) coordinate
# - soma_dim_1: gene (var) coordinate
process_batch(batch)
Computing incremental statistics:
# Example: Calculate mean expression
n_observations = 0
sum_values = 0.0
iterator = query.X("raw").tables()
for batch in iterator:
values = batch["soma_data"].to_numpy()
n_observations += len(values)
sum_values += values.sum()
mean_expression = sum_values / n_observations
For training models, use the experimental PyTorch integration:
from cellxgene_census.experimental.ml import experiment_dataloader
with cellxgene_census.open_soma() as census:
# Create dataloader
dataloader = experiment_dataloader(
census["census_data"]["homo_sapiens"],
measurement_name="RNA",
X_name="raw",
obs_value_filter="tissue_general == 'liver' and is_primary_data == True",
obs_column_names=["cell_type"],
batch_size=128,
shuffle=True,
)
# Training loop
for epoch in range(num_epochs):
for batch in dataloader:
X = batch["X"] # Gene expression tensor
labels = batch["obs"]["cell_type"] # Cell type labels
# Forward pass
outputs = model(X)
loss = criterion(outputs, labels)
# Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()
Train/test splitting:
from cellxgene_census.experimental.ml import ExperimentDataset
# Create dataset from experiment
dataset = ExperimentDataset(
experiment_axis_query,
layer_name="raw",
obs_column_names=["cell_type"],
batch_size=128,
)
# Split into train and test
train_dataset, test_dataset = dataset.random_split(
split=[0.8, 0.2],
seed=42
)
Seamlessly integrate Census data with scanpy workflows:
import scanpy as sc
# Load data from Census
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="cell_type == 'neuron' and tissue_general == 'cortex' and is_primary_data == True",
)
# Standard scanpy workflow
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
# Dimensionality reduction
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# Visualization
sc.pl.umap(adata, color=["cell_type", "tissue", "disease"])
Query and integrate multiple datasets:
# Strategy 1: Query multiple tissues separately
tissues = ["lung", "liver", "kidney"]
adatas = []
for tissue in tissues:
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter=f"tissue_general == '{tissue}' and is_primary_data == True",
)
adata.obs["tissue"] = tissue
adatas.append(adata)
# Concatenate
combined = adatas[0].concatenate(adatas[1:])
# Strategy 2: Query multiple datasets directly
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="tissue_general in ['lung', 'liver', 'kidney'] and is_primary_data == True",
)
Unless analyzing duplicates, always include is_primary_data == True in queries to avoid counting cells multiple times:
obs_value_filter="cell_type == 'B cell' and is_primary_data == True"
Always specify the Census version in production analyses:
census = cellxgene_census.open_soma(census_version="2023-07-25")
For large queries, first check the number of cells to avoid memory issues:
# Get cell count
metadata = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="tissue_general == 'brain' and is_primary_data == True",
column_names=["soma_joinid"]
)
n_cells = len(metadata)
print(f"Query will return {n_cells:,} cells")
# If too large (>100k), use out-of-core processing
The tissue_general field provides coarser categories than tissue, useful for cross-tissue analyses:
# Broader grouping
obs_value_filter="tissue_general == 'immune system'"
# Specific tissue
obs_value_filter="tissue == 'peripheral blood mononuclear cell'"
Minimize data transfer by specifying only required metadata columns:
obs_column_names=["cell_type", "tissue_general", "disease"] # Not all columns
When analyzing specific genes, verify which datasets measured them:
presence = cellxgene_census.get_presence_matrix(
census,
"homo_sapiens",
var_value_filter="feature_name in ['CD4', 'CD8A']"
)
First explore metadata to understand available data, then query expression:
# Step 1: Explore what's available
metadata = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="disease == 'COVID-19' and is_primary_data == True",
column_names=["cell_type", "tissue_general"]
)
print(metadata.value_counts())
# Step 2: Query based on findings
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="disease == 'COVID-19' and cell_type == 'T cell' and is_primary_data == True",
)
Key fields for filtering:
- cell_type, cell_type_ontology_term_id
- tissue, tissue_general, tissue_ontology_term_id
- disease, disease_ontology_term_id
- assay, assay_ontology_term_id
- donor_id, sex
- feature_id (Ensembl gene ID, e.g., "ENSG00000161798")
- feature_name (Gene symbol, e.g., "FOXP2")
- feature_length (Gene length in base pairs)
This skill includes detailed reference documentation:
Comprehensive documentation of:
When to read: When you need detailed schema information, full list of metadata fields, or complex filter syntax.
Examples and patterns for:
When to read: When implementing specific query patterns, looking for code examples, or troubleshooting common issues.
with cellxgene_census.open_soma() as census:
cells = cellxgene_census.get_obs(
census, "homo_sapiens",
value_filter="tissue_general == 'lung' and is_primary_data == True",
column_names=["cell_type"]
)
print(cells["cell_type"].value_counts())
with cellxgene_census.open_soma() as census:
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
var_value_filter="feature_name in ['CD4', 'CD8A', 'CD19']",
obs_value_filter="cell_type in ['T cell', 'B cell'] and is_primary_data == True",
)
from cellxgene_census.experimental.ml import experiment_dataloader
with cellxgene_census.open_soma() as census:
dataloader = experiment_dataloader(
census["census_data"]["homo_sapiens"],
measurement_name="RNA",
X_name="raw",
obs_value_filter="is_primary_data == True",
obs_column_names=["cell_type"],
batch_size=128,
shuffle=True,
)
# Train model
for epoch in range(epochs):
for batch in dataloader:
# Training logic
pass
with cellxgene_census.open_soma() as census:
adata = cellxgene_census.get_anndata(
census=census,
organism="Homo sapiens",
obs_value_filter="cell_type == 'macrophage' and tissue_general in ['lung', 'liver', 'brain'] and is_primary_data == True",
)
# Analyze macrophage differences across tissues
sc.tl.rank_genes_groups(adata, groupby="tissue_general")
- Use tissue instead of tissue_general for finer granularity
- Filter by dataset_id if known
- Select fewer genes with var_value_filter
- Use axis_query() for out-of-core processing
- Include is_primary_data == True in filters
- Use feature_id instead of feature_name
- Specify census_version explicitly
Weekly Installs
121
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass | Socket: Pass | Snyk: Warn
Installed on
claude-code101
opencode95
gemini-cli89
cursor88
antigravity83
codex78
Excel财务建模规范与xlsx文件处理指南:专业格式、零错误公式与数据分析
45,800 周安装
Playwright CLI 终端浏览器自动化技能 - 无需全局安装,快速实现网页交互与测试
1,300 周安装
WordPress插件开发指南:架构、安全、设置API与生命周期管理
1,300 周安装
Element Plus Vue3 使用指南:安装、配置、组件详解与问题排查
1,200 周安装
App Store Connect 自动化创建应用指南:asc-app-create-ui 技能详解
1,200 周安装
内容营销实战指南:23位产品领导者框架,打造高效SEO与品牌内容策略
1,200 周安装
品牌叙事指南:30位专家教你打造难忘品牌故事 | 品牌营销与内容创作
1,200 周安装
- self_reported_ethnicity
- development_stage, development_stage_ontology_term_id
- dataset_id
- is_primary_data (Boolean: True = unique cell)