pyvene-interventions by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill pyvene-interventions

pyvene 是斯坦福 NLP 用于对 PyTorch 模型执行因果干预的库。它提供了一个基于字典的声明式框架,用于激活修补、因果追踪和交换干预训练,使得干预实验可重现且可共享。
GitHub : stanfordnlp/pyvene (840+ stars) 论文 : pyvene: A Library for Understanding and Improving PyTorch Models via Interventions (NAACL 2024)
在以下情况下使用 pyvene:
在以下情况下考虑替代方案:
pip install pyvene
标准导入:
import pyvene as pv
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
包装任何 PyTorch 模型并赋予其干预能力的主要类:
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
# 加载基础模型
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# 定义干预配置
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=8,
component="block_output",
intervention_type=pv.VanillaIntervention,
)
]
)
# 创建可干预模型
intervenable = pv.IntervenableModel(config, model)
| 类型 | 描述 | 用例 |
|---|---|---|
VanillaIntervention | 在不同运行之间交换激活 | 激活修补 |
AdditionIntervention | 向基础运行添加激活 | 引导、消融 |
SubtractionIntervention | 减去激活 | 消融 |
ZeroIntervention | 将激活置零 | 组件敲除 |
RotatedSpaceIntervention | DAS 可训练干预 | 因果发现 |
CollectIntervention | 收集激活 | 探测、分析 |
# 可干预的可用组件
components = [
"block_input", # Transformer 块的输入
"block_output", # Transformer 块的输出
"mlp_input", # MLP 的输入
"mlp_output", # MLP 的输出
"mlp_activation", # MLP 隐藏层激活
"attention_input", # 注意力机制的输入
"attention_output", # 注意力机制的输出
"attention_value_output", # 注意力值向量
"query_output", # 查询向量
"key_output", # 键向量
"value_output", # 值向量
"head_attention_value_output", # 每个头的值向量
]
通过破坏输入并恢复激活来定位事实关联的存储位置。
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
# 1. 定义干净和损坏的输入
clean_prompt = "The Space Needle is in downtown"
corrupted_prompt = "The ##### ###### ## ## ########" # 噪声
clean_tokens = tokenizer(clean_prompt, return_tensors="pt")
corrupted_tokens = tokenizer(corrupted_prompt, return_tensors="pt")
# 2. 获取干净激活(源)
with torch.no_grad():
clean_outputs = model(**clean_tokens, output_hidden_states=True)
clean_states = clean_outputs.hidden_states
# 3. 定义恢复干预
def run_causal_trace(layer, position):
    """Patch the clean activation back in at one (layer, position) of the
    corrupted run, and return the model's probability of the correct
    completion token (" Seattle")."""
    trace_config = pv.IntervenableConfig(
        representations=[
            pv.RepresentationConfig(
                layer=layer,
                component="block_output",
                intervention_type=pv.VanillaIntervention,
                unit="pos",
                max_number_of_units=1,
            )
        ]
    )
    traced_model = pv.IntervenableModel(trace_config, model)

    # Base run is the corrupted prompt; the source run donates the clean
    # activation that gets swapped in at `position`.
    _, patched = traced_model(
        base=corrupted_tokens,
        sources=[clean_tokens],
        unit_locations={"sources->base": ([[[position]]], [[[position]]])},
        output_original_output=True,
    )

    # Probability assigned to the target token at the final position.
    final_probs = torch.softmax(patched.logits[0, -1], dim=-1)
    target_id = tokenizer.encode(" Seattle")[0]
    return final_probs[target_id].item()
# 4. 遍历层和位置
n_layers = model.config.n_layer
seq_len = clean_tokens["input_ids"].shape[1]
results = torch.zeros(n_layers, seq_len)
for layer in range(n_layers):
for pos in range(seq_len):
results[layer, pos] = run_causal_trace(layer, pos)
# 5. 可视化(层 x 位置热图)
# 高值表示因果重要性
测试哪些组件对特定行为是必需的。
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# IOI 任务设置
clean_prompt = "When John and Mary went to the store, Mary gave a bottle to"
corrupted_prompt = "When John and Mary went to the store, John gave a bottle to"
clean_tokens = tokenizer(clean_prompt, return_tensors="pt")
corrupted_tokens = tokenizer(corrupted_prompt, return_tensors="pt")
john_token = tokenizer.encode(" John")[0]
mary_token = tokenizer.encode(" Mary")[0]
def logit_diff(logits):
    """Return the IO - S logit difference at the final sequence position."""
    last_token_logits = logits[0, -1]
    return last_token_logits[john_token] - last_token_logits[mary_token]
# 在每一层修补注意力输出
def patch_attention(layer):
    """Swap the clean run's attention output into the corrupted run at
    `layer` and return the resulting IO - S logit difference."""
    patch_config = pv.IntervenableConfig(
        representations=[
            pv.RepresentationConfig(
                layer=layer,
                component="attention_output",
                intervention_type=pv.VanillaIntervention,
            )
        ]
    )
    patched_model = pv.IntervenableModel(patch_config, model)

    _, patched_out = patched_model(
        base=corrupted_tokens,
        sources=[clean_tokens],
    )
    return logit_diff(patched_out.logits).item()
# 找出哪些层重要
results = []
for layer in range(model.config.n_layer):
diff = patch_attention(layer)
results.append(diff)
print(f"Layer {layer}: logit diff = {diff:.3f}")
训练干预以发现因果结构。
import pyvene as pv
from transformers import AutoModelForCausalLM
import torch
model = AutoModelForCausalLM.from_pretrained("gpt2")
# 1. 定义可训练干预
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=6,
component="block_output",
intervention_type=pv.RotatedSpaceIntervention, # 可训练
low_rank_dimension=64, # 学习 64 维子空间
)
]
)
intervenable = pv.IntervenableModel(config, model)
# 2. 设置训练
optimizer = torch.optim.Adam(
intervenable.get_trainable_parameters(),
lr=1e-4
)
# 3. 训练循环(简化版)
for base_input, source_input, target_output in dataloader:
optimizer.zero_grad()
_, outputs = intervenable(
base=base_input,
sources=[source_input],
)
loss = criterion(outputs.logits, target_output)
loss.backward()
optimizer.step()
# 4. 分析学习到的干预
# 旋转矩阵揭示了因果子空间
rotation = intervenable.interventions["layer.6.block_output"][0].rotate_layer
# 低秩旋转寻找可解释的子空间
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=8,
component="block_output",
intervention_type=pv.LowRankRotatedSpaceIntervention,
low_rank_dimension=1, # 寻找 1D 因果方向
)
]
)
在生成过程中引导模型行为。
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# 加载预训练的引导干预
intervenable = pv.IntervenableModel.load(
"zhengxuanzenwu/intervenable_honest_llama2_chat_7B",
model=model,
)
# 使用引导进行生成
prompt = "Is the earth flat?"
inputs = tokenizer(prompt, return_tensors="pt")
# 在生成过程中应用干预
outputs = intervenable.generate(
inputs,
max_new_tokens=100,
do_sample=False,
)
print(tokenizer.decode(outputs[0]))
# 本地保存
intervenable.save("./my_intervention")
# 从本地加载
intervenable = pv.IntervenableModel.load(
"./my_intervention",
model=model,
)
# 在 HuggingFace 上共享
intervenable.save_intervention("username/my-intervention")
# 从 HuggingFace 加载
intervenable = pv.IntervenableModel.load(
"username/my-intervention",
model=model,
)
# 错误:组件名称不正确
config = pv.RepresentationConfig(
component="mlp", # 无效!
)
# 正确:使用确切的组件名称
config = pv.RepresentationConfig(
component="mlp_output", # 有效
)
# 确保源和基础具有兼容的形状
# 对于位置特定的干预:
config = pv.RepresentationConfig(
unit="pos",
max_number_of_units=1, # 干预单个位置
)
# 明确指定位置
intervenable(
base=base_tokens,
sources=[source_tokens],
unit_locations={"sources->base": ([[[5]]], [[[5]]])}, # 位置 5
)
# 使用梯度检查点
model.gradient_checkpointing_enable()
# 或者干预更少的组件
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=8, # 单层而不是所有层
component="block_output",
)
]
)
# pyvene v0.1.8+ 支持将 LoRA 作为干预
config = pv.RepresentationConfig(
intervention_type=pv.LoRAIntervention,
low_rank_dimension=16,
)
| 类 | 用途 |
|---|---|
IntervenableModel | 干预的主要包装器 |
IntervenableConfig | 配置容器 |
RepresentationConfig | 单个干预规范 |
VanillaIntervention | 激活交换 |
RotatedSpaceIntervention | 可训练的 DAS 干预 |
CollectIntervention | 激活收集 |
pyvene 适用于任何 PyTorch 模型。已在以下模型上测试:
有关详细的 API 文档、教程和高级用法,请参阅 references/ 文件夹:
| 文件 | 内容 |
|---|---|
| references/README.md | 概述和快速入门指南 |
| references/api.md | IntervenableModel、干预类型、配置的完整 API 参考 |
| references/tutorials.md | 因果追踪、激活修补、DAS 的分步教程 |
| 特性 | pyvene | TransformerLens | nnsight |
|---|---|---|---|
| 声明式配置 | 是 | 否 | 否 |
| HuggingFace 共享 | 是 | 否 | 否 |
| 可训练干预 | 是 | 有限 | 是 |
| 任何 PyTorch 模型 | 是 | 仅限 Transformers | 是 |
| 远程执行 | 否 | 否 | 是 (NDIF) |
每周安装次数
144
仓库
GitHub Stars
22.6K
首次出现
Jan 21, 2026
安全审计
安装于
claude-code119
opencode115
gemini-cli107
cursor106
codex95
antigravity94
pyvene is Stanford NLP's library for performing causal interventions on PyTorch models. It provides a declarative, dict-based framework for activation patching, causal tracing, and interchange intervention training - making intervention experiments reproducible and shareable.
GitHub : stanfordnlp/pyvene (840+ stars) Paper : pyvene: A Library for Understanding and Improving PyTorch Models via Interventions (NAACL 2024)
Use pyvene when you need to:
Consider alternatives when:
pip install pyvene
Standard import:
import pyvene as pv
The main class that wraps any PyTorch model with intervention capabilities:
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load base model
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Define intervention configuration
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=8,
component="block_output",
intervention_type=pv.VanillaIntervention,
)
]
)
# Create intervenable model
intervenable = pv.IntervenableModel(config, model)
| Type | Description | Use Case |
|---|---|---|
VanillaIntervention | Swap activations between runs | Activation patching |
AdditionIntervention | Add activations to base run | Steering, ablation |
SubtractionIntervention | Subtract activations | Ablation |
ZeroIntervention | Zero out activations | Component knockout |
RotatedSpaceIntervention | DAS trainable intervention | Causal discovery |
CollectIntervention | Collect activations | Probing, analysis |
# Available components to intervene on
components = [
"block_input", # Input to transformer block
"block_output", # Output of transformer block
"mlp_input", # Input to MLP
"mlp_output", # Output of MLP
"mlp_activation", # MLP hidden activations
"attention_input", # Input to attention
"attention_output", # Output of attention
"attention_value_output", # Attention value vectors
"query_output", # Query vectors
"key_output", # Key vectors
"value_output", # Value vectors
"head_attention_value_output", # Per-head values
]
Locate where factual associations are stored by corrupting inputs and restoring activations.
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
# 1. Define clean and corrupted inputs
clean_prompt = "The Space Needle is in downtown"
corrupted_prompt = "The ##### ###### ## ## ########" # Noise
clean_tokens = tokenizer(clean_prompt, return_tensors="pt")
corrupted_tokens = tokenizer(corrupted_prompt, return_tensors="pt")
# 2. Get clean activations (source)
with torch.no_grad():
clean_outputs = model(**clean_tokens, output_hidden_states=True)
clean_states = clean_outputs.hidden_states
# 3. Define restoration intervention
def run_causal_trace(layer, position):
    """Restore the clean activation at (layer, position) in the corrupted
    run and report the probability of " Seattle" as the next token."""
    cfg = pv.IntervenableConfig(
        representations=[
            pv.RepresentationConfig(
                layer=layer,
                component="block_output",
                intervention_type=pv.VanillaIntervention,
                unit="pos",
                max_number_of_units=1,
            )
        ]
    )
    wrapped = pv.IntervenableModel(cfg, model)

    # The corrupted prompt is the base; the clean run supplies the
    # activation swapped in at `position`.
    _, restored = wrapped(
        base=corrupted_tokens,
        sources=[clean_tokens],
        unit_locations={"sources->base": ([[[position]]], [[[position]]])},
        output_original_output=True,
    )

    distribution = torch.softmax(restored.logits[0, -1], dim=-1)
    answer_id = tokenizer.encode(" Seattle")[0]
    return distribution[answer_id].item()
# 4. Sweep over layers and positions
n_layers = model.config.n_layer
seq_len = clean_tokens["input_ids"].shape[1]
results = torch.zeros(n_layers, seq_len)
for layer in range(n_layers):
for pos in range(seq_len):
results[layer, pos] = run_causal_trace(layer, pos)
# 5. Visualize (layer x position heatmap)
# High values indicate causal importance
Test which components are necessary for a specific behavior.
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# IOI task setup
clean_prompt = "When John and Mary went to the store, Mary gave a bottle to"
corrupted_prompt = "When John and Mary went to the store, John gave a bottle to"
clean_tokens = tokenizer(clean_prompt, return_tensors="pt")
corrupted_tokens = tokenizer(corrupted_prompt, return_tensors="pt")
john_token = tokenizer.encode(" John")[0]
mary_token = tokenizer.encode(" Mary")[0]
def logit_diff(logits):
    """IO - S logit difference, read off the final sequence position."""
    final = logits[0, -1]
    return final[john_token] - final[mary_token]
# Patch attention output at each layer
def patch_attention(layer):
    """Patch the clean attention output into the corrupted run at `layer`
    and return the IO - S logit difference that results."""
    layer_cfg = pv.IntervenableConfig(
        representations=[
            pv.RepresentationConfig(
                layer=layer,
                component="attention_output",
                intervention_type=pv.VanillaIntervention,
            )
        ]
    )
    wrapped = pv.IntervenableModel(layer_cfg, model)

    _, result = wrapped(
        base=corrupted_tokens,
        sources=[clean_tokens],
    )
    return logit_diff(result.logits).item()
# Find which layers matter
results = []
for layer in range(model.config.n_layer):
diff = patch_attention(layer)
results.append(diff)
print(f"Layer {layer}: logit diff = {diff:.3f}")
Train interventions to discover causal structure.
import pyvene as pv
from transformers import AutoModelForCausalLM
import torch
model = AutoModelForCausalLM.from_pretrained("gpt2")
# 1. Define trainable intervention
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=6,
component="block_output",
intervention_type=pv.RotatedSpaceIntervention, # Trainable
low_rank_dimension=64, # Learn 64-dim subspace
)
]
)
intervenable = pv.IntervenableModel(config, model)
# 2. Set up training
optimizer = torch.optim.Adam(
intervenable.get_trainable_parameters(),
lr=1e-4
)
# 3. Training loop (simplified)
for base_input, source_input, target_output in dataloader:
optimizer.zero_grad()
_, outputs = intervenable(
base=base_input,
sources=[source_input],
)
loss = criterion(outputs.logits, target_output)
loss.backward()
optimizer.step()
# 4. Analyze learned intervention
# The rotation matrix reveals causal subspace
rotation = intervenable.interventions["layer.6.block_output"][0].rotate_layer
# Low-rank rotation finds interpretable subspaces
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=8,
component="block_output",
intervention_type=pv.LowRankRotatedSpaceIntervention,
low_rank_dimension=1, # Find 1D causal direction
)
]
)
Steer model behavior during generation.
import pyvene as pv
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Load pre-trained steering intervention
intervenable = pv.IntervenableModel.load(
"zhengxuanzenwu/intervenable_honest_llama2_chat_7B",
model=model,
)
# Generate with steering
prompt = "Is the earth flat?"
inputs = tokenizer(prompt, return_tensors="pt")
# Intervention applied during generation
outputs = intervenable.generate(
inputs,
max_new_tokens=100,
do_sample=False,
)
print(tokenizer.decode(outputs[0]))
# Save locally
intervenable.save("./my_intervention")
# Load from local
intervenable = pv.IntervenableModel.load(
"./my_intervention",
model=model,
)
# Share on HuggingFace
intervenable.save_intervention("username/my-intervention")
# Load from HuggingFace
intervenable = pv.IntervenableModel.load(
"username/my-intervention",
model=model,
)
# WRONG: Incorrect component name
config = pv.RepresentationConfig(
component="mlp", # Not valid!
)
# RIGHT: Use exact component name
config = pv.RepresentationConfig(
component="mlp_output", # Valid
)
# Ensure source and base have compatible shapes
# For position-specific interventions:
config = pv.RepresentationConfig(
unit="pos",
max_number_of_units=1, # Intervene on single position
)
# Specify locations explicitly
intervenable(
base=base_tokens,
sources=[source_tokens],
unit_locations={"sources->base": ([[[5]]], [[[5]]])}, # Position 5
)
# Use gradient checkpointing
model.gradient_checkpointing_enable()
# Or intervene on fewer components
config = pv.IntervenableConfig(
representations=[
pv.RepresentationConfig(
layer=8, # Single layer instead of all
component="block_output",
)
]
)
# pyvene v0.1.8+ supports LoRAs as interventions
config = pv.RepresentationConfig(
intervention_type=pv.LoRAIntervention,
low_rank_dimension=16,
)
| Class | Purpose |
|---|---|
IntervenableModel | Main wrapper for interventions |
IntervenableConfig | Configuration container |
RepresentationConfig | Single intervention specification |
VanillaIntervention | Activation swapping |
RotatedSpaceIntervention | Trainable DAS intervention |
CollectIntervention | Activation collection |
pyvene works with any PyTorch model. Tested on:
For detailed API documentation, tutorials, and advanced usage, see the references/ folder:
| File | Contents |
|---|---|
| references/README.md | Overview and quick start guide |
| references/api.md | Complete API reference for IntervenableModel, intervention types, configurations |
| references/tutorials.md | Step-by-step tutorials for causal tracing, activation patching, DAS |
| Feature | pyvene | TransformerLens | nnsight |
|---|---|---|---|
| Declarative config | Yes | No | No |
| HuggingFace sharing | Yes | No | No |
| Trainable interventions | Yes | Limited | Yes |
| Any PyTorch model | Yes | Transformers only | Yes |
| Remote execution | No | No | Yes (NDIF) |
Weekly Installs
144
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass | Socket: Pass | Snyk: Warn
Installed on
claude-code119
opencode115
gemini-cli107
cursor106
codex95
antigravity94
超能力技能使用指南:AI助手技能调用优先级与工作流程详解
46,500 周安装
| DAS trainable intervention |
| Causal discovery |
CollectIntervention | Collect activations | Probing, analysis |
| Activation collection |