nnsight-remote-interpretability by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill nnsight-remote-interpretability
nnsight (/ɛn.saɪt/) enables researchers to interpret and manipulate the internals of any PyTorch model, with the unique capability of running the same code locally on small models or remotely on massive models (70B+) via NDIF.
GitHub: ndif-team/nnsight (730+ stars)
Paper: NNsight and NDIF: Democratizing Access to Foundation Model Internals (ICLR 2025)
Write once, run anywhere: the same interpretability code works on GPT-2 locally or Llama-3.1-405B remotely. Just toggle remote=True.
# Local execution (small model)
with model.trace("Hello world"):
    hidden = model.transformer.h[5].output[0].save()

# Remote execution (massive model) - same code!
with model.trace("Hello world", remote=True):
    hidden = model.model.layers[40].output[0].save()
Use nnsight when you need to:
Consider alternatives when:
# Basic installation
pip install nnsight
# For vLLM support
pip install "nnsight[vllm]"
For remote NDIF execution, sign up at login.ndif.us for an API key.
from nnsight import LanguageModel
# Load model (uses HuggingFace under the hood)
model = LanguageModel("openai-community/gpt2", device_map="auto")
# For larger models
model = LanguageModel("meta-llama/Llama-3.1-8B", device_map="auto")
The trace context manager enables deferred execution - operations are collected into a computation graph:
from nnsight import LanguageModel

model = LanguageModel("gpt2", device_map="auto")

with model.trace("The Eiffel Tower is in") as tracer:
    # Access any module's output
    hidden_states = model.transformer.h[5].output[0].save()
    # Access attention patterns
    attn = model.transformer.h[5].attn.attn_dropout.input[0][0].save()
    # Modify activations
    model.transformer.h[8].output[0][:] = 0  # Zero out layer 8
    # Get final logits (model.output is the HF output object; .logits is the tensor)
    logits = model.output.logits.save()

# After the context exits, access saved values
print(hidden_states.shape)  # [batch, seq, hidden]
Inside a trace, module accesses return Proxy objects that record operations:
with model.trace("Hello"):
# These are all Proxy objects - operations are deferred
h5_out = model.transformer.h[5].output[0] # Proxy
h5_mean = h5_out.mean(dim=-1) # Proxy
h5_saved = h5_mean.save() # Save for later access
from nnsight import LanguageModel
import torch

model = LanguageModel("gpt2", device_map="auto")
prompt = "The capital of France is"

with model.trace(prompt) as tracer:
    # 1. Collect activations from multiple layers
    layer_outputs = []
    for i in range(12):  # GPT-2 has 12 layers
        layer_out = model.transformer.h[i].output[0].save()
        layer_outputs.append(layer_out)
    # 2. Get attention patterns
    attn_patterns = []
    for i in range(12):
        # Access attention weights (after softmax)
        attn = model.transformer.h[i].attn.attn_dropout.input[0][0].save()
        attn_patterns.append(attn)
    # 3. Get final logits
    logits = model.output.logits.save()

# 4. Analyze outside the context
for i, layer_out in enumerate(layer_outputs):
    print(f"Layer {i} output shape: {layer_out.shape}")
    print(f"Layer {i} norm: {layer_out.norm().item():.3f}")

# 5. Find top predictions
probs = torch.softmax(logits[0, -1], dim=-1)
top_tokens = probs.topk(5)
for token, prob in zip(top_tokens.indices, top_tokens.values):
    print(f"{model.tokenizer.decode(token)}: {prob.item():.3f}")
Key points: call .save() on every value you need after the context exits, then analyze with ordinary tensor operations (.shape, .norm(), etc.).

The same mechanics enable activation patching: replacing activations in a corrupted run with activations cached from a clean run.

from nnsight import LanguageModel
import torch

model = LanguageModel("gpt2", device_map="auto")

clean_prompt = "The Eiffel Tower is in"
corrupted_prompt = "The Colosseum is in"

# 1. Get clean activations
with model.trace(clean_prompt) as tracer:
    clean_hidden = model.transformer.h[8].output[0].save()

# 2. Patch clean into corrupted run
with model.trace(corrupted_prompt) as tracer:
    # Replace layer 8 output with clean activations
    model.transformer.h[8].output[0][:] = clean_hidden
    patched_logits = model.output.logits.save()

# 3. Compare predictions
paris_token = model.tokenizer.encode(" Paris")[0]
rome_token = model.tokenizer.encode(" Rome")[0]
patched_probs = torch.softmax(patched_logits[0, -1], dim=-1)
print(f"Paris prob: {patched_probs[paris_token].item():.3f}")
print(f"Rome prob: {patched_probs[rome_token].item():.3f}")
def patch_layer_position(layer, position, clean_cache, corrupted_prompt):
    """Patch a single layer/position from the clean run into the corrupted run."""
    with model.trace(corrupted_prompt) as tracer:
        # Patch only the specific position at this layer
        current = model.transformer.h[layer].output[0]
        current[:, position, :] = clean_cache[layer][:, position, :]
        logits = model.output.logits.save()
    return logits

# Cache clean activations for every layer first
with model.trace(clean_prompt):
    clean_cache = [model.transformer.h[i].output[0].save() for i in range(12)]

# Sweep over all layers and positions
seq_len = len(model.tokenizer.encode(corrupted_prompt))
results = torch.zeros(12, seq_len)
for layer in range(12):
    for pos in range(seq_len):
        logits = patch_layer_position(layer, pos, clean_cache, corrupted_prompt)
        results[layer, pos] = compute_metric(logits)
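The sweep leaves compute_metric undefined; define it before running. A minimal sketch of one common choice, the logit difference between the clean and corrupted answers (the token strings are carried over from the earlier example; any scalar metric works):

def compute_metric(logits):
    """Logit difference between ' Paris' (clean answer) and ' Rome' (corrupted answer)."""
    paris_token = model.tokenizer.encode(" Paris")[0]
    rome_token = model.tokenizer.encode(" Rome")[0]
    last = logits[0, -1]  # logits at the final token position
    return (last[paris_token] - last[rome_token]).item()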
Run the same experiments on massive models without local GPUs.
from nnsight import LanguageModel

# 1. Load large model (will run remotely)
model = LanguageModel("meta-llama/Llama-3.1-70B")

# 2. Same code, just add remote=True
with model.trace("The meaning of life is", remote=True) as tracer:
    # Access internals of a 70B model!
    layer_40_out = model.model.layers[40].output[0].save()
    logits = model.output.logits.save()

# 3. Results returned from NDIF
print(f"Layer 40 shape: {layer_40_out.shape}")

# 4. Generation with interventions
with model.generate("What is 2+2?", max_new_tokens=50, remote=True):
    # Intervene during generation
    model.model.layers[20].output[0][:, -1, :] *= 1.5
    output = model.generator.output.save()
import os
os.environ["NDIF_API_KEY"] = "your_key"
# Or configure directly
from nnsight import CONFIG
CONFIG.API_KEY = "your_key"
Share activations between different inputs in a single trace.
from nnsight import LanguageModel

model = LanguageModel("gpt2", device_map="auto")

with model.trace() as tracer:
    # First prompt
    with tracer.invoke("The cat sat on the"):
        cat_hidden = model.transformer.h[6].output[0].save()
    # Second prompt - inject cat's activations
    with tracer.invoke("The dog ran through the"):
        # Replace with cat's activations at layer 6
        model.transformer.h[6].output[0][:] = cat_hidden
        dog_with_cat = model.output.save()

# The dog prompt now has cat's internal representations
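To inspect the effect, decode the saved output after the trace. A short sketch, assuming dog_with_cat resolves to the standard HuggingFace output object with a .logits field:

import torch

# Top predictions for the injected run
probs = torch.softmax(dog_with_cat.logits[0, -1], dim=-1)
top = probs.topk(3)
for tok, p in zip(top.indices, top.values):
    print(f"{model.tokenizer.decode(tok)}: {p.item():.3f}")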
Access gradients during backward pass.
from nnsight import LanguageModel
import torch

model = LanguageModel("gpt2", device_map="auto")

with model.trace("The quick brown fox") as tracer:
    # Save the activation and its gradient
    hidden = model.transformer.h[5].output[0].save()
    grad = model.transformer.h[5].output[0].grad.save()
    logits = model.output.logits
    # Compute loss on a specific token
    target_token = model.tokenizer.encode(" jumps")[0]
    loss = -logits[0, -1, target_token]
    # Backward pass populates the saved gradient
    loss.backward()

print(f"Gradient shape: {grad.shape}")
print(f"Gradient norm: {grad.norm().item():.3f}")
Note: gradient access is not supported for vLLM or remote execution.
# GPT-2 structure
model.transformer.h[5].output[0]
# LLaMA structure
model.model.layers[5].output[0]
# Solution: Check model structure
print(model._model) # See actual module names
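To discover the right path for an unfamiliar architecture, walking the wrapped module tree also works; a small sketch using standard PyTorch named_modules (not an nnsight-specific API):

# List shallow module paths to spot layer containers like transformer.h or model.layers
for name, _ in model._model.named_modules():
    if name and name.count(".") <= 1:
        print(name)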
# WRONG: value not accessible outside the trace
with model.trace("Hello"):
    hidden = model.transformer.h[5].output[0]  # Not saved!
print(hidden)  # Error or wrong value

# RIGHT: call .save()
with model.trace("Hello"):
    hidden = model.transformer.h[5].output[0].save()
print(hidden)  # Works!
# For long-running remote jobs, increase the timeout
with model.trace("prompt", remote=True, timeout=300) as tracer:
    ...  # long operation
# Only save what you need
with model.trace("prompt"):
    # Don't save everything
    for i in range(12):
        model.transformer.h[i].output[0].save()  # Memory heavy!

# Better: save specific layers
with model.trace("prompt"):
    key_layers = [0, 5, 11]
    for i in key_layers:
        model.transformer.h[i].output[0].save()
# vLLM doesn't support gradients
# Use standard execution for gradient analysis
model = LanguageModel("gpt2", device_map="auto") # Not vLLM
| Method/Property | Purpose |
|---|---|
| model.trace(prompt, remote=False) | Start a tracing context |
| proxy.save() | Save a value for access after the trace |
| proxy[:] | Slice/index a proxy (assignment patches) |
| tracer.invoke(prompt) | Add a prompt within a trace |
| model.generate(...) | Generate with interventions |
| model.output | Final model output (use .logits for token logits) |
| model._model | Underlying HuggingFace model |
| Feature | nnsight | TransformerLens | pyvene |
|---|---|---|---|
| Any architecture | Yes | Transformers only | Yes |
| Remote execution | Yes (NDIF) | No | No |
| Consistent API | No | Yes | Yes |
| Deferred execution | Yes | No | No |
| HuggingFace native | Yes | Reimplemented | Yes |
| Shareable configs | No | No | Yes |
For detailed API documentation, tutorials, and advanced usage, see the references/ folder:
| File | Contents |
|---|---|
| references/README.md | Overview and quick start guide |
| references/api.md | Complete API reference for LanguageModel, tracing, proxy objects |
| references/tutorials.md | Step-by-step tutorials for local and remote interpretability |
nnsight works with any PyTorch model; the key is knowing the module structure so you can access the right components.
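For non-language models, nnsight's generic wrapper applies the same trace/proxy mechanics; a toy sketch, assuming the NNsight wrapper class exposed by the package (the module and its names here are purely illustrative):

import torch
from nnsight import NNsight

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(8, 16)
        self.fc2 = torch.nn.Linear(16, 2)
    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

model = NNsight(Tiny())
with model.trace(torch.randn(1, 8)):
    h = model.fc1.output.save()  # same proxy mechanics as LanguageModel
print(h.shape)  # torch.Size([1, 16])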