pytorch-lightning by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill pytorch-lightning
PyTorch Lightning organizes PyTorch code to eliminate boilerplate while maintaining flexibility.
Installation:

```bash
pip install lightning
```
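A quick check that the install succeeded and which version you got:

```python
import lightning as L
print(L.__version__)  # e.g., 2.x
```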
Convert PyTorch to Lightning (3 steps):

```python
import lightning as L
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

# Step 1: Define a LightningModule (organizes your PyTorch code)
class LitModel(L.LightningModule):
    def __init__(self, hidden_size=128):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(28 * 28, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 10)
        )

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = nn.functional.cross_entropy(y_hat, y)
        self.log('train_loss', loss)  # Auto-logged to TensorBoard
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

# Step 2: Create data (train_dataset is any torch Dataset; a stand-in is sketched below)
train_loader = DataLoader(train_dataset, batch_size=32)

# Step 3: Train with the Trainer (it handles everything else!)
trainer = L.Trainer(max_epochs=10, accelerator='gpu', devices=2)
model = LitModel()
trainer.fit(model, train_loader)
```
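The three-step example assumes an existing train_dataset. For a fully self-contained run, a minimal stand-in with random MNIST-shaped tensors (fake_images and fake_labels are illustrative names, not part of the original example):

```python
from torch.utils.data import TensorDataset

# Illustrative stand-in: 1,000 random flattened 28x28 "images" with 10 classes
fake_images = torch.randn(1000, 28 * 28)
fake_labels = torch.randint(0, 10, (1000,))
train_dataset = TensorDataset(fake_images, fake_labels)
train_loader = DataLoader(train_dataset, batch_size=32)
```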
That's it! The Trainer handles the epoch and batch loops, device placement, the backward pass, and optimizer steps; compare the raw PyTorch loop below.
Original PyTorch code:

```python
model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
model.to('cuda')

max_epochs = 10
for epoch in range(max_epochs):
    for batch in train_loader:
        batch = batch.to('cuda')
        optimizer.zero_grad()
        loss = model(batch)  # model returns the loss directly in this sketch
        loss.backward()
        optimizer.step()
```
Lightning version:

```python
class LitModel(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = MyModel()

    def training_step(self, batch, batch_idx):
        loss = self.model(batch)  # No .to('cuda') needed!
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

# Train
trainer = L.Trainer(max_epochs=10, accelerator='gpu')
trainer.fit(LitModel(), train_loader)
```
Benefits: 40+ lines → 15 lines, no device management, automatic distributed training.
```python
class LitModel(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = MyModel()

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = nn.functional.cross_entropy(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        val_loss = nn.functional.cross_entropy(y_hat, y)
        acc = (y_hat.argmax(dim=1) == y).float().mean()
        self.log('val_loss', val_loss)
        self.log('val_acc', acc)

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        test_loss = nn.functional.cross_entropy(y_hat, y)
        self.log('test_loss', test_loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

# Train with validation
model = LitModel()
trainer = L.Trainer(max_epochs=10)
trainer.fit(model, train_loader, val_loader)

# Test
trainer.test(model, test_loader)
```
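Checkpoints written by the Trainer restore the trained weights through load_from_checkpoint. A minimal sketch; the checkpoint path below is hypothetical (Lightning writes checkpoints under lightning_logs/ by default):

```python
# Reload a trained model from a Trainer checkpoint (path is hypothetical)
model = LitModel.load_from_checkpoint(
    "lightning_logs/version_0/checkpoints/epoch=9-step=1000.ckpt"
)
model.eval()
```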
Automatic features:

```python
# Same code as single GPU!
model = LitModel()

# 8 GPUs with DDP (automatic!)
trainer = L.Trainer(
    accelerator='gpu',
    devices=8,
    strategy='ddp'  # Or 'fsdp', 'deepspeed'
)
trainer.fit(model, train_loader)
```
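One caveat under DDP: self.log records per-process values by default, so metrics you want averaged across all GPUs should pass sync_dist=True. A sketch of a validation step with synced logging:

```python
def validation_step(self, batch, batch_idx):
    x, y = batch
    val_loss = nn.functional.cross_entropy(self.model(x), y)
    # sync_dist=True averages the value across DDP processes before logging
    self.log('val_loss', val_loss, sync_dist=True)
```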
Launch:

```bash
# Single command, Lightning handles the rest
python train.py
```
No changes needed: the same script scales out to multiple machines by adding num_nodes to the Trainer.

```python
# Multi-node: 2 nodes of 8 GPUs each, same training code
trainer = L.Trainer(accelerator='gpu', devices=8, strategy='ddp',
                    num_nodes=2)
```

Callbacks:

```python
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor

# Create callbacks
checkpoint = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=3,
    filename='model-{epoch:02d}-{val_loss:.2f}'
)
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# Add to Trainer
trainer = L.Trainer(
    max_epochs=100,
    callbacks=[checkpoint, early_stop, lr_monitor]
)
trainer.fit(model, train_loader, val_loader)
```
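After fitting, the ModelCheckpoint callback records where the best-scoring checkpoint landed, which pairs naturally with load_from_checkpoint:

```python
# Checkpoint with the lowest val_loss observed during training
print(checkpoint.best_model_path)
best_model = LitModel.load_from_checkpoint(checkpoint.best_model_path)
```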
Result: the three best checkpoints (by val_loss) are kept on disk, training stops early after 5 epochs without val_loss improvement, and the learning rate is logged each epoch.

Learning-rate schedulers:
```python
class LitModel(L.LightningModule):
    # ... (training_step, etc.)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        # Cosine annealing
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=100,
            eta_min=1e-5
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'epoch',  # Update once per epoch
                'frequency': 1
            }
        }

# Learning rate auto-logged!
trainer = L.Trainer(max_epochs=100)
trainer.fit(model, train_loader)
```
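Schedulers that should advance every optimizer step (warmup, one-cycle) set 'interval' to 'step'. A sketch using OneCycleLR, with the total step count taken from the Trainer's estimate:

```python
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=1e-3,
        total_steps=self.trainer.estimated_stepping_batches
    )
    # 'interval': 'step' ticks the schedule after every optimizer step
    return {
        'optimizer': optimizer,
        'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}
    }
```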
Use PyTorch Lightning when:

Key advantages:

Use alternatives instead:
Issue: Loss not decreasing

Check data and model setup:

```python
# Add to training_step
def training_step(self, batch, batch_idx):
    if batch_idx == 0:
        print(f"Batch shape: {batch[0].shape}")
        print(f"Labels: {batch[1]}")
    loss = ...
    return loss
```
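A complementary sanity check is the Trainer's overfit_batches flag, which trains repeatedly on the same batch; if the loss cannot approach zero even there, the problem is in the model or loss wiring rather than the data pipeline:

```python
# Overfit a single batch; training loss should near zero within a few epochs
trainer = L.Trainer(overfit_batches=1, max_epochs=50)
trainer.fit(model, train_loader)
```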
Issue: Out of memory

Reduce batch size or use gradient accumulation:

```python
trainer = L.Trainer(
    accumulate_grad_batches=4,  # Effective batch = batch_size × 4
    precision='bf16-mixed'      # Or '16-mixed'; roughly halves activation memory
)
```
Issue: Validation not running

Ensure you pass val_loader:

```python
# WRONG
trainer.fit(model, train_loader)

# CORRECT
trainer.fit(model, train_loader, val_loader)
```
Issue: DDP spawns multiple processes unexpectedly

Lightning auto-detects GPUs. Set devices explicitly:

```python
# Test on CPU first
trainer = L.Trainer(accelerator='cpu', devices=1)

# Then GPU
trainer = L.Trainer(accelerator='gpu', devices=1)
```
Callbacks: See references/callbacks.md for EarlyStopping, ModelCheckpoint, custom callbacks, and callback hooks.
Distributed strategies: See references/distributed.md for DDP, FSDP, DeepSpeed ZeRO integration, and multi-node setup.
Hyperparameter tuning: See references/hyperparameter-tuning.md for integration with Optuna, Ray Tune, and WandB sweeps.
Precision options: '32-true' (default), '16-mixed', 'bf16-mixed', and '64-true'; the mixed modes keep master weights in FP32 while running most operations in the lower precision.
Weekly Installs: 355
Repository: davila7/claude-code-templates
GitHub Stars: 23.4K
First Seen: Jan 21, 2026
Security Audits: Gen Agent Trust Hub (Pass), Socket (Pass), Snyk (Pass)
Installed on: claude-code (277), opencode (245), gemini-cli (229), codex (222), cursor (209), github-copilot (195)