knowledge-distillation by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill knowledge-distillation在以下情况下使用知识蒸馏:
关键技术:温度缩放、软目标、反向 KLD (MiniLLM)、逻辑值蒸馏、响应蒸馏
论文:Hinton 等人 2015 (arXiv 1503.02531)、MiniLLM (arXiv 2306.08543)、KD 综述 (arXiv 2402.13116)
# 标准 transformers
pip install transformers datasets accelerate
# 用于训练
pip install torch deepspeed wandb
# 可选:MiniLLM 实现
git clone https://github.com/microsoft/LMOps
cd LMOps/minillm
pip install -e .
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# 1. Load the teacher (large) and student (small) models.
teacher = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",  # large teacher
    torch_dtype=torch.float16,  # halves memory; fine for an inference-only teacher
    device_map="auto"  # shard the 70B model across all visible devices
)
student = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # small student
    torch_dtype=torch.float16,
    device_map="cuda:0"  # keep the trainable student on a single device
)
# Tokenizer is taken from the teacher checkpoint. Logit-level distillation
# assumes the student shares the same vocabulary — true for Llama-2 pairs;
# verify if you swap in different model families.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf")
# 2. 定义蒸馏损失
def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
    """Combine hard loss (cross-entropy) with soft loss (KL divergence).

    Args:
        student_logits: (batch, seq_len, vocab) raw student logits.
        teacher_logits: (batch, seq_len, vocab) raw teacher logits.
        labels: (batch, seq_len) token ids; positions equal to -100 are
            ignored by the hard loss (F.cross_entropy default ignore_index).
        temperature: softens both distributions (higher = softer).
        alpha: weight of the soft (distillation) loss; (1 - alpha) weighs
            the hard loss.

    Returns:
        Scalar tensor: alpha * soft_loss + (1 - alpha) * hard_loss.
    """
    vocab_size = student_logits.size(-1)
    flat_student = student_logits.view(-1, vocab_size)
    flat_teacher = teacher_logits.view(-1, vocab_size)

    # Hard loss: standard cross-entropy against the true labels (per-token mean).
    hard_loss = F.cross_entropy(flat_student, labels.view(-1))

    # Soft loss: KL(teacher || student) at temperature T.
    # Flattening to (batch*seq, vocab) BEFORE 'batchmean' makes this a
    # per-token mean, matching the hard loss. Applying 'batchmean' to the
    # 3-D tensors would divide by batch size only, silently inflating the
    # soft term by a factor of seq_len relative to the hard term.
    soft_targets = F.softmax(flat_teacher / temperature, dim=-1)
    soft_student = F.log_softmax(flat_student / temperature, dim=-1)
    # T^2 scaling keeps gradient magnitudes comparable across temperatures
    # (Hinton et al. 2015).
    soft_loss = F.kl_div(soft_student, soft_targets, reduction='batchmean') * (temperature ** 2)

    return alpha * soft_loss + (1 - alpha) * hard_loss
# 3. 训练循环
# 3. Distillation training loop.
for batch in dataloader:
    # Teacher forward pass — inference only, no gradients.
    with torch.no_grad():
        teacher_logits = teacher(**batch).logits
    # Student forward pass (gradients enabled).
    student_logits = student(**batch).logits
    # Blend soft (teacher) and hard (label) objectives: 70% soft, 30% hard.
    loss = distillation_loss(
        student_logits,
        teacher_logits,
        batch['labels'],
        temperature=2.0,
        alpha=0.7,
    )
    # Backward pass and optimizer update.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
来源:arXiv 2306.08543 (2024)
创新点:使用反向 KLD 而非前向 KLD,以获得更好的生成模型蒸馏效果。
def reverse_kl_loss(student_logits, teacher_logits, temperature=1.0):
    """Reverse KL divergence KL(student || teacher), as used by MiniLLM.

    Reverse KL is zero-forcing / mode-seeking: the student concentrates on
    the teacher's dominant modes rather than spreading probability mass over
    the whole distribution, which works better for generative models.

    Note: the previous version returned -(p_teacher * log q_student), i.e.
    cross-entropy, whose gradient matches *forward* KL — not reverse KL as
    the name and the MiniLLM reference claim.

    Args:
        student_logits: (..., vocab) raw student logits.
        teacher_logits: (..., vocab) raw teacher logits, same shape.
        temperature: softmax temperature; the result is scaled by T^2.

    Returns:
        Scalar tensor: mean over positions of KL(q_student || p_teacher).
    """
    log_p_teacher = F.log_softmax(teacher_logits / temperature, dim=-1)
    log_q_student = F.log_softmax(student_logits / temperature, dim=-1)
    q_student = log_q_student.exp()
    # KL(q || p) = sum_i q_i * (log q_i - log p_i), averaged over positions.
    reverse_kl = (q_student * (log_q_student - log_p_teacher)).sum(dim=-1).mean()
    # T^2 keeps gradient magnitude comparable across temperatures (Hinton 2015).
    return reverse_kl * (temperature ** 2)
# 使用 MiniLLM 训练
# MiniLLM-style training loop.
for batch in dataloader:
    # Teacher logits — inference only.
    with torch.no_grad():
        teacher_logits = teacher(**batch).logits
    student_logits = student(**batch).logits
    # Reverse KLD — better suited to generative tasks than forward KLD.
    loss = reverse_kl_loss(student_logits, teacher_logits, temperature=1.0)
    loss.backward()
    optimizer.step()
为什么使用反向 KL?
# Response-based distillation: generate synthetic data from the teacher,
# then fine-tune the student to imitate it.
# 1. Sample responses from the teacher.
prompts = ["解释人工智能:", "什么是机器学习?", "定义自然语言处理:"]
teacher_responses = []
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors='pt').to(teacher.device)
    outputs = teacher.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
    teacher_responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# 2. Pair each prompt with its sampled response as plain-text training data.
train_dataset = [
    {"text": f"{prompt}\n{response}"}
    for prompt, response in zip(prompts, teacher_responses)
]

# 3. Standard fine-tuning of the student on the teacher's outputs.
trainer = Trainer(
    model=student,
    args=TrainingArguments(output_dir="./student", num_train_epochs=3, learning_rate=2e-5),
    train_dataset=train_dataset,
)
trainer.train()
目的:软化概率分布以暴露教师的不确定性。
# 低温 (T=1):尖锐分布
logits = [3.0, 2.0, 1.0]
probs_T1 = softmax(logits / 1.0) # [0.67, 0.24, 0.09]
# 高温 (T=4):平滑分布
probs_T4 = softmax(logits / 4.0) # [0.42, 0.34, 0.24]
# 更高的 T 揭示了更多关于相对排名的信息
规则:蒸馏时使用 T=2-5(2 是常见默认值)。
# 总损失 = alpha * soft_loss + (1 - alpha) * hard_loss
# 软损失:从教师知识中学习
soft_loss = KL(教师 || 学生)  # 即 F.kl_div(log_softmax(学生), softmax(教师)) 优化的方向
# 硬损失:从真实标签中学习
hard_loss = CrossEntropy(学生输出, 真实标签)
# 典型值:
alpha = 0.5 # 平衡
alpha = 0.7 # 更强调教师
alpha = 0.3 # 更强调标签
# 前向 KL:KL(教师 || 学生)(标准蒸馏中 F.kl_div 优化的方向)
# - 零回避/模式覆盖:学生必须在教师所有高概率区域都分配概率
# - 学生匹配教师的平均行为
# - 适用于分类任务
# 反向 KL:KL(学生 || 教师)
# - 零强制/模式寻求:学生集中于教师的主要模式,避免教师低概率区域
# - 防止学生高估教师分布的长尾
# - 适用于生成任务 (MiniLLM)
# 训练学生直接匹配教师的逻辑值
def logit_distillation_trainer(student, teacher, dataloader, temperature=2.0):
    """Fit the student to reproduce the teacher's raw logits.

    Runs three epochs of MSE-on-logits training and returns the (mutated)
    student. `temperature` is unused by the MSE objective; it is kept for
    the commented-out KLD variant below.
    """
    opt = torch.optim.AdamW(student.parameters(), lr=2e-5)
    for _ in range(3):
        for batch in dataloader:
            # Teacher logits are targets only — no gradients needed.
            with torch.no_grad():
                ref_logits = teacher(**batch).logits
            pred_logits = student(**batch).logits
            # MSE directly on logits: a simple alternative to KLD.
            loss = F.mse_loss(pred_logits, ref_logits)
            # KLD variant:
            # loss = F.kl_div(
            #     F.log_softmax(pred_logits / temperature, dim=-1),
            #     F.softmax(ref_logits / temperature, dim=-1),
            #     reduction='batchmean',
            # ) * (temperature ** 2)
            loss.backward()
            opt.step()
            opt.zero_grad()
    return student
# 阶段 1:从教师蒸馏
student = distill(teacher, student, epochs=5)
# 阶段 2:在任务特定数据上微调
student = fine_tune(student, task_data, epochs=3)
# 结果比单阶段获得更好的任务性能
# 从多个专家教师学习
def multi_teacher_distillation(student, teachers, batch):
    """Distill one student from an ensemble of teachers.

    Teacher logits are averaged, and the student is pushed toward the
    averaged distribution with KL divergence. Returns the loss tensor.
    """
    # Collect logits from every teacher without tracking gradients.
    with torch.no_grad():
        stacked = torch.stack([expert(**batch).logits for expert in teachers])
    ensemble_logits = stacked.mean(dim=0)
    # Student learns from the ensemble average.
    student_logits = student(**batch).logits
    return F.kl_div(
        F.log_softmax(student_logits, dim=-1),
        F.softmax(ensemble_logits, dim=-1),
        reduction='batchmean',
    )
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
def train_distilled_model(
    teacher_name="meta-llama/Llama-2-70b-hf",
    student_name="meta-llama/Llama-2-7b-hf",
    output_dir="./distilled-llama-7b",
    temperature=2.0,
    alpha=0.7,
    train_dataset=None,
):
    """Distill `teacher_name` into `student_name` with a soft+hard loss.

    Args:
        teacher_name: HF hub id of the (large) teacher model.
        student_name: HF hub id of the (small) student model.
        output_dir: where checkpoints and the final student are written.
        temperature: softmax temperature for the soft targets.
        alpha: weight of the soft loss; (1 - alpha) weighs the hard loss.
        train_dataset: tokenized dataset for the Trainer. The original
            version read an undefined global `train_dataset`; it is now an
            explicit, required argument.

    Returns:
        None. Saves the distilled student and tokenizer to `output_dir`.
    """
    if train_dataset is None:
        raise ValueError("train_dataset is required")

    # Load models: teacher sharded across devices, student trainable.
    teacher = AutoModelForCausalLM.from_pretrained(teacher_name, torch_dtype=torch.float16, device_map="auto")
    student = AutoModelForCausalLM.from_pretrained(student_name, torch_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(teacher_name)

    class DistillationTrainer(Trainer):
        """Trainer whose loss blends teacher KL with the LM cross-entropy."""

        # **kwargs absorbs extra arguments (e.g. `num_items_in_batch` passed
        # by transformers >= 4.46) so this override stays forward-compatible.
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Student forward pass; `outputs_student.loss` is the hard CE loss.
            outputs_student = model(**inputs)
            student_logits = outputs_student.logits
            # Teacher forward pass — inference only.
            with torch.no_grad():
                teacher_logits = teacher(**inputs).logits
            # Soft loss: KL(teacher || student) at temperature T, scaled by
            # T^2 to keep gradients T-independent (Hinton et al. 2015).
            soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
            soft_student = F.log_softmax(student_logits / temperature, dim=-1)
            soft_loss = F.kl_div(soft_student, soft_targets, reduction='batchmean') * (temperature ** 2)
            # Hard loss: the model's own causal-LM cross-entropy.
            hard_loss = outputs_student.loss
            loss = alpha * soft_loss + (1 - alpha) * hard_loss
            return (loss, outputs_student) if return_outputs else loss

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=2e-5,
        warmup_steps=500,
        logging_steps=100,
        save_steps=1000,
        bf16=True,
        gradient_checkpointing=True,  # trade compute for memory on the 7B student
    )

    trainer = DistillationTrainer(
        model=student,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    student.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
# 用法
train_distilled_model(
teacher_name="meta-llama/Llama-2-70b-hf",
student_name="meta-llama/Llama-2-7b-hf",
temperature=2.0,
alpha=0.7
)
# 温度
T = 1.0 # 尖锐(知识转移较少)
T = 2.0 # 标准(良好平衡)
T = 5.0 # 平滑(知识转移更多)
# Alpha(权重)
alpha = 0.5 # 平衡
alpha = 0.7 # 强调教师知识
alpha = 0.9 # 强蒸馏
# 规则:更高的 T + 更高的 alpha = 更强的蒸馏
# 良好比例(教师/学生)
70B / 7B = 10× # 优秀
13B / 1B = 13× # 良好
7B / 1B = 7× # 可接受
# 避免差距过大
70B / 1B = 70× # 过大,效果不佳
# 最佳:使用教师生成的数据 + 真实数据
train_data = {
"teacher_generated": 70%, # 多样化、高质量
"real_data": 30% # 真实标签
}
# 避免:仅使用真实数据(未充分利用教师)
from transformers import pipeline
# 比较学生与教师
teacher_pipe = pipeline("text-generation", model=teacher)
student_pipe = pipeline("text-generation", model=student)
prompts = ["解释量子计算:", "什么是人工智能?"]
for prompt in prompts:
teacher_out = teacher_pipe(prompt, max_new_tokens=100)
student_out = student_pipe(prompt, max_new_tokens=100)
print(f"提示:{prompt}")
print(f"教师:{teacher_out[0]['generated_text']}")
print(f"学生:{student_out[0]['generated_text']}")
print(f"匹配质量:{calculate_similarity(teacher_out, student_out):.2f}")
每周安装次数
255
代码仓库
GitHub 星标数
23.5K
首次出现
2026 年 1 月 21 日
安全审计
安装于
opencode210
gemini-cli196
claude-code185
codex182
cursor181
github-copilot174
Use Knowledge Distillation when you need to:
Key Techniques : Temperature scaling, soft targets, reverse KLD (MiniLLM), logit distillation, response distillation
Papers : Hinton et al. 2015 (arXiv 1503.02531), MiniLLM (arXiv 2306.08543), KD Survey (arXiv 2402.13116)
# Standard transformers
pip install transformers datasets accelerate
# For training
pip install torch deepspeed wandb
# Optional: MiniLLM implementation
git clone https://github.com/microsoft/LMOps
cd LMOps/minillm
pip install -e .
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# 1. Load the teacher (large) and student (small) models.
teacher = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",  # large teacher
    torch_dtype=torch.float16,  # halves memory; fine for an inference-only teacher
    device_map="auto"  # shard the 70B model across all visible devices
)
student = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # small student
    torch_dtype=torch.float16,
    device_map="cuda:0"  # keep the trainable student on a single device
)
# Tokenizer is taken from the teacher checkpoint. Logit-level distillation
# assumes the student shares the same vocabulary — true for Llama-2 pairs;
# verify if you swap in different model families.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf")
# 2. Define distillation loss
def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
    """Combine hard loss (cross-entropy) with soft loss (KL divergence).

    Args:
        student_logits: (batch, seq_len, vocab) raw student logits.
        teacher_logits: (batch, seq_len, vocab) raw teacher logits.
        labels: (batch, seq_len) token ids; positions equal to -100 are
            ignored by the hard loss (F.cross_entropy default ignore_index).
        temperature: softens both distributions (higher = softer).
        alpha: weight of the soft (distillation) loss; (1 - alpha) weighs
            the hard loss.

    Returns:
        Scalar tensor: alpha * soft_loss + (1 - alpha) * hard_loss.
    """
    vocab_size = student_logits.size(-1)
    flat_student = student_logits.view(-1, vocab_size)
    flat_teacher = teacher_logits.view(-1, vocab_size)

    # Hard loss: standard cross-entropy against the true labels (per-token mean).
    hard_loss = F.cross_entropy(flat_student, labels.view(-1))

    # Soft loss: KL(teacher || student) at temperature T.
    # Flattening to (batch*seq, vocab) BEFORE 'batchmean' makes this a
    # per-token mean, matching the hard loss. Applying 'batchmean' to the
    # 3-D tensors would divide by batch size only, silently inflating the
    # soft term by a factor of seq_len relative to the hard term.
    soft_targets = F.softmax(flat_teacher / temperature, dim=-1)
    soft_student = F.log_softmax(flat_student / temperature, dim=-1)
    # T^2 scaling keeps gradient magnitudes comparable across temperatures
    # (Hinton et al. 2015).
    soft_loss = F.kl_div(soft_student, soft_targets, reduction='batchmean') * (temperature ** 2)

    return alpha * soft_loss + (1 - alpha) * hard_loss
# 3. Training loop
# 3. Distillation training loop.
for batch in dataloader:
    # Teacher forward pass — inference only, no gradients.
    with torch.no_grad():
        teacher_logits = teacher(**batch).logits
    # Student forward pass (gradients enabled).
    student_logits = student(**batch).logits
    # Blend soft (teacher) and hard (label) objectives: 70% soft, 30% hard.
    loss = distillation_loss(
        student_logits,
        teacher_logits,
        batch['labels'],
        temperature=2.0,
        alpha=0.7,
    )
    # Backward pass and optimizer update.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
Source : arXiv 2306.08543 (2024)
Innovation : Use reverse KLD instead of forward KLD for better generative model distillation.
def reverse_kl_loss(student_logits, teacher_logits, temperature=1.0):
    """Reverse KL divergence KL(student || teacher), as used by MiniLLM.

    Reverse KL is zero-forcing / mode-seeking: the student concentrates on
    the teacher's dominant modes rather than spreading probability mass over
    the whole distribution, which works better for generative models.

    Note: the previous version returned -(p_teacher * log q_student), i.e.
    cross-entropy, whose gradient matches *forward* KL — not reverse KL as
    the name and the MiniLLM reference claim.

    Args:
        student_logits: (..., vocab) raw student logits.
        teacher_logits: (..., vocab) raw teacher logits, same shape.
        temperature: softmax temperature; the result is scaled by T^2.

    Returns:
        Scalar tensor: mean over positions of KL(q_student || p_teacher).
    """
    log_p_teacher = F.log_softmax(teacher_logits / temperature, dim=-1)
    log_q_student = F.log_softmax(student_logits / temperature, dim=-1)
    q_student = log_q_student.exp()
    # KL(q || p) = sum_i q_i * (log q_i - log p_i), averaged over positions.
    reverse_kl = (q_student * (log_q_student - log_p_teacher)).sum(dim=-1).mean()
    # T^2 keeps gradient magnitude comparable across temperatures (Hinton 2015).
    return reverse_kl * (temperature ** 2)
# Training with MiniLLM
# MiniLLM-style training loop.
for batch in dataloader:
    # Teacher logits — inference only.
    with torch.no_grad():
        teacher_logits = teacher(**batch).logits
    student_logits = student(**batch).logits
    # Reverse KLD — better suited to generative tasks than forward KLD.
    loss = reverse_kl_loss(student_logits, teacher_logits, temperature=1.0)
    loss.backward()
    optimizer.step()
Why reverse KL?
# Response-based distillation: generate synthetic data from the teacher,
# then fine-tune the student to imitate it.
# 1. Sample responses from the teacher.
prompts = ["Explain AI:", "What is ML?", "Define NLP:"]
teacher_responses = []
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors='pt').to(teacher.device)
    outputs = teacher.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
    teacher_responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# 2. Pair each prompt with its sampled response as plain-text training data.
train_dataset = [
    {"text": f"{prompt}\n{response}"}
    for prompt, response in zip(prompts, teacher_responses)
]

# 3. Standard fine-tuning of the student on the teacher's outputs.
trainer = Trainer(
    model=student,
    args=TrainingArguments(output_dir="./student", num_train_epochs=3, learning_rate=2e-5),
    train_dataset=train_dataset,
)
trainer.train()
Purpose : Soften probability distributions to expose teacher's uncertainty.
# Low temperature (T=1): Sharp distribution
logits = [3.0, 2.0, 1.0]
probs_T1 = softmax(logits / 1.0) # [0.67, 0.24, 0.09]
# High temperature (T=4): Soft distribution
probs_T4 = softmax(logits / 4.0) # [0.42, 0.34, 0.24]
# Higher T reveals more information about relative rankings
Rule : Use T=2-5 for distillation (2 is common default).
# Total loss = alpha * soft_loss + (1 - alpha) * hard_loss
# Soft loss: Learn from teacher's knowledge
soft_loss = KL(teacher || student)  # the direction F.kl_div(log_softmax(student), softmax(teacher)) optimizes
# Hard loss: Learn from ground truth labels
hard_loss = CrossEntropy(student_output, true_labels)
# Typical values:
alpha = 0.5 # Balanced
alpha = 0.7 # More emphasis on teacher
alpha = 0.3 # More emphasis on labels
# Forward KL: KL(Teacher || Student) — the direction the standard
# distillation F.kl_div call optimizes
# - Zero-avoiding / mode-covering: the student must put mass wherever the teacher does
# - Student matches the teacher's average behavior
# - Good for classification
# Reverse KL: KL(Student || Teacher)
# - Zero-forcing / mode-seeking: the student concentrates on the teacher's
#   dominant modes and avoids the teacher's low-probability regions
# - Good for generation (MiniLLM)
# Train student to match teacher's logits directly
def logit_distillation_trainer(student, teacher, dataloader, temperature=2.0):
    """Fit the student to reproduce the teacher's raw logits.

    Runs three epochs of MSE-on-logits training and returns the (mutated)
    student. `temperature` is unused by the MSE objective; it is kept for
    the commented-out KLD variant below.
    """
    opt = torch.optim.AdamW(student.parameters(), lr=2e-5)
    for _ in range(3):
        for batch in dataloader:
            # Teacher logits are targets only — no gradients needed.
            with torch.no_grad():
                ref_logits = teacher(**batch).logits
            pred_logits = student(**batch).logits
            # MSE directly on logits: a simple alternative to KLD.
            loss = F.mse_loss(pred_logits, ref_logits)
            # KLD variant:
            # loss = F.kl_div(
            #     F.log_softmax(pred_logits / temperature, dim=-1),
            #     F.softmax(ref_logits / temperature, dim=-1),
            #     reduction='batchmean',
            # ) * (temperature ** 2)
            loss.backward()
            opt.step()
            opt.zero_grad()
    return student
# Stage 1: Distill from teacher
student = distill(teacher, student, epochs=5)
# Stage 2: Fine-tune on task-specific data
student = fine_tune(student, task_data, epochs=3)
# Results in better task performance than single-stage
# Learn from multiple expert teachers
def multi_teacher_distillation(student, teachers, batch):
    """Distill one student from an ensemble of teachers.

    Teacher logits are averaged, and the student is pushed toward the
    averaged distribution with KL divergence. Returns the loss tensor.
    """
    # Collect logits from every teacher without tracking gradients.
    with torch.no_grad():
        stacked = torch.stack([expert(**batch).logits for expert in teachers])
    ensemble_logits = stacked.mean(dim=0)
    # Student learns from the ensemble average.
    student_logits = student(**batch).logits
    return F.kl_div(
        F.log_softmax(student_logits, dim=-1),
        F.softmax(ensemble_logits, dim=-1),
        reduction='batchmean',
    )
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
def train_distilled_model(
    teacher_name="meta-llama/Llama-2-70b-hf",
    student_name="meta-llama/Llama-2-7b-hf",
    output_dir="./distilled-llama-7b",
    temperature=2.0,
    alpha=0.7,
    train_dataset=None,
):
    """Distill `teacher_name` into `student_name` with a soft+hard loss.

    Args:
        teacher_name: HF hub id of the (large) teacher model.
        student_name: HF hub id of the (small) student model.
        output_dir: where checkpoints and the final student are written.
        temperature: softmax temperature for the soft targets.
        alpha: weight of the soft loss; (1 - alpha) weighs the hard loss.
        train_dataset: tokenized dataset for the Trainer. The original
            version read an undefined global `train_dataset`; it is now an
            explicit, required argument.

    Returns:
        None. Saves the distilled student and tokenizer to `output_dir`.
    """
    if train_dataset is None:
        raise ValueError("train_dataset is required")

    # Load models: teacher sharded across devices, student trainable.
    teacher = AutoModelForCausalLM.from_pretrained(teacher_name, torch_dtype=torch.float16, device_map="auto")
    student = AutoModelForCausalLM.from_pretrained(student_name, torch_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(teacher_name)

    class DistillationTrainer(Trainer):
        """Trainer whose loss blends teacher KL with the LM cross-entropy."""

        # **kwargs absorbs extra arguments (e.g. `num_items_in_batch` passed
        # by transformers >= 4.46) so this override stays forward-compatible.
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Student forward pass; `outputs_student.loss` is the hard CE loss.
            outputs_student = model(**inputs)
            student_logits = outputs_student.logits
            # Teacher forward pass — inference only.
            with torch.no_grad():
                teacher_logits = teacher(**inputs).logits
            # Soft loss: KL(teacher || student) at temperature T, scaled by
            # T^2 to keep gradients T-independent (Hinton et al. 2015).
            soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
            soft_student = F.log_softmax(student_logits / temperature, dim=-1)
            soft_loss = F.kl_div(soft_student, soft_targets, reduction='batchmean') * (temperature ** 2)
            # Hard loss: the model's own causal-LM cross-entropy.
            hard_loss = outputs_student.loss
            loss = alpha * soft_loss + (1 - alpha) * hard_loss
            return (loss, outputs_student) if return_outputs else loss

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=2e-5,
        warmup_steps=500,
        logging_steps=100,
        save_steps=1000,
        bf16=True,
        gradient_checkpointing=True,  # trade compute for memory on the 7B student
    )

    trainer = DistillationTrainer(
        model=student,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    student.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
# Usage
train_distilled_model(
teacher_name="meta-llama/Llama-2-70b-hf",
student_name="meta-llama/Llama-2-7b-hf",
temperature=2.0,
alpha=0.7
)
# Temperature
T = 1.0 # Sharp (less knowledge transfer)
T = 2.0 # Standard (good balance)
T = 5.0 # Soft (more knowledge transfer)
# Alpha (weight)
alpha = 0.5 # Balanced
alpha = 0.7 # Emphasize teacher knowledge
alpha = 0.9 # Strong distillation
# Rule: Higher T + higher alpha = stronger distillation
# Good ratios (teacher/student)
70B / 7B = 10× # Excellent
13B / 1B = 13× # Good
7B / 1B = 7× # Acceptable
# Avoid too large gap
70B / 1B = 70× # Too large, ineffective
# Best: Use teacher-generated data + real data
train_data = {
"teacher_generated": 70%, # Diverse, high-quality
"real_data": 30% # Ground truth
}
# Avoid: Only real data (doesn't utilize teacher fully)
from transformers import pipeline
# Compare student vs teacher
teacher_pipe = pipeline("text-generation", model=teacher)
student_pipe = pipeline("text-generation", model=student)
prompts = ["Explain quantum computing:", "What is AI?"]
for prompt in prompts:
teacher_out = teacher_pipe(prompt, max_new_tokens=100)
student_out = student_pipe(prompt, max_new_tokens=100)
print(f"Prompt: {prompt}")
print(f"Teacher: {teacher_out[0]['generated_text']}")
print(f"Student: {student_out[0]['generated_text']}")
print(f"Match quality: {calculate_similarity(teacher_out, student_out):.2f}")
Weekly Installs
255
Repository
GitHub Stars
23.5K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust HubPassSocketPassSnykPass
Installed on
opencode210
gemini-cli196
claude-code185
codex182
cursor181
github-copilot174
超能力技能使用指南:AI助手技能调用优先级与工作流程详解
41,800 周安装
Nx Import 使用指南:从源仓库导入代码并保留Git历史
250 周安装
OpenPencil CLI 工具:.fig 设计文件命令行操作与 MCP 服务器 | 设计自动化
250 周安装
学术深度研究技能:AI驱动的学术文献综述与多源验证工具,生成APA格式报告
250 周安装
React PDF 渲染器 - 使用 JSON 生成 PDF 文档,支持自定义组件和流式渲染
250 周安装
后端安全编码专家 | 安全开发实践、漏洞预防与防御性编程技术指南
250 周安装
TanStack Form:高性能无头表单库,支持TypeScript、Zod、Valibot验证
250 周安装