long-context by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill long-context
在以下情况下使用长上下文技术:
关键技术:RoPE(旋转位置编码)、YaRN、ALiBi(带线性偏置的注意力)、位置插值
论文:RoFormer (arXiv 2104.09864)、YaRN (arXiv 2309.00071)、ALiBi (arXiv 2108.12409)、位置插值 (arXiv 2306.15595)
# HuggingFace Transformers (包含 RoPE、YaRN 支持)
pip install transformers torch
# 用于自定义实现
pip install einops # 张量操作
pip install rotary-embedding-torch # 独立的 RoPE
# 可选:用于提高效率的 FlashAttention
pip install flash-attn --no-build-isolation
import torch
import torch.nn as nn
class RotaryEmbedding(nn.Module):
    """Rotary Position Embeddings (RoPE).

    Precomputes the per-dimension inverse frequencies once; each forward
    call builds the (cos, sin) tables used to rotate query/key vectors.
    """

    def __init__(self, dim, max_seq_len=8192, base=10000):
        super().__init__()
        # theta_j = base^(-2j/dim) for j in [0, dim/2)
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (base ** exponents))
        # Advertised context limit; not enforced by forward() itself.
        self.max_seq_len = max_seq_len

    def forward(self, seq_len, device):
        """Return (cos, sin) tables, each of shape (seq_len, dim)."""
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        # angle[m, j] = m * theta_j  -> (seq_len, dim/2)
        angles = torch.outer(positions, self.inv_freq)
        # Duplicate so both halves of the head dim share the same angles.
        table = torch.cat((angles, angles), dim=-1)
        return table.cos(), table.sin()
def rotate_half(x):
    """Rotate half the hidden dimensions.

    Maps (..., [a, b]) to (..., [-b, a]) along the last axis — the
    90-degree pairwise rotation used when applying RoPE.
    """
    head, tail = x.chunk(2, dim=-1)
    return torch.cat([tail.neg(), head], dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin):
    """Apply rotary embeddings to queries and keys.

    q, k: (batch, heads, seq_len, dim); the (cos, sin) tables broadcast
    over the leading dimensions. Returns the rotated (q, k) pair.
    """
    def _rot(t):
        # Inline rotate-half: (..., [a, b]) -> (..., [-b, a]).
        a, b = t.chunk(2, dim=-1)
        return torch.cat((-b, a), dim=-1)

    return (q * cos) + (_rot(q) * sin), (k * cos) + (_rot(k) * sin)
# 用法
rope = RotaryEmbedding(dim=64, max_seq_len=8192)
cos, sin = rope(seq_len=2048, device='cuda')
# 在注意力层中
q_rotated, k_rotated = apply_rotary_pos_emb(query, key, cos, sin)
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
def get_alibi_slopes(num_heads):
    """Get ALiBi slope values for each attention head.

    Power-of-two head counts use the geometric sequence from the ALiBi
    paper (first slope 2^(-8/n), ratio equal to it); other counts take
    the slopes of the closest smaller power of two and interleave every
    other slope from the doubled head count.
    """
    import math  # fix: `math` was used but never imported in this file

    def get_slopes_power_of_2(n):
        # start = 2^(-(2^-(log2(n)-3))) == 2^(-8/n)
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        ratio = start
        return [start * (ratio ** i) for i in range(n)]

    if math.log2(num_heads).is_integer():
        return get_slopes_power_of_2(num_heads)
    # Non-power-of-2 head count: closest smaller power of two first ...
    closest_power = 2 ** math.floor(math.log2(num_heads))
    slopes = get_slopes_power_of_2(closest_power)
    # ... then fill with every other slope from the doubled head count.
    extra = get_slopes_power_of_2(2 * closest_power)
    slopes.extend(extra[0::2][:num_heads - closest_power])
    return slopes
def create_alibi_bias(seq_len, num_heads):
    """Create ALiBi attention bias.

    Returns a (num_heads, seq_len, seq_len) tensor where entry [h, i, j]
    equals slope_h * (j - i). For causal positions (j <= i) this equals
    the paper's -slope * |i - j| penalty; entries with j > i come out
    positive and are presumably removed by a causal mask applied by the
    caller — NOTE(review): confirm a causal mask is used downstream.
    """
    # Signed distance matrix: relative_position[i, j] = j - i.
    context_position = torch.arange(seq_len)
    memory_position = torch.arange(seq_len)
    relative_position = memory_position[None, :] - context_position[:, None]
    # One slope per head (requires get_alibi_slopes defined in this file).
    slopes = torch.tensor(get_alibi_slopes(num_heads))
    # Broadcast (num_heads, 1, 1) * (1, seq_len, seq_len).
    alibi = slopes[:, None, None] * relative_position[None, :, :]
    return alibi  # (num_heads, seq_len, seq_len)
# 在注意力中的用法
num_heads = 8
seq_len = 2048
alibi_bias = create_alibi_bias(seq_len, num_heads).to('cuda')
# 将偏置添加到注意力分数中
# attn_scores 形状: (batch, num_heads, seq_len, seq_len)
attn_scores = attn_scores + alibi_bias
attn_weights = torch.softmax(attn_scores, dim=-1)
from transformers import LlamaForCausalLM, LlamaTokenizer
# 原始上下文:2048 tokens
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
# 使用位置插值扩展到 32k
# 修改 RoPE 基础频率
model.config.rope_scaling = {
"type": "linear",
"factor": 16.0 # 2048 * 16 = 32768
}
# 或使用动态缩放
model.config.rope_scaling = {
"type": "dynamic",
"factor": 16.0
}
# 使用长文档进行微调(需要最少的步骤)
# 在此配置更改后,位置插值即可开箱即用
工作原理:
数学公式:
q_m = (W_q * x_m) * e^(imθ)
k_n = (W_k * x_n) * e^(inθ)
where θ_j = base^(-2j/d) for j ∈ [0, d/2)
优势:
关键创新:
参数:
# YaRN 配置
yarn_config = {
"scale": 16, # 扩展因子
"original_max_position": 2048, # 基础上下文
"extrapolation_factor": 1.0, # NTK 参数
"attn_factor": 1.0, # 注意力缩放
"beta_fast": 32, # 高频缩放
"beta_slow": 1, # 低频缩放
}
性能:
核心思想:
公式:
attention_bias[i, j] = -m * |i - j|
where m = slope for each attention head
优势:
技术:
公式:
# 原始:位置索引 [0, 1, 2, ..., L]
# 扩展:位置索引 [0, 0.5, 1.0, ..., L/2]
# (对于 2 倍扩展)
scaled_position[i] = i / extension_factor
结果:
| 方法 | 最大上下文 | 是否需要训练 | 内存 | 外推能力 | 最适合 |
|---|---|---|---|---|---|
| RoPE | 8k-32k | 完整预训练 | 中等 | 良好 | 新模型 |
| YaRN | 32k-128k | 最少(效率高 10 倍) | 中等 | 优秀 | 扩展现有模型 |
| ALiBi | 无限制 | 完整预训练 | 低(-11%) | 优秀 | 从头开始训练 |
| 位置插值 | 32k+ | 最少(1k 步) | 中等 | 差(设计如此) | 快速扩展 |
from transformers import AutoModelForCausalLM, AutoConfig
# 带有 YaRN 缩放的 RoPE
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
config.rope_scaling = {
"type": "yarn",
"factor": 8.0,
"original_max_position_embeddings": 8192,
"attention_factor": 1.0
}
model = AutoModelForCausalLM.from_config(config)
# 位置插值(更简单)
config.rope_scaling = {
"type": "linear",
"factor": 4.0
}
# 动态缩放(根据输入长度调整)
config.rope_scaling = {
"type": "dynamic",
"factor": 8.0
}
class LongContextAttention(nn.Module):
    """Multi-head attention with RoPE.

    Projects hidden states to Q/K/V, applies rotary position embeddings,
    and runs scaled dot-product attention. NOTE: no causal mask is applied
    here — autoregressive callers must supply one themselves.
    """

    def __init__(self, hidden_size, num_heads, max_seq_len=32768):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        # Q, K, V and output projections.
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.o_proj = nn.Linear(hidden_size, hidden_size)
        # Per-head RoPE tables (RotaryEmbedding is defined in this file).
        self.rotary_emb = RotaryEmbedding(
            dim=self.head_dim,
            max_seq_len=max_seq_len
        )

    def forward(self, hidden_states):
        """hidden_states: (batch, seq_len, hidden_size) -> same shape."""
        batch_size, seq_len, _ = hidden_states.shape
        # Project to Q, K, V.
        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)
        # (batch, seq_len, hidden) -> (batch, heads, seq_len, head_dim).
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # Rotate Q/K; the (seq_len, head_dim) tables broadcast over batch/heads.
        cos, sin = self.rotary_emb(seq_len, device=hidden_states.device)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)
        # Fix: original referenced undefined `F`; this file only imports
        # `torch.nn as nn`, so use nn.functional explicitly.
        attn_output = nn.functional.scaled_dot_product_attention(q, k, v)
        # Merge heads back and apply the output projection.
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_len, -1)
        output = self.o_proj(attn_output)
        return output
from transformers import Trainer, TrainingArguments
# 扩展模型配置
model.config.max_position_embeddings = 32768
model.config.rope_scaling = {"type": "linear", "factor": 16.0}
# 训练参数(需要最少的步骤)
training_args = TrainingArguments(
output_dir="./llama-32k",
num_train_epochs=1,
max_steps=1000, # 仅需 1000 步!
per_device_train_batch_size=1,
gradient_accumulation_steps=16,
learning_rate=2e-5,
warmup_steps=100,
logging_steps=10,
save_steps=500,
)
# 在长文档上训练
trainer = Trainer(
model=model,
args=training_args,
train_dataset=long_document_dataset, # 32k token 序列
)
trainer.train()
# 克隆 YaRN 实现
git clone https://github.com/jquesnelle/yarn
cd yarn
# 使用 YaRN 微调 LLaMA
python scripts/train.py \
--model meta-llama/Llama-2-7b-hf \
--scale 16 \
--rope_theta 10000 \
--max_length 32768 \
--batch_size 1 \
--gradient_accumulation 16 \
--steps 400 \
--learning_rate 2e-5
# 对于新模型(从头开始训练)
use_method = "ALiBi" # 最佳外推能力,最低内存
# 对于扩展现有的 RoPE 模型
use_method = "YaRN" # 最高效的扩展(数据量少 10 倍)
# 对于计算量最小的快速扩展
use_method = "Position Interpolation" # 1000 步
# 对于效率良好的适度扩展
use_method = "Linear RoPE Scaling" # 内置,简单
# 保守(更安全,质量更好)
scaling_factor = 2.0 # 8k → 16k
# 适中(良好的平衡)
scaling_factor = 4.0 # 8k → 32k
# 激进(需要更多微调)
scaling_factor = 8.0 # 8k → 64k
scaling_factor = 16.0 # 8k → 128k
# 规则:更大的因子需要更多的微调步骤
steps_needed = 100 * scaling_factor # 粗略估计
# ✅ 好:与目标长度匹配的长文档
train_data = [
{"text": long_doc_32k_tokens}, # 完整的 32k
{"text": long_doc_24k_tokens}, # 不同长度
{"text": long_doc_16k_tokens},
]
# ❌ 差:短文档(无法学习长上下文)
train_data = [
{"text": short_doc_2k_tokens},
]
# 使用数据集如:
# - PG-19(书籍,长文本)
# - arXiv 论文
# - 长对话
# - GitHub 仓库(拼接的文件)
# ❌ 差:在没有微调的情况下应用位置插值
model.config.rope_scaling = {"type": "linear", "factor": 16.0}
# 模型在没有微调的情况下性能会很差!
# ✅ 好:在缩放后进行微调
model.config.rope_scaling = {"type": "linear", "factor": 16.0}
fine_tune(model, long_documents, steps=1000)
# ❌ 差:在没有数据的情况下进行过于激进的缩放
scale_to_1M_tokens() # 没有大量微调是无法工作的
# ✅ 好:增量缩放
# 8k → 16k → 32k → 64k(每一步都进行微调)
from transformers import AutoModelForCausalLM, AutoTokenizer
# 加载长上下文模型
model = AutoModelForCausalLM.from_pretrained(
"togethercomputer/LLaMA-2-7B-32K", # 32k 上下文
torch_dtype=torch.float16,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
# 处理长文档
long_text = "..." * 30000 # 30k tokens
inputs = tokenizer(long_text, return_tensors="pt", truncation=False).to('cuda')
# 生成
outputs = model.generate(
**inputs,
max_new_tokens=512,
temperature=0.7,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# 在微调时使用梯度检查点
model.gradient_checkpointing_enable()
# 使用 Flash Attention 2
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
attn_implementation="flash_attention_2", # 快 2-3 倍
torch_dtype=torch.float16
)
# 使用分页注意力 (vLLM)
from vllm import LLM
llm = LLM(
model="togethercomputer/LLaMA-2-7B-32K",
max_model_len=32768, # 32k 上下文
gpu_memory_utilization=0.9
)
references/rope.md - 详细的 RoPE 实现和理论
references/extension_methods.md - YaRN、ALiBi、位置插值比较
references/fine_tuning.md - 上下文扩展的完整微调指南
每周安装次数
236
仓库
GitHub 星标数
22.6K
首次出现
2026 年 1 月 21 日
安全审计
已安装于
opencode197
claude-code183
gemini-cli182
codex167
github-copilot151
cursor148
Use Long Context techniques when you need to:
Key Techniques : RoPE (Rotary Position Embeddings), YaRN, ALiBi (Attention with Linear Biases), Position Interpolation
Papers : RoFormer (arXiv 2104.09864), YaRN (arXiv 2309.00071), ALiBi (arXiv 2108.12409), Position Interpolation (arXiv 2306.15595)
# HuggingFace Transformers (includes RoPE, YaRN support)
pip install transformers torch
# For custom implementations
pip install einops # Tensor operations
pip install rotary-embedding-torch # Standalone RoPE
# Optional: FlashAttention for efficiency
pip install flash-attn --no-build-isolation
import torch
import torch.nn as nn
class RotaryEmbedding(nn.Module):
    """Rotary Position Embeddings (RoPE).

    Precomputes the per-dimension inverse frequencies once; each forward
    call builds the (cos, sin) tables used to rotate query/key vectors.
    """

    def __init__(self, dim, max_seq_len=8192, base=10000):
        super().__init__()
        # theta_j = base^(-2j/dim) for j in [0, dim/2)
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (base ** exponents))
        # Advertised context limit; not enforced by forward() itself.
        self.max_seq_len = max_seq_len

    def forward(self, seq_len, device):
        """Return (cos, sin) tables, each of shape (seq_len, dim)."""
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        # angle[m, j] = m * theta_j  -> (seq_len, dim/2)
        angles = torch.outer(positions, self.inv_freq)
        # Duplicate so both halves of the head dim share the same angles.
        table = torch.cat((angles, angles), dim=-1)
        return table.cos(), table.sin()
def rotate_half(x):
    """Rotate half the hidden dimensions.

    Maps (..., [a, b]) to (..., [-b, a]) along the last axis — the
    90-degree pairwise rotation used when applying RoPE.
    """
    head, tail = x.chunk(2, dim=-1)
    return torch.cat([tail.neg(), head], dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin):
    """Apply rotary embeddings to queries and keys.

    q, k: (batch, heads, seq_len, dim); the (cos, sin) tables broadcast
    over the leading dimensions. Returns the rotated (q, k) pair.
    """
    def _rot(t):
        # Inline rotate-half: (..., [a, b]) -> (..., [-b, a]).
        a, b = t.chunk(2, dim=-1)
        return torch.cat((-b, a), dim=-1)

    return (q * cos) + (_rot(q) * sin), (k * cos) + (_rot(k) * sin)
# Usage
rope = RotaryEmbedding(dim=64, max_seq_len=8192)
cos, sin = rope(seq_len=2048, device='cuda')
# In attention layer
q_rotated, k_rotated = apply_rotary_pos_emb(query, key, cos, sin)
def get_alibi_slopes(num_heads):
    """Get ALiBi slope values for each attention head.

    Power-of-two head counts use the geometric sequence from the ALiBi
    paper (first slope 2^(-8/n), ratio equal to it); other counts take
    the slopes of the closest smaller power of two and interleave every
    other slope from the doubled head count.
    """
    import math  # fix: `math` was used but never imported in this file

    def get_slopes_power_of_2(n):
        # start = 2^(-(2^-(log2(n)-3))) == 2^(-8/n)
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        ratio = start
        return [start * (ratio ** i) for i in range(n)]

    if math.log2(num_heads).is_integer():
        return get_slopes_power_of_2(num_heads)
    # Non-power-of-2 head count: closest smaller power of two first ...
    closest_power = 2 ** math.floor(math.log2(num_heads))
    slopes = get_slopes_power_of_2(closest_power)
    # ... then fill with every other slope from the doubled head count.
    extra = get_slopes_power_of_2(2 * closest_power)
    slopes.extend(extra[0::2][:num_heads - closest_power])
    return slopes
def create_alibi_bias(seq_len, num_heads):
    """Create ALiBi attention bias.

    Returns a (num_heads, seq_len, seq_len) tensor where entry [h, i, j]
    equals slope_h * (j - i). For causal positions (j <= i) this equals
    the paper's -slope * |i - j| penalty; entries with j > i come out
    positive and are presumably removed by a causal mask applied by the
    caller — NOTE(review): confirm a causal mask is used downstream.
    """
    # Signed distance matrix: relative_position[i, j] = j - i.
    context_position = torch.arange(seq_len)
    memory_position = torch.arange(seq_len)
    relative_position = memory_position[None, :] - context_position[:, None]
    # One slope per head (requires get_alibi_slopes defined in this file).
    slopes = torch.tensor(get_alibi_slopes(num_heads))
    # Broadcast (num_heads, 1, 1) * (1, seq_len, seq_len).
    alibi = slopes[:, None, None] * relative_position[None, :, :]
    return alibi  # (num_heads, seq_len, seq_len)
# Usage in attention
num_heads = 8
seq_len = 2048
alibi_bias = create_alibi_bias(seq_len, num_heads).to('cuda')
# Add bias to attention scores
# attn_scores shape: (batch, num_heads, seq_len, seq_len)
attn_scores = attn_scores + alibi_bias
attn_weights = torch.softmax(attn_scores, dim=-1)
from transformers import LlamaForCausalLM, LlamaTokenizer
# Original context: 2048 tokens
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
# Extend to 32k with position interpolation
# Modify RoPE base frequency
model.config.rope_scaling = {
"type": "linear",
"factor": 16.0 # 2048 * 16 = 32768
}
# Or use dynamic scaling
model.config.rope_scaling = {
"type": "dynamic",
"factor": 16.0
}
# Fine-tune with long documents (minimal steps needed)
# Position interpolation works out-of-the-box after this config change
How it works:
Mathematical formulation:
q_m = (W_q * x_m) * e^(imθ)
k_n = (W_k * x_n) * e^(inθ)
where θ_j = base^(-2j/d) for j ∈ [0, d/2)
Advantages:
Key innovation:
Parameters:
# YaRN configuration
yarn_config = {
"scale": 16, # Extension factor
"original_max_position": 2048, # Base context
"extrapolation_factor": 1.0, # NTK parameter
"attn_factor": 1.0, # Attention scaling
"beta_fast": 32, # High-frequency scale
"beta_slow": 1, # Low-frequency scale
}
Performance:
Core idea:
Formula:
attention_bias[i, j] = -m * |i - j|
where m = slope for each attention head
Advantages:
Technique:
Formula:
# Original: position indices [0, 1, 2, ..., L]
# Extended: position indices [0, 0.5, 1.0, ..., L/2]
# (for 2× extension)
scaled_position[i] = i / extension_factor
Results:
| Method | Max Context | Training Needed | Memory | Extrapolation | Best For |
|---|---|---|---|---|---|
| RoPE | 8k-32k | Full pre-training | Moderate | Good | New models |
| YaRN | 32k-128k | Minimal (10× efficient) | Moderate | Excellent | Extending existing models |
| ALiBi | Unlimited | Full pre-training | Low (-11%) | Excellent | Training from scratch |
| Position Interpolation | 32k+ | Minimal (1k steps) | Moderate | Poor (by design) | Quick extension |
from transformers import AutoModelForCausalLM, AutoConfig
# RoPE with YaRN scaling
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
config.rope_scaling = {
"type": "yarn",
"factor": 8.0,
"original_max_position_embeddings": 8192,
"attention_factor": 1.0
}
model = AutoModelForCausalLM.from_config(config)
# Position interpolation (simpler)
config.rope_scaling = {
"type": "linear",
"factor": 4.0
}
# Dynamic scaling (adjusts based on input length)
config.rope_scaling = {
"type": "dynamic",
"factor": 8.0
}
class LongContextAttention(nn.Module):
    """Multi-head attention with RoPE.

    Projects hidden states to Q/K/V, applies rotary position embeddings,
    and runs scaled dot-product attention. NOTE: no causal mask is applied
    here — autoregressive callers must supply one themselves.
    """

    def __init__(self, hidden_size, num_heads, max_seq_len=32768):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        # Q, K, V and output projections.
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.o_proj = nn.Linear(hidden_size, hidden_size)
        # Per-head RoPE tables (RotaryEmbedding is defined in this file).
        self.rotary_emb = RotaryEmbedding(
            dim=self.head_dim,
            max_seq_len=max_seq_len
        )

    def forward(self, hidden_states):
        """hidden_states: (batch, seq_len, hidden_size) -> same shape."""
        batch_size, seq_len, _ = hidden_states.shape
        # Project to Q, K, V.
        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)
        # (batch, seq_len, hidden) -> (batch, heads, seq_len, head_dim).
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # Rotate Q/K; the (seq_len, head_dim) tables broadcast over batch/heads.
        cos, sin = self.rotary_emb(seq_len, device=hidden_states.device)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)
        # Fix: original referenced undefined `F`; this file only imports
        # `torch.nn as nn`, so use nn.functional explicitly.
        attn_output = nn.functional.scaled_dot_product_attention(q, k, v)
        # Merge heads back and apply the output projection.
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_len, -1)
        output = self.o_proj(attn_output)
        return output
from transformers import Trainer, TrainingArguments
# Extend model config
model.config.max_position_embeddings = 32768
model.config.rope_scaling = {"type": "linear", "factor": 16.0}
# Training args (minimal steps needed)
training_args = TrainingArguments(
output_dir="./llama-32k",
num_train_epochs=1,
max_steps=1000, # Only 1000 steps!
per_device_train_batch_size=1,
gradient_accumulation_steps=16,
learning_rate=2e-5,
warmup_steps=100,
logging_steps=10,
save_steps=500,
)
# Train on long documents
trainer = Trainer(
model=model,
args=training_args,
train_dataset=long_document_dataset, # 32k token sequences
)
trainer.train()
# Clone YaRN implementation
git clone https://github.com/jquesnelle/yarn
cd yarn
# Fine-tune LLaMA with YaRN
python scripts/train.py \
--model meta-llama/Llama-2-7b-hf \
--scale 16 \
--rope_theta 10000 \
--max_length 32768 \
--batch_size 1 \
--gradient_accumulation 16 \
--steps 400 \
--learning_rate 2e-5
# For NEW models (training from scratch)
use_method = "ALiBi" # Best extrapolation, lowest memory
# For EXTENDING existing RoPE models
use_method = "YaRN" # Most efficient extension (10× less data)
# For QUICK extension with minimal compute
use_method = "Position Interpolation" # 1000 steps
# For MODERATE extension with good efficiency
use_method = "Linear RoPE Scaling" # Built-in, simple
# Conservative (safer, better quality)
scaling_factor = 2.0 # 8k → 16k
# Moderate (good balance)
scaling_factor = 4.0 # 8k → 32k
# Aggressive (requires more fine-tuning)
scaling_factor = 8.0 # 8k → 64k
scaling_factor = 16.0 # 8k → 128k
# Rule: Larger factors need more fine-tuning steps
steps_needed = 100 * scaling_factor # Rough estimate
# ✅ Good: Long documents matching target length
train_data = [
{"text": long_doc_32k_tokens}, # Full 32k
{"text": long_doc_24k_tokens}, # Varied lengths
{"text": long_doc_16k_tokens},
]
# ❌ Bad: Short documents (won't learn long context)
train_data = [
{"text": short_doc_2k_tokens},
]
# Use datasets like:
# - PG-19 (books, long texts)
# - arXiv papers
# - Long-form conversations
# - GitHub repositories (concatenated files)
# ❌ Bad: Applying position interpolation without fine-tuning
model.config.rope_scaling = {"type": "linear", "factor": 16.0}
# Model will perform poorly without fine-tuning!
# ✅ Good: Fine-tune after scaling
model.config.rope_scaling = {"type": "linear", "factor": 16.0}
fine_tune(model, long_documents, steps=1000)
# ❌ Bad: Too aggressive scaling without data
scale_to_1M_tokens() # Won't work without massive fine-tuning
# ✅ Good: Incremental scaling
# 8k → 16k → 32k → 64k (fine-tune at each step)
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load long-context model
model = AutoModelForCausalLM.from_pretrained(
"togethercomputer/LLaMA-2-7B-32K", # 32k context
torch_dtype=torch.float16,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
# Process long document
long_text = "..." * 30000 # 30k tokens
inputs = tokenizer(long_text, return_tensors="pt", truncation=False).to('cuda')
# Generate
outputs = model.generate(
**inputs,
max_new_tokens=512,
temperature=0.7,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Use gradient checkpointing for fine-tuning
model.gradient_checkpointing_enable()
# Use Flash Attention 2
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
attn_implementation="flash_attention_2", # 2-3× faster
torch_dtype=torch.float16
)
# Use paged attention (vLLM)
from vllm import LLM
llm = LLM(
model="togethercomputer/LLaMA-2-7B-32K",
max_model_len=32768, # 32k context
gpu_memory_utilization=0.9
)
references/rope.md - Detailed RoPE implementation and theory
references/extension_methods.md - YaRN, ALiBi, Position Interpolation comparisons
references/fine_tuning.md - Complete fine-tuning guide for context extension
Weekly Installs
236
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Warn
Installed on
opencode197
claude-code183
gemini-cli182
codex167
github-copilot151
cursor148
超能力技能使用指南:AI助手技能调用优先级与工作流程详解
41,800 周安装
| Quick extension |