hqq-quantization by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill hqq-quantization
Fast, calibration-free weight quantization supporting 8/4/3/2/1-bit precision with multiple optimized backends.
Use HQQ when:
Key advantages:
Use alternatives instead:
pip install hqq
# With specific backend
pip install hqq[torch] # PyTorch backend
pip install hqq[torchao] # TorchAO int4 backend
pip install hqq[bitblas] # BitBlas backend
pip install hqq[marlin] # Marlin backend
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear
import torch
import torch.nn as nn

# Configure quantization
config = BaseQuantizeConfig(
    nbits=4,        # 4-bit quantization
    group_size=64,  # Group size for quantization
    axis=1          # Quantize along output dimension
)

# Quantize a linear layer
linear = nn.Linear(4096, 4096)
hqq_linear = HQQLinear(linear, config, compute_dtype=torch.float16, device="cuda")

# Use normally
input_tensor = torch.randn(1, 4096, dtype=torch.float16, device="cuda")
output = hqq_linear(input_tensor)
from transformers import AutoModelForCausalLM, HqqConfig

# Configure HQQ
quantization_config = HqqConfig(
    nbits=4,
    group_size=64,
    axis=1
)

# Load and quantize
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=quantization_config,
    device_map="auto"
)
# Model is quantized and ready to use
HQQ uses BaseQuantizeConfig to define quantization parameters:
from hqq.core.quantize import BaseQuantizeConfig

# Standard 4-bit config
config_4bit = BaseQuantizeConfig(
    nbits=4,        # Bits per weight (1-8)
    group_size=64,  # Weights per quantization group
    axis=1          # 0=input dim, 1=output dim
)

# Aggressive 2-bit config
config_2bit = BaseQuantizeConfig(
    nbits=2,
    group_size=16,  # Smaller groups for low-bit
    axis=1
)

# Mixed precision per layer type
layer_configs = {
    "self_attn.q_proj": BaseQuantizeConfig(nbits=4, group_size=64),
    "self_attn.k_proj": BaseQuantizeConfig(nbits=4, group_size=64),
    "self_attn.v_proj": BaseQuantizeConfig(nbits=4, group_size=64),
    "mlp.gate_proj": BaseQuantizeConfig(nbits=2, group_size=32),
    "mlp.up_proj": BaseQuantizeConfig(nbits=2, group_size=32),
    "mlp.down_proj": BaseQuantizeConfig(nbits=4, group_size=64),
}
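The mapping above is just a dict and is not applied on its own. One way to apply it is through HQQ's model-level quantizer; this is a minimal sketch assuming `AutoHQQHFModel.quantize_model` accepts a tag-to-config dict as in the HQQ examples (check the HQQ docs for your version):

import torch
from transformers import AutoModelForCausalLM
from hqq.models.hf.base import AutoHQQHFModel

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", torch_dtype=torch.float16
)
# Every linear layer whose name matches one of the tags above
# is quantized with that tag's config.
AutoHQQHFModel.quantize_model(
    model,
    quant_config=layer_configs,
    compute_dtype=torch.float16,
    device="cuda",
)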
The core quantized layer that replaces nn.Linear:
from hqq.core.quantize import HQQLinear
import torch

# Create quantized layer
linear = torch.nn.Linear(4096, 4096)
hqq_layer = HQQLinear(linear, config)

# Access quantized weights
W_q = hqq_layer.W_q      # Quantized weights
scale = hqq_layer.scale  # Scale factors
zero = hqq_layer.zero    # Zero points

# Dequantize for inspection
W_dequant = hqq_layer.dequantize()
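A quick sanity check is to compare the dequantized matrix against the original weights. A minimal sketch (it quantizes a fresh copy so the full-precision reference survives, and reuses `config` and `HQQLinear` from above):

import copy
import torch

linear_fp = torch.nn.Linear(4096, 4096)
W_ref = linear_fp.weight.data.clone()                    # full-precision reference
hqq_check = HQQLinear(copy.deepcopy(linear_fp), config)  # quantize a copy
W_dq = hqq_check.dequantize().to(device=W_ref.device, dtype=W_ref.dtype)

rel_err = (W_ref - W_dq).norm() / W_ref.norm()
print(f"Relative reconstruction error: {rel_err.item():.4f}")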
HQQ supports multiple inference backends for different hardware:
from hqq.core.quantize import HQQLinear

# Available backends
backends = [
    "pytorch",          # Pure PyTorch (default)
    "pytorch_compile",  # torch.compile optimized
    "aten",             # Custom CUDA kernels
    "torchao_int4",     # TorchAO int4 matmul
    "gemlite",          # GemLite CUDA kernels
    "bitblas",          # BitBlas optimized
    "marlin",           # Marlin 4-bit kernels
]

# Set backend globally
HQQLinear.set_backend("torchao_int4")
# Or per layer
hqq_layer.set_backend("marlin")
Backend selection guide:
| Backend | Best For | Requirements |
|---|---|---|
| pytorch | Compatibility | Any GPU |
| pytorch_compile | Moderate speedup | torch>=2.0 |
| aten | Good balance | CUDA GPU |
| torchao_int4 | 4-bit inference | torchao installed |
| marlin | Maximum 4-bit speed | Ampere+ GPU |
| bitblas | Flexible bit-widths | bitblas installed |
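Backend choice can also be automated from the table above. The helper below is a hypothetical sketch: the backend strings follow the list shown earlier, and GPU detection uses standard PyTorch calls.

import torch

def pick_backend() -> str:
    # Fall back to the portable default when no CUDA device is present.
    if not torch.cuda.is_available():
        return "pytorch"
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        return "marlin"  # Ampere or newer: fastest 4-bit kernels
    return "aten"        # Older CUDA GPUs: good balance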
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load HQQ-quantized model from Hub
model = AutoModelForCausalLM.from_pretrained(
    "mobiuslabsgmbh/Llama-3.1-8B-HQQ-4bit",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

# Use normally
inputs = tokenizer("Hello, world!", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
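Decoding the generated tokens closes the loop:

print(tokenizer.decode(outputs[0], skip_special_tokens=True))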
from transformers import AutoModelForCausalLM, HqqConfig

# Quantize
config = HqqConfig(nbits=4, group_size=64)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=config,
    device_map="auto"
)

# Save quantized model
model.save_pretrained("./llama-8b-hqq-4bit")

# Push to Hub
model.push_to_hub("my-org/Llama-3.1-8B-HQQ-4bit")
from transformers import AutoModelForCausalLM, HqqConfig

# Different precision per layer type:
# attention projections stay at 4-bit, MLP projections drop to 2-bit for memory savings.
# dynamic_config maps layer-name tags to plain dicts of quantization settings.
attn_config = {"nbits": 4, "group_size": 64}
mlp_config = {"nbits": 2, "group_size": 32}

config = HqqConfig(
    dynamic_config={
        "self_attn.q_proj": attn_config,
        "self_attn.k_proj": attn_config,
        "self_attn.v_proj": attn_config,
        "self_attn.o_proj": attn_config,
        "mlp.gate_proj": mlp_config,
        "mlp.up_proj": mlp_config,
        "mlp.down_proj": mlp_config,
    }
)
from vllm import LLM, SamplingParams

# Load HQQ-quantized model
llm = LLM(
    model="mobiuslabsgmbh/Llama-3.1-8B-HQQ-4bit",
    quantization="hqq",
    dtype="float16"
)

# Generate
sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
outputs = llm.generate(["What is machine learning?"], sampling_params)
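Each `RequestOutput` in the returned list carries the generated text:

for output in outputs:
    print(output.outputs[0].text)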
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B",
    quantization="hqq",
    quantization_config={
        "nbits": 4,
        "group_size": 64
    }
)
from transformers import AutoModelForCausalLM, HqqConfig
from peft import LoraConfig, get_peft_model

# Load quantized model
quant_config = HqqConfig(nbits=4, group_size=64)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=quant_config,
    device_map="auto"
)

# Apply LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Train normally with Trainer or custom loop
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./hqq-lora-output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)
trainer.train()
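`train_dataset` and `data_collator` in the Trainer above are placeholders to be defined before constructing it. A minimal sketch using the `datasets` library and a causal-LM collator, with toy data for illustration only:

from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token

texts = [
    "HQQ keeps the frozen base weights in low precision.",
    "LoRA adds a small set of trainable adapter weights on top.",
]
train_dataset = Dataset.from_dict({"text": texts}).map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=128),
    batched=True,
    remove_columns=["text"],
)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)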
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
# 1. Configure quantization
config = HqqConfig(nbits=4, group_size=64)
# 2. Load and quantize (no calibration needed!)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
# 3. Verify quality
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))
# 4. Save
model.save_pretrained("./llama-8b-hqq")
tokenizer.save_pretrained("./llama-8b-hqq")
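To confirm the memory savings, Transformers' `get_memory_footprint()` reports the in-memory size of the quantized model:

print(f"Model size: {model.get_memory_footprint() / 1e9:.2f} GB")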
from hqq.core.quantize import HQQLinear
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

# 1. Quantize with optimal backend
config = HqqConfig(nbits=4, group_size=64)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

# 2. Set fast backend
HQQLinear.set_backend("marlin")  # or "torchao_int4"

# 3. Compile for additional speedup
import torch
model = torch.compile(model)

# 4. Benchmark
import time
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
start = time.time()
for _ in range(10):
    model.generate(**inputs, max_new_tokens=100)
print(f"Avg time: {(time.time() - start) / 10:.2f}s")
Out of memory during quantization:
# Quantize layer-by-layer
from hqq.models.hf.base import AutoHQQHFModel

model = AutoHQQHFModel.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=config,
    device_map="sequential"  # Load layers sequentially
)
Slow inference:
# Switch to optimized backend
from hqq.core.quantize import HQQLinear
HQQLinear.set_backend("marlin") # Requires Ampere+ GPU
# Or compile
model = torch.compile(model, mode="reduce-overhead")
Poor quality at 2-bit:
# Use smaller group size
from hqq.core.quantize import BaseQuantizeConfig

config = BaseQuantizeConfig(
    nbits=2,
    group_size=16,  # Smaller groups help at low bits
    axis=1
)
Weekly Installs: 151
Repository: https://github.com/davila7/claude-code-templates
GitHub Stars: 23.4K
First Seen: Jan 21, 2026
Security Audits: Gen Agent Trust Hub: Pass, Socket: Pass, Snyk: Warn
Installed on: claude-code (127), opencode (123), gemini-cli (117), cursor (116), antigravity (105), codex (104)