model-quantization by martinholovsky/claude-skills-generator
npx skills add https://github.com/martinholovsky/claude-skills-generator --skill model-quantization
文件组织:拆分结构。详细实现请参见
references/。
风险等级:中等 - 模型操作、潜在质量下降、资源管理
您是 AI 模型量化领域的专家,在 4 位/8 位优化、GGUF 格式转换以及质量与性能权衡方面拥有深厚的专业知识。您的专长涵盖量化技术、内存优化以及资源受限部署的基准测试。
您擅长:
主要用例:
量化模型时,您将:
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
# tests/test_quantization.py
import pytest
from pathlib import Path
class TestQuantizationQuality:
    """Quality gates for quantized models: perplexity, accuracy, memory, latency."""

    @pytest.fixture
    def baseline_metrics(self):
        """Reference metrics measured on the original (full-precision) model."""
        return {"perplexity": 5.2, "accuracy": 0.95, "latency_ms": 100}

    def test_perplexity_within_threshold(self, quantized_model, baseline_metrics):
        """Perplexity must stay within 10% of the full-precision baseline."""
        results = QuantizationBenchmark(TEST_PROMPTS).benchmark(quantized_model)
        max_perplexity = baseline_metrics["perplexity"] * 1.10
        assert results["perplexity"] <= max_perplexity, \
            f"困惑度 {results['perplexity']} 超过阈值 {max_perplexity}"

    def test_accuracy_maintained(self, quantized_model, test_cases):
        """At least 90% of the critical prompts must yield the expected answer."""
        correct = sum(
            1
            for prompt, expected in test_cases
            if expected.lower()
            in quantized_model(prompt, max_tokens=50)["choices"][0]["text"].lower()
        )
        accuracy = correct / len(test_cases)
        assert accuracy >= 0.90, f"准确率 {accuracy} 低于 90% 阈值"

    def test_memory_under_limit(self, quantized_model, max_memory_mb):
        """Resident memory of the test process must not exceed the limit."""
        import psutil

        memory_mb = psutil.Process().memory_info().rss / (1024 * 1024)
        assert memory_mb <= max_memory_mb, \
            f"内存 {memory_mb}MB 超过限制 {max_memory_mb}MB"

    def test_latency_acceptable(self, quantized_model, baseline_metrics):
        """Inference latency may regress at most 1.5x versus the baseline."""
        results = QuantizationBenchmark(TEST_PROMPTS).benchmark(quantized_model)
        # Quantized inference should be faster than, or close to, the original.
        max_latency = baseline_metrics["latency_ms"] * 1.5
        assert results["latency_ms"] <= max_latency
# 实现量化以使测试通过
quantizer = SecureQuantizer(models_dir, llama_cpp_dir)
output = quantizer.quantize(
input_model="model-f16.gguf",
output_name="model-Q5_K_M.gguf",
quantization="Q5_K_M"
)
# 运行所有量化测试
pytest tests/test_quantization.py -v
# 运行覆盖率测试
pytest tests/test_quantization.py --cov=quantization --cov-report=term-missing
# 运行基准测试
python -m pytest tests/test_quantization.py::TestQuantizationQuality -v --benchmark
| 量化 | 位数 | 内存 | 质量 | 用例 |
|---|---|---|---|---|
| Q4_0 | 4 | 50% | 低 | 最小 RAM |
| Q4_K_S | 4 | 50% | 中等 | 低 RAM |
| Q4_K_M | 4 | 52% | 良好 | 平衡 |
| Q5_K_S | 5 | 58% | 更好 | 更多 RAM |
| Q5_K_M | 5 | 60% | 更好+ | 推荐 |
| Q6_K | 6 | 66% | 高 | 注重质量 |
| Q8_0 | 8 | 75% | 最佳 | 最高质量 |
| F16 | 16 | 100% | 原始 | 基线 |
| 量化 | 模型大小 | 所需 RAM |
|---|---|---|
| Q4_K_M | 4.1 GB | 6 GB |
| Q5_K_M | 4.8 GB | 7 GB |
| Q8_0 | 7.2 GB | 10 GB |
| F16 | 14.0 GB | 18 GB |
from pathlib import Path
import subprocess
import hashlib
import structlog
logger = structlog.get_logger()
class SecureQuantizer:
    """Secure model quantization with validation.

    Wraps the llama.cpp ``quantize`` binary with input validation,
    path-containment checks and SHA256 checksumming of the output.
    """

    # Quantization types accepted by the llama.cpp quantize binary.
    VALID_TYPES = ("Q4_0", "Q4_K_S", "Q4_K_M", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0")

    def __init__(self, models_dir: str, llama_cpp_dir: str):
        """
        Args:
            models_dir: Directory holding GGUF models (inputs and outputs).
            llama_cpp_dir: llama.cpp checkout containing the ``quantize`` binary.

        Raises:
            FileNotFoundError: If the quantize binary is missing.
        """
        self.models_dir = Path(models_dir)
        self.llama_cpp_dir = Path(llama_cpp_dir)
        self.quantize_bin = self.llama_cpp_dir / "quantize"
        if not self.quantize_bin.exists():
            raise FileNotFoundError("未找到 llama.cpp quantize 二进制文件")

    def _resolve_in_models_dir(self, name: str) -> Path:
        """Resolve *name* inside models_dir, rejecting path traversal.

        Security fix: the original accepted names like ``../../etc/x``,
        letting callers read or write files outside ``models_dir``.
        """
        path = (self.models_dir / name).resolve()
        if not path.is_relative_to(self.models_dir.resolve()):
            raise ValueError(f"路径越界:{name}")
        return path

    def quantize(
        self,
        input_model: str,
        output_name: str,
        quantization: str = "Q4_K_M"
    ) -> str:
        """Quantize a GGUF model with validation and checksumming.

        Args:
            input_model: File name of the source model inside ``models_dir``.
            output_name: File name for the quantized output inside ``models_dir``.
            quantization: Target quantization type, e.g. ``Q5_K_M``.

        Returns:
            Absolute path of the quantized model as a string.

        Raises:
            FileNotFoundError: If the input model does not exist.
            ValueError: On an unknown quantization type or a path escaping
                ``models_dir``.
            QuantizationError: If the external quantize process fails.
        """
        # Contain both paths inside models_dir (defense against "../" names).
        input_path = self._resolve_in_models_dir(input_model)
        output_path = self._resolve_in_models_dir(output_name)
        if not input_path.exists():
            raise FileNotFoundError(f"未找到模型:{input_path}")
        if quantization not in self.VALID_TYPES:
            raise ValueError(f"无效的量化:{quantization}")
        # Record the input digest so the source model is auditable later.
        input_checksum = self._calculate_checksum(input_path)
        logger.info("quantize.starting",
                    input=input_model,
                    quantization=quantization,
                    input_checksum=input_checksum[:16])
        result = subprocess.run(
            [
                str(self.quantize_bin),
                str(input_path),
                str(output_path),
                quantization
            ],
            capture_output=True,
            text=True,
            timeout=3600  # 1-hour cap on the external quantize process
        )
        if result.returncode != 0:
            logger.error("quantize.failed", stderr=result.stderr)
            raise QuantizationError(f"量化失败:{result.stderr}")
        # Persist the output digest next to the model for later verification.
        output_checksum = self._calculate_checksum(output_path)
        self._save_checksum(output_path, output_checksum)
        logger.info("quantize.complete",
                    output=output_name,
                    output_checksum=output_checksum[:16],
                    size_mb=output_path.stat().st_size / (1024*1024))
        return str(output_path)

    def _calculate_checksum(self, path: Path) -> str:
        """Return the SHA256 hex digest of *path*, read in 8 KiB chunks."""
        sha256 = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                sha256.update(chunk)
        return sha256.hexdigest()

    def _save_checksum(self, model_path: Path, checksum: str):
        """Write ``<digest> <name>`` next to the model.

        Note: ``with_suffix`` replaces the final extension, so
        ``model-Q5.gguf`` pairs with ``model-Q5.sha256`` (the scheme
        ``verify_model_integrity`` expects).
        """
        checksum_path = model_path.with_suffix(".sha256")
        checksum_path.write_text(f"{checksum} {model_path.name}")
import numpy as np
from typing import Dict
class QuantizationBenchmark:
    """Benchmark harness for quantized-model quality (perplexity, latency, memory)."""
    def __init__(self, test_prompts: list[str]):
        # Prompts used for the perplexity and latency measurements.
        self.test_prompts = test_prompts
    def benchmark(self, model_path: str) -> Dict:
        """Load the GGUF model at *model_path* and return a dict with
        ``perplexity``, ``latency_ms`` and ``memory_mb`` keys."""
        from llama_cpp import Llama
        llm = Llama(model_path=model_path, n_ctx=512, verbose=False)
        results = {
            "perplexity": self._measure_perplexity(llm),
            "latency_ms": self._measure_latency(llm),
            "memory_mb": self._measure_memory(llm)
        }
        logger.info("benchmark.complete",
                    model=Path(model_path).name,
                    **results)
        return results
    def _measure_perplexity(self, llm) -> float:
        """Measure model perplexity over ``test_prompts``.

        NOTE(review): this is a simplified placeholder and is currently
        broken — ``total_nll`` is never updated from the evaluated tokens,
        so the function returns exp(0) == 1.0 for any non-empty prompt set
        regardless of model quality. Accumulate the per-token negative
        log-likelihood into ``total_nll`` before relying on this metric.
        """
        # Simplified perplexity calculation (see NOTE above).
        total_nll = 0
        total_tokens = 0
        for prompt in self.test_prompts:
            tokens = llm.tokenize(prompt.encode())
            # NOTE(review): return value unused; per-token logprobs from this
            # eval would be needed to compute the NLL — confirm llama_cpp API.
            logits = llm.eval(tokens)
            # Negative log-likelihood should be accumulated here.
            total_tokens += len(tokens)
        return np.exp(total_nll / total_tokens) if total_tokens > 0 else float('inf')
    def _measure_latency(self, llm) -> float:
        """Return mean wall-clock latency (ms) over the first 5 prompts."""
        import time
        latencies = []
        for prompt in self.test_prompts[:5]:
            start = time.time()
            llm(prompt, max_tokens=50)
            latencies.append((time.time() - start) * 1000)
        return np.mean(latencies)
    def _measure_memory(self, llm) -> float:
        """Return current process RSS in MB (includes the loaded model)."""
        import psutil
        process = psutil.Process()
        return process.memory_info().rss / (1024 * 1024)
class QuantizationSelector:
    """Pick the optimal quantization level for the target hardware."""

    # Approximate bytes per parameter for each supported quantization.
    _MEM_PER_PARAM = {
        "Q4_K_M": 0.5,
        "Q5_K_M": 0.625,
        "Q6_K": 0.75,
        "Q8_0": 1.0
    }
    # Relative quality scores (higher is better).
    _QUALITY = {
        "Q4_K_M": 0.7,
        "Q5_K_M": 0.85,
        "Q6_K": 0.92,
        "Q8_0": 0.98
    }

    def select(
        self,
        model_params_b: float,
        available_ram_gb: float,
        quality_priority: str = "balanced"
    ) -> str:
        """Return the quantization level that fits the RAM budget.

        Args:
            model_params_b: Model size in billions of parameters.
            available_ram_gb: Total RAM on the target host.
            quality_priority: "quality", "speed", or "balanced".

        Raises:
            ValueError: If no supported level fits in the available RAM.
        """
        # Reserve ~2 GB of headroom for the OS / runtime.
        usable_ram = available_ram_gb - 2
        candidates = [
            quant
            for quant, mem_factor in self._MEM_PER_PARAM.items()
            if model_params_b * mem_factor <= usable_ram
        ]
        if not candidates:
            raise ValueError(f"没有量化级别适合 {available_ram_gb}GB RAM")
        if quality_priority == "speed":
            # Smallest footprint loads and runs fastest.
            return min(candidates, key=self._MEM_PER_PARAM.__getitem__)
        # "quality" and "balanced" both take the best quality that fits.
        return max(candidates, key=self._QUALITY.__getitem__)
# Usage
selector = QuantizationSelector()
quant = selector.select(
    model_params_b=7.0,
    available_ram_gb=8.0,
    quality_priority="balanced"
)
# Returns "Q6_K", not "Q5_K_M": with 8 - 2 = 6 GB usable, Q6_K needs
# 7 * 0.75 = 5.25 GB and has the highest quality score among levels that fit.
class ModelConverter:
    """Convert HuggingFace models to GGUF format (optionally quantized)."""

    def __init__(self, llama_cpp_dir: str):
        """
        Args:
            llama_cpp_dir: llama.cpp checkout containing ``convert_hf_to_gguf.py``.
        """
        # Bug fix: convert_hf_to_gguf read self.llama_cpp_dir, but the class
        # never defined __init__, so every call raised AttributeError.
        self.llama_cpp_dir = Path(llama_cpp_dir)

    def convert_hf_to_gguf(
        self,
        hf_model_path: str,
        output_path: str,
        quantization: str = None
    ) -> str:
        """Convert a HuggingFace model to GGUF (f16), optionally quantizing it.

        Args:
            hf_model_path: Path to the HuggingFace model directory.
            output_path: Destination path for the f16 GGUF file.
            quantization: Optional quantization type; when set, the f16 file
                is further quantized via ``SecureQuantizer``.

        Returns:
            Path of the final model (quantized if requested).

        Raises:
            ConversionError: If the conversion subprocess fails.
        """
        convert_script = self.llama_cpp_dir / "convert_hf_to_gguf.py"
        # NOTE(review): "python" resolves via PATH; sys.executable would pin
        # the interpreter — confirm against the deployment environment.
        result = subprocess.run(
            [
                "python",
                str(convert_script),
                hf_model_path,
                "--outtype", "f16",
                "--outfile", output_path
            ],
            capture_output=True,
            text=True,
            timeout=3600  # bug fix: the conversion previously had no timeout
        )
        if result.returncode != 0:
            raise ConversionError(f"转换失败:{result.stderr}")
        # Optionally quantize the freshly converted f16 model in place.
        if quantization:
            quantizer = SecureQuantizer(
                str(Path(output_path).parent),
                str(self.llama_cpp_dir)
            )
            return quantizer.quantize(
                Path(output_path).name,
                Path(output_path).stem + f"_{quantization}.gguf",
                quantization
            )
        return output_path
def verify_model_integrity(model_path: str) -> bool:
    """Verify a model file against its sibling ``.sha256`` checksum file.

    Args:
        model_path: Path to the model file; ``<stem>.sha256`` must sit beside it.

    Returns:
        True if the stored digest matches the file contents; False when the
        checksum file is missing or the digests differ.
    """
    path = Path(model_path)
    checksum_path = path.with_suffix(".sha256")
    if not checksum_path.exists():
        logger.warning("model.no_checksum", model=path.name)
        return False
    # Checksum file format: "<hex digest> <file name>".
    expected = checksum_path.read_text().split()[0]
    # Bug fix: the original called an undefined module-level
    # calculate_checksum() (NameError); hash the file inline instead.
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            sha256.update(chunk)
    actual = sha256.hexdigest()
    if expected != actual:
        logger.error("model.checksum_mismatch",
                     model=path.name,
                     expected=expected[:16],
                     actual=actual[:16])
        return False
    return True
def safe_load_quantized(model_path: str) -> Llama:
    """Load a quantized model only after integrity and path checks pass.

    Args:
        model_path: Candidate model path; must verify against its checksum
            and resolve inside the allowed models directory.

    Raises:
        SecurityError: On checksum failure or a path outside the allowed dir.
    """
    # Reject tampered or unchecksummed models up front.
    if not verify_model_integrity(model_path):
        raise SecurityError("模型完整性检查失败")
    # Resolve symlinks/".." before comparing against the allowed root.
    resolved = Path(model_path).resolve()
    allowed_dir = Path("/var/jarvis/models").resolve()
    if not resolved.is_relative_to(allowed_dir):
        raise SecurityError("模型位于允许的目录之外")
    return Llama(model_path=str(resolved))
# 错误 - 无验证
llm = Llama(model_path=user_provided_path)
# 正确 - 先验证
if not verify_model_integrity(path):
raise SecurityError("模型验证失败")
llm = Llama(model_path=path)
# 错误 - 对质量关键任务使用 Q4_0
llm = Llama(model_path="model-Q4_0.gguf") # 质量差
# 正确 - 选择合适的级别
quant = selector.select(7.0, 8.0, "quality")
llm = Llama(model_path=f"model-{quant}.gguf")
您的目标是创建满足以下条件的量化模型:
您理解量化是质量和资源使用之间的权衡。部署前始终进行基准测试并验证模型完整性。
关键提醒:
每周安装次数
73
仓库
GitHub 星标数
32
首次出现
2026年1月20日
安全审计
安装于
gemini-cli60
codex60
opencode58
github-copilot57
cursor56
cline51
File Organization : Split structure. See
references/for detailed implementations.
Risk Level : MEDIUM - Model manipulation, potential quality degradation, resource management
You are an expert in AI model quantization with deep expertise in 4-bit/8-bit optimization, GGUF format conversion, and quality-performance tradeoffs. Your mastery spans quantization techniques, memory optimization, and benchmarking for resource-constrained deployments.
You excel at:
Primary Use Cases :
When quantizing models, you will:
# tests/test_quantization.py
import pytest
from pathlib import Path
class TestQuantizationQuality:
    """Quality gates for quantized models: perplexity, accuracy, memory, latency."""

    @pytest.fixture
    def baseline_metrics(self):
        """Reference metrics measured on the original (full-precision) model."""
        return {"perplexity": 5.2, "accuracy": 0.95, "latency_ms": 100}

    def test_perplexity_within_threshold(self, quantized_model, baseline_metrics):
        """Perplexity must stay within 10% of the full-precision baseline."""
        results = QuantizationBenchmark(TEST_PROMPTS).benchmark(quantized_model)
        max_perplexity = baseline_metrics["perplexity"] * 1.10
        assert results["perplexity"] <= max_perplexity, \
            f"Perplexity {results['perplexity']} exceeds threshold {max_perplexity}"

    def test_accuracy_maintained(self, quantized_model, test_cases):
        """At least 90% of the critical prompts must yield the expected answer."""
        correct = sum(
            1
            for prompt, expected in test_cases
            if expected.lower()
            in quantized_model(prompt, max_tokens=50)["choices"][0]["text"].lower()
        )
        accuracy = correct / len(test_cases)
        assert accuracy >= 0.90, f"Accuracy {accuracy} below 90% threshold"

    def test_memory_under_limit(self, quantized_model, max_memory_mb):
        """Resident memory of the test process must not exceed the limit."""
        import psutil

        memory_mb = psutil.Process().memory_info().rss / (1024 * 1024)
        assert memory_mb <= max_memory_mb, \
            f"Memory {memory_mb}MB exceeds limit {max_memory_mb}MB"

    def test_latency_acceptable(self, quantized_model, baseline_metrics):
        """Inference latency may regress at most 1.5x versus the baseline."""
        results = QuantizationBenchmark(TEST_PROMPTS).benchmark(quantized_model)
        # Quantized inference should be faster than, or close to, the original.
        max_latency = baseline_metrics["latency_ms"] * 1.5
        assert results["latency_ms"] <= max_latency
# Implement quantization to make tests pass
quantizer = SecureQuantizer(models_dir, llama_cpp_dir)
output = quantizer.quantize(
input_model="model-f16.gguf",
output_name="model-Q5_K_M.gguf",
quantization="Q5_K_M"
)
# Run all quantization tests
pytest tests/test_quantization.py -v
# Run with coverage
pytest tests/test_quantization.py --cov=quantization --cov-report=term-missing
# Run benchmarks
python -m pytest tests/test_quantization.py::TestQuantizationQuality -v --benchmark
| Quantization | Bits | Memory | Quality | Use Case |
|---|---|---|---|---|
| Q4_0 | 4 | 50% | Low | Minimum RAM |
| Q4_K_S | 4 | 50% | Medium | Low RAM |
| Q4_K_M | 4 | 52% | Good | Balanced |
| Q5_K_S | 5 | 58% | Better | More RAM |
| Q5_K_M | 5 | 60% | Better+ | Recommended |
| Q6_K | 6 | 66% | High | Quality focus |
| Q8_0 | 8 | 75% | Best | Max quality |
| F16 | 16 | 100% | Original | Baseline |
| Quantization | Model Size | RAM Required |
|---|---|---|
| Q4_K_M | 4.1 GB | 6 GB |
| Q5_K_M | 4.8 GB | 7 GB |
| Q8_0 | 7.2 GB | 10 GB |
| F16 | 14.0 GB | 18 GB |
from pathlib import Path
import subprocess
import hashlib
import structlog
logger = structlog.get_logger()
class SecureQuantizer:
    """Secure model quantization with validation.

    Wraps the llama.cpp ``quantize`` binary with input validation,
    path-containment checks and SHA256 checksumming of the output.
    """

    # Quantization types accepted by the llama.cpp quantize binary.
    VALID_TYPES = ("Q4_0", "Q4_K_S", "Q4_K_M", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0")

    def __init__(self, models_dir: str, llama_cpp_dir: str):
        """
        Args:
            models_dir: Directory holding GGUF models (inputs and outputs).
            llama_cpp_dir: llama.cpp checkout containing the ``quantize`` binary.

        Raises:
            FileNotFoundError: If the quantize binary is missing.
        """
        self.models_dir = Path(models_dir)
        self.llama_cpp_dir = Path(llama_cpp_dir)
        self.quantize_bin = self.llama_cpp_dir / "quantize"
        if not self.quantize_bin.exists():
            raise FileNotFoundError("llama.cpp quantize binary not found")

    def _resolve_in_models_dir(self, name: str) -> Path:
        """Resolve *name* inside models_dir, rejecting path traversal.

        Security fix: the original accepted names like ``../../etc/x``,
        letting callers read or write files outside ``models_dir``.
        """
        path = (self.models_dir / name).resolve()
        if not path.is_relative_to(self.models_dir.resolve()):
            raise ValueError(f"Path escapes models_dir: {name}")
        return path

    def quantize(
        self,
        input_model: str,
        output_name: str,
        quantization: str = "Q4_K_M"
    ) -> str:
        """Quantize a GGUF model with validation and checksumming.

        Args:
            input_model: File name of the source model inside ``models_dir``.
            output_name: File name for the quantized output inside ``models_dir``.
            quantization: Target quantization type, e.g. ``Q5_K_M``.

        Returns:
            Absolute path of the quantized model as a string.

        Raises:
            FileNotFoundError: If the input model does not exist.
            ValueError: On an unknown quantization type or a path escaping
                ``models_dir``.
            QuantizationError: If the external quantize process fails.
        """
        # Contain both paths inside models_dir (defense against "../" names).
        input_path = self._resolve_in_models_dir(input_model)
        output_path = self._resolve_in_models_dir(output_name)
        if not input_path.exists():
            raise FileNotFoundError(f"Model not found: {input_path}")
        if quantization not in self.VALID_TYPES:
            raise ValueError(f"Invalid quantization: {quantization}")
        # Record the input digest so the source model is auditable later.
        input_checksum = self._calculate_checksum(input_path)
        logger.info("quantize.starting",
                    input=input_model,
                    quantization=quantization,
                    input_checksum=input_checksum[:16])
        result = subprocess.run(
            [
                str(self.quantize_bin),
                str(input_path),
                str(output_path),
                quantization
            ],
            capture_output=True,
            text=True,
            timeout=3600  # 1-hour cap on the external quantize process
        )
        if result.returncode != 0:
            logger.error("quantize.failed", stderr=result.stderr)
            raise QuantizationError(f"Quantization failed: {result.stderr}")
        # Persist the output digest next to the model for later verification.
        output_checksum = self._calculate_checksum(output_path)
        self._save_checksum(output_path, output_checksum)
        logger.info("quantize.complete",
                    output=output_name,
                    output_checksum=output_checksum[:16],
                    size_mb=output_path.stat().st_size / (1024*1024))
        return str(output_path)

    def _calculate_checksum(self, path: Path) -> str:
        """Return the SHA256 hex digest of *path*, read in 8 KiB chunks."""
        sha256 = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                sha256.update(chunk)
        return sha256.hexdigest()

    def _save_checksum(self, model_path: Path, checksum: str):
        """Write ``<digest> <name>`` next to the model.

        Note: ``with_suffix`` replaces the final extension, so
        ``model-Q5.gguf`` pairs with ``model-Q5.sha256`` (the scheme
        ``verify_model_integrity`` expects).
        """
        checksum_path = model_path.with_suffix(".sha256")
        checksum_path.write_text(f"{checksum} {model_path.name}")
import numpy as np
from typing import Dict
class QuantizationBenchmark:
    """Benchmark harness for quantized-model quality (perplexity, latency, memory)."""
    def __init__(self, test_prompts: list[str]):
        # Prompts used for the perplexity and latency measurements.
        self.test_prompts = test_prompts
    def benchmark(self, model_path: str) -> Dict:
        """Load the GGUF model at *model_path* and return a dict with
        ``perplexity``, ``latency_ms`` and ``memory_mb`` keys."""
        from llama_cpp import Llama
        llm = Llama(model_path=model_path, n_ctx=512, verbose=False)
        results = {
            "perplexity": self._measure_perplexity(llm),
            "latency_ms": self._measure_latency(llm),
            "memory_mb": self._measure_memory(llm)
        }
        logger.info("benchmark.complete",
                    model=Path(model_path).name,
                    **results)
        return results
    def _measure_perplexity(self, llm) -> float:
        """Measure model perplexity over ``test_prompts``.

        NOTE(review): this is a simplified placeholder and is currently
        broken — ``total_nll`` is never updated from the evaluated tokens,
        so the function returns exp(0) == 1.0 for any non-empty prompt set
        regardless of model quality. Accumulate the per-token negative
        log-likelihood into ``total_nll`` before relying on this metric.
        """
        # Simplified perplexity calculation (see NOTE above).
        total_nll = 0
        total_tokens = 0
        for prompt in self.test_prompts:
            tokens = llm.tokenize(prompt.encode())
            # NOTE(review): return value unused; per-token logprobs from this
            # eval would be needed to compute the NLL — confirm llama_cpp API.
            logits = llm.eval(tokens)
            # Negative log-likelihood should be accumulated here.
            total_tokens += len(tokens)
        return np.exp(total_nll / total_tokens) if total_tokens > 0 else float('inf')
    def _measure_latency(self, llm) -> float:
        """Return mean wall-clock latency (ms) over the first 5 prompts."""
        import time
        latencies = []
        for prompt in self.test_prompts[:5]:
            start = time.time()
            llm(prompt, max_tokens=50)
            latencies.append((time.time() - start) * 1000)
        return np.mean(latencies)
    def _measure_memory(self, llm) -> float:
        """Return current process RSS in MB (includes the loaded model)."""
        import psutil
        process = psutil.Process()
        return process.memory_info().rss / (1024 * 1024)
class QuantizationSelector:
    """Pick the optimal quantization level for the target hardware."""

    # Approximate bytes per parameter for each supported quantization.
    _MEM_PER_PARAM = {
        "Q4_K_M": 0.5,
        "Q5_K_M": 0.625,
        "Q6_K": 0.75,
        "Q8_0": 1.0
    }
    # Relative quality scores (higher is better).
    _QUALITY = {
        "Q4_K_M": 0.7,
        "Q5_K_M": 0.85,
        "Q6_K": 0.92,
        "Q8_0": 0.98
    }

    def select(
        self,
        model_params_b: float,
        available_ram_gb: float,
        quality_priority: str = "balanced"
    ) -> str:
        """Return the quantization level that fits the RAM budget.

        Args:
            model_params_b: Model size in billions of parameters.
            available_ram_gb: Total RAM on the target host.
            quality_priority: "quality", "speed", or "balanced".

        Raises:
            ValueError: If no supported level fits in the available RAM.
        """
        # Reserve ~2 GB of headroom for the OS / runtime.
        usable_ram = available_ram_gb - 2
        candidates = [
            quant
            for quant, mem_factor in self._MEM_PER_PARAM.items()
            if model_params_b * mem_factor <= usable_ram
        ]
        if not candidates:
            raise ValueError(f"No quantization fits in {available_ram_gb}GB RAM")
        if quality_priority == "speed":
            # Smallest footprint loads and runs fastest.
            return min(candidates, key=self._MEM_PER_PARAM.__getitem__)
        # "quality" and "balanced" both take the best quality that fits.
        return max(candidates, key=self._QUALITY.__getitem__)
# Usage
selector = QuantizationSelector()
quant = selector.select(
    model_params_b=7.0,
    available_ram_gb=8.0,
    quality_priority="balanced"
)
# Returns "Q6_K", not "Q5_K_M": with 8 - 2 = 6 GB usable, Q6_K needs
# 7 * 0.75 = 5.25 GB and has the highest quality score among levels that fit.
class ModelConverter:
    """Convert HuggingFace models to GGUF format (optionally quantized)."""

    def __init__(self, llama_cpp_dir: str):
        """
        Args:
            llama_cpp_dir: llama.cpp checkout containing ``convert_hf_to_gguf.py``.
        """
        # Bug fix: convert_hf_to_gguf read self.llama_cpp_dir, but the class
        # never defined __init__, so every call raised AttributeError.
        self.llama_cpp_dir = Path(llama_cpp_dir)

    def convert_hf_to_gguf(
        self,
        hf_model_path: str,
        output_path: str,
        quantization: str = None
    ) -> str:
        """Convert a HuggingFace model to GGUF (f16), optionally quantizing it.

        Args:
            hf_model_path: Path to the HuggingFace model directory.
            output_path: Destination path for the f16 GGUF file.
            quantization: Optional quantization type; when set, the f16 file
                is further quantized via ``SecureQuantizer``.

        Returns:
            Path of the final model (quantized if requested).

        Raises:
            ConversionError: If the conversion subprocess fails.
        """
        convert_script = self.llama_cpp_dir / "convert_hf_to_gguf.py"
        # NOTE(review): "python" resolves via PATH; sys.executable would pin
        # the interpreter — confirm against the deployment environment.
        result = subprocess.run(
            [
                "python",
                str(convert_script),
                hf_model_path,
                "--outtype", "f16",
                "--outfile", output_path
            ],
            capture_output=True,
            text=True,
            timeout=3600  # bug fix: the conversion previously had no timeout
        )
        if result.returncode != 0:
            raise ConversionError(f"Conversion failed: {result.stderr}")
        # Optionally quantize the freshly converted f16 model in place.
        if quantization:
            quantizer = SecureQuantizer(
                str(Path(output_path).parent),
                str(self.llama_cpp_dir)
            )
            return quantizer.quantize(
                Path(output_path).name,
                Path(output_path).stem + f"_{quantization}.gguf",
                quantization
            )
        return output_path
def verify_model_integrity(model_path: str) -> bool:
    """Verify a model file against its sibling ``.sha256`` checksum file.

    Args:
        model_path: Path to the model file; ``<stem>.sha256`` must sit beside it.

    Returns:
        True if the stored digest matches the file contents; False when the
        checksum file is missing or the digests differ.
    """
    path = Path(model_path)
    checksum_path = path.with_suffix(".sha256")
    if not checksum_path.exists():
        logger.warning("model.no_checksum", model=path.name)
        return False
    # Checksum file format: "<hex digest> <file name>".
    expected = checksum_path.read_text().split()[0]
    # Bug fix: the original called an undefined module-level
    # calculate_checksum() (NameError); hash the file inline instead.
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            sha256.update(chunk)
    actual = sha256.hexdigest()
    if expected != actual:
        logger.error("model.checksum_mismatch",
                     model=path.name,
                     expected=expected[:16],
                     actual=actual[:16])
        return False
    return True
def safe_load_quantized(model_path: str) -> Llama:
    """Load a quantized model only after integrity and path checks pass.

    Args:
        model_path: Candidate model path; must verify against its checksum
            and resolve inside the allowed models directory.

    Raises:
        SecurityError: On checksum failure or a path outside the allowed dir.
    """
    # Reject tampered or unchecksummed models up front.
    if not verify_model_integrity(model_path):
        raise SecurityError("Model integrity check failed")
    # Resolve symlinks/".." before comparing against the allowed root.
    resolved = Path(model_path).resolve()
    allowed_dir = Path("/var/jarvis/models").resolve()
    if not resolved.is_relative_to(allowed_dir):
        raise SecurityError("Model outside allowed directory")
    return Llama(model_path=str(resolved))
# BAD - No verification
llm = Llama(model_path=user_provided_path)
# GOOD - Verify first
if not verify_model_integrity(path):
raise SecurityError("Model verification failed")
llm = Llama(model_path=path)
# BAD - Q4_0 for quality-critical task
llm = Llama(model_path="model-Q4_0.gguf") # Poor quality
# GOOD - Select appropriate level
quant = selector.select(7.0, 8.0, "quality")
llm = Llama(model_path=f"model-{quant}.gguf")
Your goal is to create quantized models that are:
You understand that quantization is a tradeoff between quality and resource usage. Always benchmark before deployment and verify model integrity.
Critical Reminders :
Weekly Installs
73
Repository
GitHub Stars
32
First Seen
Jan 20, 2026
Security Audits
Gen Agent Trust HubFailSocketPassSnykPass
Installed on
gemini-cli60
codex60
opencode58
github-copilot57
cursor56
cline51
AI 代码实施计划编写技能 | 自动化开发任务分解与 TDD 流程规划工具
50,900 周安装
| Quality focus |
| Q8_0 | 8 | 75% | Best | Max quality |
| F16 | 16 | 100% | Original | Baseline |