flash-moe-inference by aradotso/trending-skills
npx skills add https://github.com/aradotso/trending-skills --skill flash-moe-inference
技能来自 ara.so — Daily 2026 技能集。
Flash-MoE 是一个纯 C/Objective-C/Metal 推理引擎,可在配备 48GB RAM 的 MacBook Pro 上以 4.4+ 令牌/秒 的速度运行 Qwen3.5-397B-A17B(3970 亿参数的混合专家模型)。它按需从 NVMe SSD 流式传输 209GB 的专家权重——无需 Python,无需机器学习框架,只需 C、Objective-C 和手动调优的 Metal 着色器。
# 克隆仓库
git clone https://github.com/danveloper/flash-moe
cd flash-moe/metal_infer
# 构建所有内容
make
# 验证构建产物
ls infer chat main
Makefile 会编译 infer.m、chat.m、main.m,并为 shaders.metal 编译 Metal 着色器。
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
# 在 metal_infer/ 目录下
# 指向你下载的 Qwen3.5-397B safetensors 目录
python3 extract_weights.py /path/to/Qwen3.5-397B-A17B-Instruct/
# 生成:
# model_weights.bin (约 5.5GB,运行时内存映射)
# model_weights.json (张量清单)
# vocab.bin (词汇表)
# tokenizer.bin (BPE 分词器数据)
# 从仓库根目录
python3 repack_experts.py /path/to/Qwen3.5-397B-A17B-Instruct/ metal_infer/packed_experts/
# 生成 packed_experts/ 目录(约 209GB)
# 每个专家是一个单独的文件:layer_XX_expert_YYYY.bin
# 将 4 位专家转换为 2 位(节省约 89GB,总计 120GB)
python3 metal_infer/repack_experts_2bit.py \
metal_infer/packed_experts/ \
metal_infer/packed_experts_2bit/
cd metal_infer
# 4 位推理(生产质量,工具调用有效)
./infer --prompt "解释量子计算" --tokens 100
# 2 位推理(更快,但会破坏 JSON/工具调用)
./infer --prompt "解释量子计算" --tokens 100 --2bit
# 逐层时间细分
./infer --prompt "你好" --tokens 20 --timing
./chat
# 打开支持完整工具调用的 TUI
# 默认使用 4 位专家
./main
# 运行纯专家前向传播基准测试
# 报告无注意力开销的令牌/秒
flash-moe/
├── paper/
│ └── flash_moe.pdf # 完整技术论文
├── metal_infer/
│ ├── infer.m # 完整推理引擎(约 7000 行)
│ ├── shaders.metal # Metal 计算内核(约 1200 行)
│ ├── chat.m # 交互式聊天 TUI
│ ├── tokenizer.h # 单头文件 C BPE 分词器(449 行)
│ ├── main.m # 仅 MoE 基准测试
│ ├── Makefile
│ ├── extract_weights.py # Safetensors → model_weights.bin
│ ├── repack_experts_2bit.py # 4 位 → 2 位重量化
│ ├── train_predictor.py # 专家路由预测分析
│ ├── model_weights.bin # 非专家权重(内存映射)
│ ├── model_weights.json # 张量清单
│ ├── vocab.bin
│ ├── tokenizer.bin
│ ├── packed_experts/ # 4 位专家文件(209GB)
│ └── packed_experts_2bit/ # 2 位专家文件(120GB,可选)
├── repack_experts.py # 从 safetensors 打包 4 位专家
├── progress.py # 结果可视化
└── results.tsv # 实验日志
该模型有 60 个 Transformer 层:
CMD3(prev) → CMD1: 注意力投影 + delta-net [1.22ms GPU]
→ CPU: 刷新结果 [0.01ms CPU]
→ CMD2: o_proj + 归一化 + 路由 + 共享 [0.55ms GPU]
→ CPU: softmax + topK 路由 [0.003ms]
→ I/O: 并行预读 K=4 个专家 [2.41ms SSD]
→ CMD3: 专家前向传播 + 组合 + 归一化 [0.04ms 编码,延迟执行]
shaders.metal 文件包含手写内核。关键内核:
// 4-bit dequantized matrix-vector multiply (FMA-optimized).
// Key insight: use fma(nibble, scale*x, bias*x) instead of (nibble*scale + bias)*x —
// pre-computing scale*x and bias*x fuses dequantization and multiply into one FMA instruction.
kernel void matvec_4bit_fma(
device const uint8_t* weights [[buffer(0)]],   // packed 4-bit weight nibbles
device const float* scales [[buffer(1)]],      // per-group dequantization scales
device const float* biases [[buffer(2)]],      // per-group dequantization biases
device const float* x [[buffer(3)]],           // input activation vector
device float* out [[buffer(4)]],               // output vector
uint tid [[thread_position_in_threadgroup]],
uint gid [[threadgroup_position_in_grid]])
{
// ... tiled SIMD-reduction FMA kernel (body elided in this excerpt)
// 12% faster than the naive (nibble * scale + bias) * x formulation
}
// Fused SwiGLU activation: gate[gid] <- silu(gate[gid]) * up[gid], computed in place.
kernel void swiglu(device float* gate [[buffer(0)]],
device const float* up [[buffer(1)]],
uint gid [[thread_position_in_grid]])
{
    // silu(g) = g / (1 + e^-g); overwrite the gate buffer with the activated product.
    float g = gate[gid];
    float denom = 1.0f + exp(-g);
    gate[gid] = (g / denom) * up[gid];
}
// RMS 归一化(两遍)
kernel void rms_norm_pass1(...) // 平方和归约
kernel void rms_norm_pass2(...) // 应用归一化
// GPU RoPE(与 Q 解交织和 K 归一化融合)
kernel void rope_qk(...)
// MoE 组合 + 残差 + sigmoid 门控(融合)
kernel void moe_combine_residual(...)
核心创新——每层仅从 SSD 加载 K=4 个活跃专家:
// Parallel expert loading using GCD dispatch groups.
// From infer.m (conceptual pattern): the K active experts chosen by the router
// are read from SSD concurrently; dispatch_group_wait below is the sync point.
dispatch_group_t group = dispatch_group_create();
dispatch_queue_t ioQueue = dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
for (int k = 0; k < K_EXPERTS; k++) {
    int expert_id = top_k_indices[k];
    dispatch_group_async(group, ioQueue, ^{
        // Each expert: ~6.75MB at 4-bit.
        char path[256];
        snprintf(path, sizeof(path),
                 "packed_experts/layer_%02d_expert_%04d.bin",
                 layer, expert_id);
        int fd = open(path, O_RDONLY);
        if (fd < 0) {
            // Without this check a missing file would pread() on fd=-1 and
            // silently leave expert_buffer[k] stale.
            fprintf(stderr, "expert load: open(%s) failed\n", path);
            return;
        }
        // pread() — OS page cache handles LRU eviction; verify the full read.
        ssize_t got = pread(fd, expert_buffer[k], expert_size, 0);
        if (got != (ssize_t)expert_size) {
            fprintf(stderr, "expert load: short read on %s\n", path);
        }
        close(fd);
    });
}
dispatch_group_wait(group, DISPATCH_TIME_FOREVER);
// GPU compute follows — the serial pipeline is hardware-optimal on Apple Silicon.
// GPU 计算紧随其后 —— 在 Apple Silicon 上,串行流水线是硬件最优的
为什么用 pread() 而不是 mmap():mmap 在冷数据上会产生每页错误开销(约慢 5 倍)。直接的 pread() 配合操作系统页缓存自然能达到约 71% 的命中率。
循环更新使用 Accelerate BLAS —— 比标量计算快 64%:
// GatedDeltaNet state update per head (conceptual pattern).
// state: 64 heads, each a contiguous 128×128 float matrix.
// From infer.m.
#import <Accelerate/Accelerate.h>
for (int h = 0; h < 64; h++) {
float* S = state + h * 128 * 128; // 128×128 state matrix for head h
float* q = Q + h * 128; // query slice for head h
float* k = K + h * 128; // key slice for head h
float* v = V + h * 128; // value slice for head h
// β·(k⊗v) outer-product update.
// cblas_sger: S += beta * (k ⊗ v)
cblas_sger(CblasRowMajor, 128, 128,
beta[h], k, 1, v, 1, S, 128);
// Decay: S = alpha * S.
// NOTE(review): decay is applied AFTER the rank-1 update, so alpha also scales
// the fresh k⊗v term — confirm against the paper whether the intended recurrence
// is S = alpha*S + beta*(k⊗v) (decay first) instead.
cblas_sscal(128 * 128, alpha[h], S, 1);
// Output: o = S @ q (per-head matrix-vector product).
cblas_sgemv(CblasRowMajor, CblasNoTrans,
128, 128, 1.0f, S, 128, q, 1, 0.0f,
output + h * 128, 1);
}
(例如输出 \name\ 而不是 "name")。使用 F_NOCACHE 标志可避免页缓存抖动。
| 方法 | 失败原因 |
|---|---|
| mmap() 专家文件 | 每页错误开销:比 pread() 慢 5 倍 |
| dispatch_io | dispatch_data 管理开销:-70% |
| F_RDADVISE 预取 | SSD DMA 和 GPU 共享内存控制器 —— 并发访问:-73% GPU 速度 |
| 自定义 Metal LRU 缓存 | GPU 内存压力:比操作系统页缓存慢 38% |
| LZ4 专家压缩 | 解压缩开销 > 热缓存节省:-13% |
| 时序专家预测 | 25% 命中率,浪费 SSD 带宽:-18% |
| 推测性早期路由 | 缓存污染:-38% |
| MTP 推测解码 | MoE I/O 随令牌数扩展(与密集模型不同):收支平衡 |
| 自旋轮询 GPU 等待 | CPU 热节流与 GPU 竞争:-23% |
| 并行 SSD + GPU 重叠 | 统一内存控制器仲裁:净负收益 |
关键原则:在 Apple Silicon 上,GPU DMA 和 SSD DMA 共享同一个内存控制器。串行流水线(GPU → SSD → GPU)是硬件最优的。
# 确保已安装 Xcode CLI 工具
xcode-select --install
# 检查 Metal 编译器是否可用
xcrun -sdk macosx metal --version
引擎设计为使用约 6GB 活跃内存:
model_weights.bin(内存映射,只读)
如果出现 OOM,请检查是否有其他进程消耗统一内存:
sudo memory_pressure
vm_stat
# 检查 SSD 速度 —— 需要约 17GB/s 以达到目标性能
# 运行时附带计时以识别瓶颈
./infer --prompt "你好" --tokens 5 --timing
# 确认 packed_experts/ 在内部 SSD 上,而非外置驱动器
diskutil info /
# infer.m 期望的默认路径:
# metal_infer/packed_experts/ (4 位)
# metal_infer/packed_experts_2bit/ (2 位)
# 确保从 metal_infer/ 目录运行
cd metal_infer
./infer --prompt "测试"
使用 4 位,而非 2 位。2 位量化会损坏 JSON 输出中的引号字符,导致工具调用不可靠。对于代理工作负载,请始终使用默认的 4 位配置。
引擎显式管理所有分配:
model_weights.bin 是只读内存映射的 —— 内核管理页面
每周安装次数
119
仓库
GitHub 星标数
10
首次出现
3 天前
安全审计
安装于
github-copilot119
codex119
warp119
kimi-cli119
amp119
cline119
Skill by ara.so — Daily 2026 Skills collection.
Flash-MoE is a pure C/Objective-C/Metal inference engine that runs Qwen3.5-397B-A17B (397B parameter Mixture-of-Experts) on a MacBook Pro with 48GB RAM at 4.4+ tokens/second. It streams 209GB of expert weights from NVMe SSD on demand — no Python, no ML frameworks, just C, Objective-C, and hand-tuned Metal shaders.
# Clone the repo
git clone https://github.com/danveloper/flash-moe
cd flash-moe/metal_infer
# Build everything
make
# Verify build artifacts
ls infer chat main
The Makefile compiles infer.m, chat.m, main.m with Metal shader compilation for shaders.metal.
# From the metal_infer/ directory
# Point to your downloaded Qwen3.5-397B safetensors directory
python3 extract_weights.py /path/to/Qwen3.5-397B-A17B-Instruct/
# Produces:
# model_weights.bin (~5.5GB, mmap'd at runtime)
# model_weights.json (tensor manifest)
# vocab.bin (vocabulary)
# tokenizer.bin (BPE tokenizer data)
# From repo root
python3 repack_experts.py /path/to/Qwen3.5-397B-A17B-Instruct/ metal_infer/packed_experts/
# Produces packed_experts/ directory (~209GB)
# Each expert is a separate file: layer_XX_expert_YYYY.bin
# Convert 4-bit experts to 2-bit (saves ~89GB, 120GB total)
python3 metal_infer/repack_experts_2bit.py \
metal_infer/packed_experts/ \
metal_infer/packed_experts_2bit/
cd metal_infer
# 4-bit inference (production quality, tool calling works)
./infer --prompt "Explain quantum computing" --tokens 100
# 2-bit inference (faster, breaks JSON/tool calling)
./infer --prompt "Explain quantum computing" --tokens 100 --2bit
# Per-layer timing breakdown
./infer --prompt "Hello" --tokens 20 --timing
./chat
# Opens TUI with full tool calling support
# Uses 4-bit experts by default
./main
# Runs pure expert forward-pass benchmark
# Reports tokens/sec without attention overhead
flash-moe/
├── paper/
│ └── flash_moe.pdf # Full technical paper
├── metal_infer/
│ ├── infer.m # Complete inference engine (~7000 lines)
│ ├── shaders.metal # Metal compute kernels (~1200 lines)
│ ├── chat.m # Interactive chat TUI
│ ├── tokenizer.h # Single-header C BPE tokenizer (449 lines)
│ ├── main.m # MoE-only benchmark
│ ├── Makefile
│ ├── extract_weights.py # Safetensors → model_weights.bin
│ ├── repack_experts_2bit.py # 4-bit → 2-bit requantization
│ ├── train_predictor.py # Expert routing prediction analysis
│ ├── model_weights.bin # Non-expert weights (mmap'd)
│ ├── model_weights.json # Tensor manifest
│ ├── vocab.bin
│ ├── tokenizer.bin
│ ├── packed_experts/ # 4-bit expert files (209GB)
│ └── packed_experts_2bit/ # 2-bit expert files (120GB, optional)
├── repack_experts.py # 4-bit expert packing from safetensors
├── progress.py # Results visualization
└── results.tsv # Experiment log
The model has 60 transformer layers:
CMD3(prev) → CMD1: attention projections + delta-net [1.22ms GPU]
→ CPU: flush results [0.01ms CPU]
→ CMD2: o_proj + norm + routing + shared [0.55ms GPU]
→ CPU: softmax + topK routing [0.003ms]
→ I/O: parallel pread K=4 experts [2.41ms SSD]
→ CMD3: expert forward + combine + norm [0.04ms encode, DEFERRED]
The shaders.metal file contains hand-written kernels. Key kernels:
// 4-bit dequantized matrix-vector multiply (FMA-optimized).
// Key insight: fma(nibble, scale*x, bias*x) instead of (nibble*scale + bias)*x —
// pre-computing scale*x and bias*x fuses dequant+multiply into one FMA instruction.
kernel void matvec_4bit_fma(
device const uint8_t* weights [[buffer(0)]],   // packed 4-bit weight nibbles
device const float* scales [[buffer(1)]],      // per-group dequantization scales
device const float* biases [[buffer(2)]],      // per-group dequantization biases
device const float* x [[buffer(3)]],           // input activation vector
device float* out [[buffer(4)]],               // output vector
uint tid [[thread_position_in_threadgroup]],
uint gid [[threadgroup_position_in_grid]])
{
// ... tiled SIMD-reduced FMA kernel (body elided in this excerpt)
// 12% faster than naive (nibble * scale + bias) * x
}
// Fused SwiGLU activation: gate[gid] <- silu(gate[gid]) * up[gid], computed in place.
kernel void swiglu(device float* gate [[buffer(0)]],
device const float* up [[buffer(1)]],
uint gid [[thread_position_in_grid]])
{
    // silu(g) = g / (1 + e^-g); overwrite the gate buffer with the activated product.
    float g = gate[gid];
    float denom = 1.0f + exp(-g);
    gate[gid] = (g / denom) * up[gid];
}
// RMS normalization (two-pass)
kernel void rms_norm_pass1(...) // sum of squares reduction
kernel void rms_norm_pass2(...) // apply normalization
// GPU RoPE (fused with Q deinterleave and K normalization)
kernel void rope_qk(...)
// MoE combine + residual + sigmoid gate (fused)
kernel void moe_combine_residual(...)
The core innovation — loading only K=4 active experts per layer from SSD:
// Parallel expert loading using GCD dispatch groups.
// From infer.m (conceptual pattern): the K active experts chosen by the router
// are read from SSD concurrently; dispatch_group_wait below is the sync point.
dispatch_group_t group = dispatch_group_create();
dispatch_queue_t ioQueue = dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
for (int k = 0; k < K_EXPERTS; k++) {
    int expert_id = top_k_indices[k];
    dispatch_group_async(group, ioQueue, ^{
        // Each expert: ~6.75MB at 4-bit.
        char path[256];
        snprintf(path, sizeof(path),
                 "packed_experts/layer_%02d_expert_%04d.bin",
                 layer, expert_id);
        int fd = open(path, O_RDONLY);
        if (fd < 0) {
            // Without this check a missing file would pread() on fd=-1 and
            // silently leave expert_buffer[k] stale.
            fprintf(stderr, "expert load: open(%s) failed\n", path);
            return;
        }
        // pread() — OS page cache handles LRU eviction; verify the full read.
        ssize_t got = pread(fd, expert_buffer[k], expert_size, 0);
        if (got != (ssize_t)expert_size) {
            fprintf(stderr, "expert load: short read on %s\n", path);
        }
        close(fd);
    });
}
dispatch_group_wait(group, DISPATCH_TIME_FOREVER);
// GPU compute follows — serial pipeline is hardware-optimal on Apple Silicon
// GPU compute follows — serial pipeline is hardware-optimal on Apple Silicon
Why pread() not mmap(): mmap incurs per-page fault overhead on cold data (~5x slower). Direct pread() with OS page cache achieves ~71% hit rate naturally.
The recurrence update uses Accelerate BLAS — 64% faster than scalar:
// GatedDeltaNet state update per head (conceptual pattern).
// state: 64 heads, each a contiguous 128×128 float matrix.
// From infer.m.
#import <Accelerate/Accelerate.h>
for (int h = 0; h < 64; h++) {
float* S = state + h * 128 * 128; // 128×128 state matrix for head h
float* q = Q + h * 128; // query slice for head h
float* k = K + h * 128; // key slice for head h
float* v = V + h * 128; // value slice for head h
// β·(k⊗v) outer product update.
// cblas_sger: S += beta * (k ⊗ v)
cblas_sger(CblasRowMajor, 128, 128,
beta[h], k, 1, v, 1, S, 128);
// Decay: S = alpha * S.
// NOTE(review): decay is applied AFTER the rank-1 update, so alpha also scales
// the fresh k⊗v term — confirm against the paper whether the intended recurrence
// is S = alpha*S + beta*(k⊗v) (decay first) instead.
cblas_sscal(128 * 128, alpha[h], S, 1);
// Output: o = S @ q (per-head matrix-vector product).
cblas_sgemv(CblasRowMajor, CblasNoTrans,
128, 128, 1.0f, S, 128, q, 1, 0.0f,
output + h * 128, 1);
}
(e.g. emitting \name\ instead of "name"). Use the F_NOCACHE flag to avoid page cache thrashing.
| Approach | Why it fails |
|---|---|
| mmap() expert files | Per-page fault overhead: 5x slower than pread() |
| dispatch_io | dispatch_data management overhead: -70% |
| F_RDADVISE prefetch | SSD DMA + GPU share memory controller — concurrent access: -73% GPU speed |
| Custom Metal LRU cache | GPU memory pressure: -38% vs OS page cache |
| LZ4 expert compression | Decompress overhead > warm cache savings: -13% |
| Temporal expert prediction | 25% hit rate, wastes SSD bandwidth: -18% |
| Speculative early routing |
Key principle: On Apple Silicon, GPU DMA and SSD DMA share the same memory controller. The serial pipeline (GPU → SSD → GPU) is hardware-optimal.
# Ensure Xcode CLI tools are installed
xcode-select --install
# Check Metal compiler is available
xcrun -sdk macosx metal --version
The engine is designed to use ~6GB active:
model_weights.bin (mmap'd, read-only)
If you see OOM, check for other processes consuming unified memory:
sudo memory_pressure
vm_stat
# Check SSD speed — needs ~17GB/s for target performance
# Run with timing to identify bottleneck
./infer --prompt "Hello" --tokens 5 --timing
# Verify packed_experts/ is on internal SSD, not external drive
diskutil info /
# Default paths expected by infer.m:
# metal_infer/packed_experts/ (4-bit)
# metal_infer/packed_experts_2bit/ (2-bit)
# Ensure you're running from metal_infer/ directory
cd metal_infer
./infer --prompt "test"
Use 4-bit, not 2-bit. The 2-bit quantization corrupts quote characters in JSON output, making tool calling unreliable. Always use the default 4-bit configuration for agentic workloads.
The engine explicitly manages all allocations:
model_weights.bin is mmap'd read-only — kernel manages pages
Weekly Installs
119
Repository
GitHub Stars
10
First Seen
3 days ago
Security Audits
Gen Agent Trust Hub: Fail · Socket: Warn · Snyk: Warn
Installed on
github-copilot119
codex119
warp119
kimi-cli119
amp119
cline119
AI 代码实施计划编写技能 | 自动化开发任务分解与 TDD 流程规划工具
49,800 周安装
PPTX文档自动化技能:使用Python和Node.js编程创建编辑PowerPoint演示文稿
81 周安装
project-discover:AI辅助项目逆向工程与知识沉淀工具,一键建立项目SSOT
81 周安装
Angular 17+ 现代开发规范:独立组件、Signal 状态管理与原生控制流最佳实践
81 周安装
Tailwind CSS 官方插件详解:排版与表单样式优化,提升前端开发效率
81 周安装
机器学习模型训练指南:从数据准备到模型评估的完整流程与最佳实践
81 周安装
WebSocket实时通信系统实现 - Socket.IO服务器与客户端完整代码示例
81 周安装
| Cache pollution: -38% |
| MTP speculative decoding | MoE I/O scales per-token (unlike dense models): break-even |
| Spin-poll GPU wait | CPU thermal throttle competes with GPU: -23% |
| Parallel SSD + GPU overlap | Unified memory controller arbitration: net negative |