segment-anything-model by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill segment-anything-model
Comprehensive guide to using Meta AI's Segment Anything Model for zero-shot image segmentation.
# From GitHub
pip install git+https://github.com/facebookresearch/segment-anything.git
# Optional dependencies
pip install opencv-python pycocotools matplotlib
# Or use HuggingFace transformers
pip install transformers
# ViT-H (largest, most accurate) - 2.4GB
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
# ViT-L (medium) - 1.2GB
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth
# ViT-B (smallest, fastest) - 375MB
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor

# Load model
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
sam.to(device="cuda")

# Create predictor
predictor = SamPredictor(sam)

# Set image (computes embeddings once); SAM expects RGB, OpenCV loads BGR
image = cv2.imread("image.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# Predict with point prompts
input_point = np.array([[500, 375]])  # (x, y) coordinates
input_label = np.array([1])  # 1 = foreground, 0 = background
masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True  # Returns 3 mask options
)

# Select best mask
best_mask = masks[np.argmax(scores)]
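To sanity-check a prediction, overlay the chosen mask on the image. A minimal sketch using matplotlib (listed above as an optional dependency); the blue color and 0.6 alpha are arbitrary choices:

import matplotlib.pyplot as plt

# Paint the best mask over the image as a translucent layer
plt.figure(figsize=(8, 8))
plt.imshow(image)
overlay = np.zeros((*best_mask.shape, 4))
overlay[best_mask] = [0.12, 0.56, 1.0, 0.6]  # RGBA: blue at 60% opacity
plt.imshow(overlay)
plt.scatter([500], [375], c="red", marker="*", s=200)  # The prompt point
plt.axis("off")
plt.show()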
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

# Load model and processor
model = SamModel.from_pretrained("facebook/sam-vit-huge")
processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
model.to("cuda")

# Process image with point prompt
image = Image.open("image.jpg")
input_points = [[[450, 600]]]  # Batch of points
inputs = processor(image, input_points=input_points, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

# Generate masks
with torch.no_grad():
    outputs = model(**inputs)

# Post-process masks to original size
masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(),
    inputs["original_sizes"].cpu(),
    inputs["reshaped_input_sizes"].cpu()
)
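The transformers pipeline also returns predicted IoU scores, which you can use to pick the best of the three candidate masks. A minimal sketch, assuming the default multimask output:

# masks is a list with one tensor per image, shaped (num_prompts, 3, H, W)
iou_scores = outputs.iou_scores.cpu()      # shape: (1, num_prompts, 3)
best_idx = iou_scores[0, 0].argmax().item()
best_mask = masks[0][0, best_idx].numpy()  # boolean H×W mask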
SAM Architecture:

┌─────────────────┐     ┌─────────────────┐     ┌─────────────────┐
│  Image Encoder  │────▶│ Prompt Encoder  │────▶│  Mask Decoder   │
│      (ViT)      │     │ (Points/Boxes)  │     │  (Transformer)  │
└─────────────────┘     └─────────────────┘     └─────────────────┘
        │                       │                       │
 Image Embeddings       Prompt Embeddings          Masks + IoU
  (computed once)          (per prompt)            predictions
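The practical upshot of this split: set_image runs the heavy ViT encoder once, while every subsequent predict call only runs the lightweight prompt encoder and mask decoder. A rough way to confirm the amortization on your own hardware (the 10-prompt loop is arbitrary; timings vary by GPU):

import time

predictor.set_image(image)  # Heavy: full ViT encoder pass
start = time.perf_counter()
for _ in range(10):
    predictor.predict(
        point_coords=np.array([[500, 375]]),
        point_labels=np.array([1]),
        multimask_output=True,
    )
print(f"~{(time.perf_counter() - start) / 10 * 1000:.1f} ms per prompt")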
| Model | Checkpoint | Size | Speed | Accuracy |
|---|---|---|---|---|
| ViT-H | vit_h | 2.4 GB | Slowest | Best |
| ViT-L | vit_l | 1.2 GB | Medium | Good |
| ViT-B | vit_b | 375 MB | Fastest | Good |
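If you need to choose a variant at runtime, a hypothetical helper like the one below can fall back to smaller checkpoints on constrained GPUs. The VRAM thresholds are rough guesses, not official requirements:

import torch

def pick_checkpoint():
    """Pick a SAM variant by available VRAM (thresholds are guesses)."""
    if not torch.cuda.is_available():
        return "vit_b", "sam_vit_b_01ec64.pth"
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if vram_gb >= 16:
        return "vit_h", "sam_vit_h_4b8939.pth"
    if vram_gb >= 10:
        return "vit_l", "sam_vit_l_0b3195.pth"
    return "vit_b", "sam_vit_b_01ec64.pth"

model_type, checkpoint = pick_checkpoint()
sam = sam_model_registry[model_type](checkpoint=checkpoint)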
| Prompt | Description | Use Case |
|---|---|---|
| Point (foreground) | Click on object | Single object selection |
| Point (background) | Click outside object | Exclude regions |
| Bounding box | Rectangle around object | Larger objects |
| Previous mask | Low-res mask input | Iterative refinement |
# Single foreground point
input_point = np.array([[500, 375]])
input_label = np.array([1])
masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True
)

# Multiple points (foreground + background)
input_points = np.array([[500, 375], [600, 400], [450, 300]])
input_labels = np.array([1, 1, 0])  # 2 foreground, 1 background
masks, scores, logits = predictor.predict(
    point_coords=input_points,
    point_labels=input_labels,
    multimask_output=False  # Single mask when prompts are clear
)

# Bounding box [x1, y1, x2, y2]
input_box = np.array([425, 600, 700, 875])
masks, scores, logits = predictor.predict(
    box=input_box,
    multimask_output=False
)

# Box + points for precise control
masks, scores, logits = predictor.predict(
    point_coords=np.array([[500, 375]]),
    point_labels=np.array([1]),
    box=np.array([400, 300, 700, 600]),
    multimask_output=False
)

# Initial prediction
masks, scores, logits = predictor.predict(
    point_coords=np.array([[500, 375]]),
    point_labels=np.array([1]),
    multimask_output=True
)

# Refine with additional point using previous mask
masks, scores, logits = predictor.predict(
    point_coords=np.array([[500, 375], [550, 400]]),
    point_labels=np.array([1, 0]),  # Add background point
    mask_input=logits[np.argmax(scores)][None, :, :],  # Use best mask
    multimask_output=False
)
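This pattern generalizes to a click-driven loop: keep accumulating points and always feed back the best logits from the previous round. A minimal sketch, assuming a clicks list of (x, y, label) tuples coming from your UI:

def refine(predictor, clicks):
    """Re-run prediction after each click, reusing the previous best logits."""
    coords = np.array([[x, y] for x, y, _ in clicks])
    labels = np.array([label for _, _, label in clicks])
    prev_logits = None
    for i in range(1, len(clicks) + 1):
        masks, scores, logits = predictor.predict(
            point_coords=coords[:i],
            point_labels=labels[:i],
            mask_input=prev_logits,
            multimask_output=(i == 1),  # Explore options only on the first click
        )
        prev_logits = logits[np.argmax(scores)][None, :, :]
    return masks[np.argmax(scores)]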
from segment_anything import SamAutomaticMaskGenerator
# Create generator
mask_generator = SamAutomaticMaskGenerator(sam)
# Generate all masks
masks = mask_generator.generate(image)
# Each mask contains:
# - segmentation: binary mask
# - bbox: [x, y, w, h]
# - area: pixel count
# - predicted_iou: quality score
# - stability_score: robustness score
# - point_coords: generating point
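To eyeball the result, paint every generated mask over the image in a random translucent color, much like the helper in the official demo notebook. A minimal sketch with matplotlib:

import matplotlib.pyplot as plt

def show_masks(image, masks):
    """Overlay each mask in a random color, largest first so small ones stay visible."""
    plt.imshow(image)
    canvas = np.zeros((*image.shape[:2], 4))
    for m in sorted(masks, key=lambda x: x["area"], reverse=True):
        canvas[m["segmentation"]] = np.concatenate([np.random.random(3), [0.5]])
    plt.imshow(canvas)
    plt.axis("off")
    plt.show()

show_masks(image, masks)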
mask_generator = SamAutomaticMaskGenerator(
    model=sam,
    points_per_side=32,               # Grid density (more = more masks)
    pred_iou_thresh=0.88,             # Quality threshold
    stability_score_thresh=0.95,      # Stability threshold
    crop_n_layers=1,                  # Multi-scale crops
    crop_n_points_downscale_factor=2,
    min_mask_region_area=100,         # Remove tiny masks
)
masks = mask_generator.generate(image)
# Sort by area (largest first)
masks = sorted(masks, key=lambda x: x['area'], reverse=True)
# Filter by predicted IoU
high_quality = [m for m in masks if m['predicted_iou'] > 0.9]
# Filter by stability score
stable_masks = [m for m in masks if m['stability_score'] > 0.95]
# Process multiple images efficiently
images = [cv2.imread(f"image_{i}.jpg") for i in range(10)]
all_masks = []
for image in images:
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # SAM expects RGB
    predictor.set_image(image)
    masks, _, _ = predictor.predict(
        point_coords=np.array([[500, 375]]),
        point_labels=np.array([1]),
        multimask_output=True
    )
    all_masks.append(masks)
# Process multiple prompts efficiently (one image encoding)
predictor.set_image(image)

# Batch of point prompts
points = [
    np.array([[100, 100]]),
    np.array([[200, 200]]),
    np.array([[300, 300]])
]
all_masks = []
for point in points:
    masks, scores, _ = predictor.predict(
        point_coords=point,
        point_labels=np.array([1]),
        multimask_output=True
    )
    all_masks.append(masks[np.argmax(scores)])
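For many prompts on one image, the repo also exposes a batched path, predict_torch, which pushes all prompts through the decoder in a single call. A sketch using batched boxes; coordinates must first be mapped into the model's input frame with the predictor's transform:

import torch

boxes = torch.tensor([
    [425, 600, 700, 875],
    [100, 100, 300, 300],
], device="cuda")
transformed = predictor.transform.apply_boxes_torch(boxes, image.shape[:2])
masks, scores, logits = predictor.predict_torch(
    point_coords=None,
    point_labels=None,
    boxes=transformed,
    multimask_output=False,
)
# masks: (num_boxes, 1, H, W) tensor on the GPU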
python scripts/export_onnx_model.py \
--checkpoint sam_vit_h_4b8939.pth \
--model-type vit_h \
--output sam_onnx.onnx \
--return-single-mask
import onnxruntime

# Load ONNX model
ort_session = onnxruntime.InferenceSession("sam_onnx.onnx")

# Run inference (image embeddings computed separately)
outputs = ort_session.run(
    None,
    {
        "image_embeddings": image_embeddings,
        "point_coords": point_coords,
        "point_labels": point_labels,
        "mask_input": np.zeros((1, 1, 256, 256), dtype=np.float32),
        "has_mask_input": np.array([0], dtype=np.float32),
        "orig_im_size": np.array([h, w], dtype=np.float32)
    }
)
masks = outputs[0]  # run() returns a list of outputs; the masks come first
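The placeholder inputs above (image_embeddings, point_coords, point_labels, h, w) come from the regular PyTorch predictor. A sketch following the official example notebook; note the extra padding point with label -1 that the exported model expects when no box prompt is given:

predictor.set_image(image)
image_embeddings = predictor.get_image_embedding().cpu().numpy()

input_point = np.array([[500, 375]])
input_label = np.array([1])

# Append a dummy padding point (label -1) since there is no box prompt
onnx_coords = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :]
onnx_labels = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32)

# Rescale pixel coordinates into the model's input frame
point_coords = predictor.transform.apply_coords(onnx_coords, image.shape[:2]).astype(np.float32)
point_labels = onnx_labels
h, w = image.shape[:2]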
import cv2

# Load model
predictor = SamPredictor(sam)
predictor.set_image(image)

def on_click(event, x, y, flags, param):
    if event == cv2.EVENT_LBUTTONDOWN:
        # Foreground point
        masks, scores, _ = predictor.predict(
            point_coords=np.array([[x, y]]),
            point_labels=np.array([1]),
            multimask_output=True
        )
        # Display best mask
        display_mask(masks[np.argmax(scores)])
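A sketch of wiring the callback into an OpenCV window; display_mask is a stand-in you would implement yourself, for example:

def display_mask(mask):
    """Blend the mask over the image (the 50/50 blend and blue tint are arbitrary)."""
    vis = image.copy()
    vis[mask] = (0.5 * vis[mask] + 0.5 * np.array([30, 144, 255])).astype(np.uint8)
    cv2.imshow("SAM", cv2.cvtColor(vis, cv2.COLOR_RGB2BGR))

cv2.namedWindow("SAM")
cv2.setMouseCallback("SAM", on_click)
cv2.imshow("SAM", cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
cv2.waitKey(0)
cv2.destroyAllWindows()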
def extract_object(image, point):
    """Extract object at point with transparent background."""
    predictor.set_image(image)
    masks, scores, _ = predictor.predict(
        point_coords=np.array([point]),
        point_labels=np.array([1]),
        multimask_output=True
    )
    best_mask = masks[np.argmax(scores)]
    # Create RGBA output
    rgba = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
    rgba[:, :, :3] = image
    rgba[:, :, 3] = best_mask * 255
    return rgba
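Usage, saving the cutout as a transparent PNG; PIL is simpler here because cv2.imwrite would expect BGRA channel order:

from PIL import Image

cutout = extract_object(image, (500, 375))  # image must be RGB, as set up above
Image.fromarray(cutout).save("cutout.png")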
# Process medical images (grayscale to RGB)
medical_image = cv2.imread("scan.png", cv2.IMREAD_GRAYSCALE)
rgb_image = cv2.cvtColor(medical_image, cv2.COLOR_GRAY2RGB)
predictor.set_image(rgb_image)
# Segment region of interest
masks, scores, _ = predictor.predict(
    box=np.array([x1, y1, x2, y2]),  # ROI bounding box
    multimask_output=True
)
# SamAutomaticMaskGenerator output
{
    "segmentation": np.ndarray,  # H×W binary mask
    "bbox": [x, y, w, h],        # Bounding box
    "area": int,                 # Pixel count
    "predicted_iou": float,      # 0-1 quality score
    "stability_score": float,    # 0-1 robustness score
    "crop_box": [x, y, w, h],    # Generation crop region
    "point_coords": [[x, y]],    # Input point
}
from pycocotools import mask as mask_utils
# Encode mask to RLE
rle = mask_utils.encode(np.asfortranarray(mask.astype(np.uint8)))
rle["counts"] = rle["counts"].decode("utf-8")
# Decode RLE to mask
decoded_mask = mask_utils.decode(rle)
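Combined with the automatic generator output, this gives a compact way to dump all masks as COCO-style annotations. A minimal sketch; the category_id of 1 is a placeholder:

import json

annotations = []
for i, m in enumerate(masks):
    rle = mask_utils.encode(np.asfortranarray(m["segmentation"].astype(np.uint8)))
    rle["counts"] = rle["counts"].decode("utf-8")
    annotations.append({
        "id": i,
        "category_id": 1,  # Placeholder category
        "segmentation": rle,
        "bbox": [float(v) for v in m["bbox"]],
        "area": int(m["area"]),
        "score": float(m["predicted_iou"]),
    })

with open("annotations.json", "w") as f:
    json.dump(annotations, f)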
# Use smaller model for limited VRAM
sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
# Process images in batches
# Clear CUDA cache between large batches
torch.cuda.empty_cache()
# Use half precision
sam = sam.half()
# Reduce points for automatic generation
mask_generator = SamAutomaticMaskGenerator(
    model=sam,
    points_per_side=16,  # Default is 32
)
# Use ONNX for deployment
# Export with --return-single-mask for faster inference
| Issue | Solution |
|---|---|
| Out of memory | Use ViT-B model, reduce image size |
| Slow inference | Use ViT-B, reduce points_per_side |
| Poor mask quality | Try different prompts, use box + points |
| Edge artifacts | Use stability_score filtering |
| Small objects missed | Increase points_per_side |