monitoring-observability by akillness/oh-my-skills
npx skills add https://github.com/akillness/oh-my-skills --skill monitoring-observability
应用埋点 (Node.js):
import express from 'express';
import promClient from 'prom-client';
const app = express();

// Default metrics (CPU, memory, etc.)
promClient.collectDefaultMetrics();

// Custom metrics: request latency histogram and request counter, both
// labelled by HTTP method, matched route and response status code.
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code']
});

const httpRequestTotal = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// Fixed `route` label for requests that matched no Express route. Using the
// raw URL path here would create a new time series for every distinct URL
// (404 probes, IDs embedded in paths), i.e. unbounded label cardinality.
const UNMATCHED_ROUTE = 'unmatched';

// Middleware that records one observation per finished response.
app.use((req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    // Date.now() is in milliseconds; the metric is declared in seconds.
    const duration = (Date.now() - start) / 1000;
    const labels = {
      method: req.method,
      // Only the route pattern (e.g. '/users/:id') is used as a label value;
      // `||` also maps an empty pattern to the placeholder.
      route: req.route?.path || UNMATCHED_ROUTE,
      status_code: res.statusCode
    };

    httpRequestDuration.observe(labels, duration);
    httpRequestTotal.inc(labels);
  });

  next();
});

// Metrics endpoint scraped by Prometheus.
app.get('/metrics', async (req, res) => {
  try {
    // Collect first so a failure does not leave a half-configured response.
    const body = await promClient.register.metrics();
    res.set('Content-Type', promClient.register.contentType);
    res.end(body);
  } catch (error: unknown) {
    // Express versions before 5 do not forward a rejected promise from an
    // async handler to the error middleware, so answer explicitly.
    res.status(500).end(error instanceof Error ? error.message : String(error));
  }
});

app.listen(3000);
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
prometheus.yml:
# Global defaults: how often targets are scraped and how often rules are evaluated.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Targets Prometheus pulls metrics from.
scrape_configs:
  - job_name: 'my-app'
    static_configs:
      - targets: ['localhost:3000']
    metrics_path: '/metrics'
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']

# Alertmanager instances that receive firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Rule files loaded by Prometheus.
rule_files:
  - 'alert_rules.yml'
alert_rules.yml:
groups:
  - name: application_alerts
    interval: 30s
    rules:
      # High error rate: share of 5xx responses above 5% of all requests.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          # $value is a 0-1 ratio; humanizePercentage renders it as a percentage.
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

      # Slow response time: 95th percentile latency above 1 second.
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time"
          description: "95th percentile is {{ $value }}s"

      # Pod down: the my-app scrape target is not reachable.
      - alert: PodDown
        expr: up{job="my-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Pod is down"
          description: "{{ $labels.instance }} has been down for more than 2 minutes"

      # High memory usage: more than 90% of total memory in use.
      - alert: HighMemoryUsage
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          # $value is a 0-1 ratio; humanizePercentage renders it as a percentage.
          description: "Memory usage is {{ $value | humanizePercentage }}"
Winston (Node.js):
import winston from 'winston';
// Structured format shared by all transports unless they override it:
// timestamp, error stack traces, JSON output.
const structuredLogFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json()
);

// Console output uses a colorized, simple format instead of JSON.
const consoleTransport = new winston.transports.Console({
  format: winston.format.combine(
    winston.format.colorize(),
    winston.format.simple()
  )
});

// Error-level entries only.
const errorFileTransport = new winston.transports.File({
  filename: 'logs/error.log',
  level: 'error'
});

// Everything at or above the logger's level.
const combinedFileTransport = new winston.transports.File({
  filename: 'logs/combined.log'
});

// Application logger; level comes from LOG_LEVEL and falls back to 'info'.
const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: structuredLogFormat,
  defaultMeta: {
    service: 'my-app',
    environment: process.env.NODE_ENV
  },
  transports: [consoleTransport, errorFileTransport, combinedFileTransport]
});

// Usage
logger.info('User logged in', { userId: '123', ip: '1.2.3.4' });
// NOTE(review): `err` is not defined in this snippet; presumably it comes
// from an enclosing catch block — confirm before copying.
logger.error('Database connection failed', { error: err.message, stack: err.stack });

// Express middleware: logs every incoming request before passing it on.
app.use((req, res, next) => {
  const { method, path, ip } = req;
  const userAgent = req.get('user-agent');
  logger.info('HTTP Request', { method, path, ip, userAgent });
  next();
});
dashboard.json (示例):
{
"dashboard": {
"title": "Application Metrics",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])",
"legendFormat": "Errors"
}
]
},
{
"title": "Response Time (p95)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))"
}
]
},
{
"title": "CPU Usage",
"type": "gauge",
"targets": [
{
"expr": "rate(process_cpu_seconds_total[5m]) * 100"
}
]
}
]
}
}
高级健康检查:
/** Outcome of probing a single dependency; `latency` is optional. */
type DependencyCheck = { status: string; latency?: number };

/** Payload shape for the health endpoint response. */
interface HealthStatus {
  /** Overall service state. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** Time the report was produced, as a string. */
  timestamp: string;
  /** Process uptime. */
  uptime: number;
  /** Per-dependency probe results. */
  checks: {
    /** Database probe; may additionally carry an error message. */
    database: DependencyCheck & { error?: string };
    redis: DependencyCheck;
    externalApi: DependencyCheck;
  };
}
// Health endpoint: probes the database and Redis and reports an aggregate
// status. Responds 503 only when the overall status is 'unhealthy'; 'healthy'
// and 'degraded' both respond 200.
app.get('/health', async (req, res) => {
  const health: HealthStatus = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    checks: {
      database: { status: 'unknown' },
      redis: { status: 'unknown' },
      // No external API probe is performed here, so this stays 'unknown'.
      externalApi: { status: 'unknown' }
    }
  };

  // Database check: a failure marks the whole service unhealthy.
  try {
    const dbStart = Date.now();
    await db.raw('SELECT 1');
    health.checks.database = {
      status: 'healthy',
      latency: Date.now() - dbStart
    };
  } catch (error: unknown) {
    health.status = 'unhealthy';
    health.checks.database = {
      status: 'unhealthy',
      // Anything can be thrown, so narrow before reading `.message`.
      error: error instanceof Error ? error.message : String(error)
    };
  }

  // Redis check: a failure only degrades the service.
  try {
    const redisStart = Date.now();
    await redis.ping();
    health.checks.redis = {
      status: 'healthy',
      latency: Date.now() - redisStart
    };
  } catch {
    // Never downgrade severity: if the database check already marked the
    // service 'unhealthy', it must stay 'unhealthy' (and return 503).
    if (health.status === 'healthy') {
      health.status = 'degraded';
    }
    health.checks.redis = { status: 'unhealthy' };
  }

  const statusCode = health.status === 'unhealthy' ? 503 : 200;
  res.status(statusCode).json(health);
});
Golden Signals:
1. Latency (Response Time)
- P50, P95, P99 percentiles
- Per API endpoint
2. Traffic (Request Volume)
- Requests per second
- Per endpoint, per status code
3. Errors (Error Rate)
- 5xx error rate
- 4xx error rate
- Per error type
4. Saturation (Resource Utilization)
- CPU usage
- Memory usage
- Disk I/O
- Network bandwidth
#monitoring #observability #Prometheus #Grafana #logging #metrics #infrastructure
每周安装数
1
代码仓库
GitHub 星标数
3
首次出现
1 天前
安全审计
安装于
mcpjam 1
claude-code 1
junie 1
windsurf 1
zencoder 1
crush 1
Application Instrumentation (Node.js):
import express from 'express';
import promClient from 'prom-client';
const app = express();

// Default metrics (CPU, Memory, etc.)
promClient.collectDefaultMetrics();

// Custom metrics: request latency histogram and request counter, both
// labelled by HTTP method, matched route and response status code.
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code']
});

const httpRequestTotal = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// Fixed `route` label for requests that matched no Express route. Using the
// raw URL path here would create a new time series for every distinct URL
// (404 probes, IDs embedded in paths), i.e. unbounded label cardinality.
const UNMATCHED_ROUTE = 'unmatched';

// Middleware to track requests: records one observation per finished response.
app.use((req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    // Date.now() is in milliseconds; the metric is declared in seconds.
    const duration = (Date.now() - start) / 1000;
    const labels = {
      method: req.method,
      // Only the route pattern (e.g. '/users/:id') is used as a label value;
      // `||` also maps an empty pattern to the placeholder.
      route: req.route?.path || UNMATCHED_ROUTE,
      status_code: res.statusCode
    };

    httpRequestDuration.observe(labels, duration);
    httpRequestTotal.inc(labels);
  });

  next();
});

// Metrics endpoint scraped by Prometheus.
app.get('/metrics', async (req, res) => {
  try {
    // Collect first so a failure does not leave a half-configured response.
    const body = await promClient.register.metrics();
    res.set('Content-Type', promClient.register.contentType);
    res.end(body);
  } catch (error: unknown) {
    // Express versions before 5 do not forward a rejected promise from an
    // async handler to the error middleware, so answer explicitly.
    res.status(500).end(error instanceof Error ? error.message : String(error));
  }
});

app.listen(3000);
prometheus.yml:
# Global defaults: how often targets are scraped and how often rules are evaluated.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Targets Prometheus pulls metrics from.
scrape_configs:
  - job_name: 'my-app'
    static_configs:
      - targets: ['localhost:3000']
    metrics_path: '/metrics'
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']

# Alertmanager instances that receive firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Rule files loaded by Prometheus.
rule_files:
  - 'alert_rules.yml'
alert_rules.yml:
groups:
  - name: application_alerts
    interval: 30s
    rules:
      # High error rate: share of 5xx responses above 5% of all requests.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          # $value is a 0-1 ratio; humanizePercentage renders it as a percentage.
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

      # Slow response time: 95th percentile latency above 1 second.
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time"
          description: "95th percentile is {{ $value }}s"

      # Pod down: the my-app scrape target is not reachable.
      - alert: PodDown
        expr: up{job="my-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Pod is down"
          description: "{{ $labels.instance }} has been down for more than 2 minutes"

      # High memory usage: more than 90% of total memory in use.
      - alert: HighMemoryUsage
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          # $value is a 0-1 ratio; humanizePercentage renders it as a percentage.
          description: "Memory usage is {{ $value | humanizePercentage }}"
Winston (Node.js):
import winston from 'winston';
// Structured format shared by all transports unless they override it:
// timestamp, error stack traces, JSON output.
const structuredLogFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json()
);

// Console output uses a colorized, simple format instead of JSON.
const consoleTransport = new winston.transports.Console({
  format: winston.format.combine(
    winston.format.colorize(),
    winston.format.simple()
  )
});

// Error-level entries only.
const errorFileTransport = new winston.transports.File({
  filename: 'logs/error.log',
  level: 'error'
});

// Everything at or above the logger's level.
const combinedFileTransport = new winston.transports.File({
  filename: 'logs/combined.log'
});

// Application logger; level comes from LOG_LEVEL and falls back to 'info'.
const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: structuredLogFormat,
  defaultMeta: {
    service: 'my-app',
    environment: process.env.NODE_ENV
  },
  transports: [consoleTransport, errorFileTransport, combinedFileTransport]
});

// Usage
logger.info('User logged in', { userId: '123', ip: '1.2.3.4' });
// NOTE(review): `err` is not defined in this snippet; presumably it comes
// from an enclosing catch block — confirm before copying.
logger.error('Database connection failed', { error: err.message, stack: err.stack });

// Express middleware: logs every incoming request before passing it on.
app.use((req, res, next) => {
  const { method, path, ip } = req;
  const userAgent = req.get('user-agent');
  logger.info('HTTP Request', { method, path, ip, userAgent });
  next();
});
dashboard.json (example):
{
"dashboard": {
"title": "Application Metrics",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])",
"legendFormat": "Errors"
}
]
},
{
"title": "Response Time (p95)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))"
}
]
},
{
"title": "CPU Usage",
"type": "gauge",
"targets": [
{
"expr": "rate(process_cpu_seconds_total[5m]) * 100"
}
]
}
]
}
}
Advanced Health Check:
/** Outcome of probing a single dependency; `latency` is optional. */
type DependencyCheck = { status: string; latency?: number };

/** Payload shape for the health endpoint response. */
interface HealthStatus {
  /** Overall service state. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** Time the report was produced, as a string. */
  timestamp: string;
  /** Process uptime. */
  uptime: number;
  /** Per-dependency probe results. */
  checks: {
    /** Database probe; may additionally carry an error message. */
    database: DependencyCheck & { error?: string };
    redis: DependencyCheck;
    externalApi: DependencyCheck;
  };
}
// Health endpoint: probes the database and Redis and reports an aggregate
// status. Responds 503 only when the overall status is 'unhealthy'; 'healthy'
// and 'degraded' both respond 200.
app.get('/health', async (req, res) => {
  const health: HealthStatus = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    checks: {
      database: { status: 'unknown' },
      redis: { status: 'unknown' },
      // No external API probe is performed here, so this stays 'unknown'.
      externalApi: { status: 'unknown' }
    }
  };

  // Database check: a failure marks the whole service unhealthy.
  try {
    const dbStart = Date.now();
    await db.raw('SELECT 1');
    health.checks.database = {
      status: 'healthy',
      latency: Date.now() - dbStart
    };
  } catch (error: unknown) {
    health.status = 'unhealthy';
    health.checks.database = {
      status: 'unhealthy',
      // Anything can be thrown, so narrow before reading `.message`.
      error: error instanceof Error ? error.message : String(error)
    };
  }

  // Redis check: a failure only degrades the service.
  try {
    const redisStart = Date.now();
    await redis.ping();
    health.checks.redis = {
      status: 'healthy',
      latency: Date.now() - redisStart
    };
  } catch {
    // Never downgrade severity: if the database check already marked the
    // service 'unhealthy', it must stay 'unhealthy' (and return 503).
    if (health.status === 'healthy') {
      health.status = 'degraded';
    }
    health.checks.redis = { status: 'unhealthy' };
  }

  const statusCode = health.status === 'unhealthy' ? 503 : 200;
  res.status(statusCode).json(health);
});
Golden Signals:
1. Latency (Response Time)
- P50, P95, P99 percentiles
- Per API endpoint
2. Traffic (Request Volume)
- Requests per second
- Per endpoint, per status code
3. Errors (Error Rate)
- 5xx error rate
- 4xx error rate
- Per error type
4. Saturation (Resource Utilization)
- CPU usage
- Memory usage
- Disk I/O
- Network bandwidth
#monitoring #observability #Prometheus #Grafana #logging #metrics #infrastructure
Weekly Installs
1
Repository
GitHub Stars
3
First Seen
1 day ago
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
mcpjam 1
claude-code 1
junie 1
windsurf 1
zencoder 1
crush 1
Azure 升级评估与自动化工具 - 轻松迁移 Functions 计划、托管层级和 SKU
94,100 周安装
Sigma Tutor:基于布鲁姆2-Sigma方法的AI个性化学习导师,诊断提问精通进阶
1,100 周安装
Refero Design:研究优先设计方法,学习最佳实践,打造独特用户体验
1,000 周安装
Flutter MVVM架构实现指南:可扩展应用开发与provider依赖注入
1,100 周安装
CTF杂项挑战快速参考指南 | 沙箱逃逸、编码解码、信号处理与提权技术
1,200 周安装
安全最佳实践指南:识别语言框架漏洞,编写安全代码与生成修复报告
1,100 周安装
Playwright 交互式测试技能:持久会话调试本地Web/Electron应用,无需重启工具链
1,100 周安装