xgboost-lightgbm by tondevrel/scientific-agent-skills
npx skills add https://github.com/tondevrel/scientific-agent-skills --skill xgboost-lightgbm

XGBoost(极限梯度提升)和 LightGBM(轻量级梯度提升机)是用于表格/结构化数据机器学习的事实标准库。它们经常在 Kaggle 竞赛中获胜,并因其速度、准确性和鲁棒性而在工业界被广泛使用。
XGBoost 官方 : https://xgboost.readthedocs.io/
XGBoost GitHub : https://github.com/dmlc/xgboost
LightGBM 官方 : https://lightgbm.readthedocs.io/
LightGBM GitHub : https://github.com/microsoft/LightGBM
搜索模式 : xgboost.XGBClassifier, lightgbm.LGBMRegressor, xgboost.train,
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
lightgbm.cv

这两个库都顺序地构建决策树集成,其中每棵新树都纠正先前树的错误。这创建了能够捕捉复杂非线性模式的高精度模型。
XGBoost : 较慢但通常稍准确一些。更适合较小的数据集(<10 万行)。
LightGBM : 更快,尤其是在大型数据集(数百万行)上。使用基于直方图的学习。
两者都包含 L1/L2 正则化(alpha、lambda 参数)以防止过拟合。当您拥有许多特征时,这一点至关重要。
LightGBM 具有原生的分类特征支持。XGBoost 需要编码(标签编码或独热编码)。
# 安装两者
pip install xgboost lightgbm
# 对于 GPU 支持
pip install xgboost[gpu] lightgbm[gpu]
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
# LightGBM
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
# 1. 准备数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 2. 创建并训练模型
model = XGBClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=6,
random_state=42
)
model.fit(X_train, y_train)
# 3. 预测和评估
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
from lightgbm import LGBMRegressor
# NOTE: mean_squared_error(squared=False) was deprecated in scikit-learn 1.4
# and removed in 1.6; use root_mean_squared_error instead.
from sklearn.metrics import root_mean_squared_error

# 1. Create the model
model = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42
)
# 2. Train
model.fit(X_train, y_train)
# 3. Predict and report root-mean-squared error
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
- 使用 eval_set 参数在训练期间跟踪验证指标。
- 类别不平衡时使用 scale_pos_weight(XGBoost)或 class_weight(LightGBM)。
- 需要底层控制时使用 xgb.train() 或 lgb.train() 而不是 sklearn 包装器。
- 保存模型时使用 .save_model() 和 .load_model() 方法,而不是 pickle(更稳健)。
- max_depth(XGBoost)或 num_leaves(LightGBM)至关重要。太深 = 过拟合。
- 对于大型数据集,将 learning_rate 降低到 0.01-0.05。
- XGBoost 使用 max_depth,LightGBM 使用 num_leaves。它们不同!

# ❌ 错误:在没有验证集或早停的情况下训练
model = XGBClassifier(n_estimators=1000)
model.fit(X_train, y_train) # 很可能过拟合
# ✅ 正确:使用带有验证的早停
model = XGBClassifier(n_estimators=1000, early_stopping_rounds=10)
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
verbose=False
)
# ❌ 错误:对 LightGBM 的分类特征进行独热编码
X_encoded = pd.get_dummies(X) # 创建许多稀疏列
model = LGBMClassifier()
model.fit(X_encoded, y)
# ✅ 正确:使用 categorical_feature 参数
model = LGBMClassifier()
model.fit(
X, y,
categorical_feature=['category_col1', 'category_col2']
)
# ❌ 错误:忽略类别不平衡
model = XGBClassifier()
model.fit(X_train, y_train) # 多数类别主导
# ✅ 正确:使用 scale_pos_weight 处理不平衡
from sklearn.utils.class_weight import compute_sample_weight
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
model = XGBClassifier(scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)
from xgboost import XGBClassifier
import numpy as np
# 二分类
model = XGBClassifier(
n_estimators=100, # 树的数量
max_depth=6, # 最大树深度
learning_rate=0.1, # 步长收缩率 (eta)
subsample=0.8, # 行采样比例
colsample_bytree=0.8, # 列采样比例
random_state=42
)
# 使用早停进行训练
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
early_stopping_rounds=10,
verbose=True
)
# 获取最佳迭代次数
print(f"Best iteration: {model.best_iteration}")
print(f"Best score: {model.best_score}")
import xgboost as xgb
# 1. 创建 DMatrix(XGBoost 的内部数据结构)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
# 2. 设置参数
params = {
'objective': 'binary:logistic', # 或 'reg:squarederror' 用于回归
'max_depth': 6,
'eta': 0.1, # learning_rate
'subsample': 0.8,
'colsample_bytree': 0.8,
'eval_metric': 'auc',
'seed': 42
}
# 3. 通过交叉验证监控进行训练
evals = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(
params,
dtrain,
num_boost_round=1000,
evals=evals,
early_stopping_rounds=10,
verbose_eval=50
)
# 4. 预测
dtest = xgb.DMatrix(X_test)
y_pred_proba = model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)
import xgboost as xgb
# 准备数据
dtrain = xgb.DMatrix(X_train, label=y_train)
# 参数
params = {
'objective': 'binary:logistic',
'max_depth': 6,
'eta': 0.1,
'eval_metric': 'auc'
}
# 运行交叉验证
cv_results = xgb.cv(
params,
dtrain,
num_boost_round=1000,
nfold=5,
stratified=True,
early_stopping_rounds=10,
seed=42,
verbose_eval=50
)
# 最佳迭代次数
print(f"Best iteration: {cv_results.shape[0]}")
print(f"Best score: {cv_results['test-auc-mean'].max():.4f}")
from lightgbm import LGBMClassifier
# 二分类
model = LGBMClassifier(
n_estimators=100,
num_leaves=31, # LightGBM 使用叶子数,而不是深度
learning_rate=0.1,
feature_fraction=0.8, # 与 colsample_bytree 相同
bagging_fraction=0.8, # 与 subsample 相同
bagging_freq=5,
random_state=42
)
# 使用早停进行训练
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
eval_metric='auc',
callbacks=[lgb.early_stopping(stopping_rounds=10)]
)
print(f"Best iteration: {model.best_iteration_}")
print(f"Best score: {model.best_score_}")
import lightgbm as lgb
# 1. 创建 Dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
# 2. 参数
params = {
'objective': 'binary',
'metric': 'auc',
'num_leaves': 31,
'learning_rate': 0.1,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1
}
# 3. 训练
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[train_data, val_data],
valid_names=['train', 'val'],
callbacks=[lgb.early_stopping(stopping_rounds=10)]
)
# 4. 预测
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)
import lightgbm as lgb
import pandas as pd
# 假设 'category' 和 'group' 是分类列
# 不要对它们进行独热编码!
# 方法 1:按名称指定
model = lgb.LGBMClassifier()
model.fit(
X_train, y_train,
categorical_feature=['category', 'group']
)
# 方法 2:按索引指定
model.fit(
X_train, y_train,
categorical_feature=[2, 5] # 分类列的索引
)
# 方法 3:转换为 category 数据类型(自动检测)
X_train['category'] = X_train['category'].astype('category')
X_train['group'] = X_train['group'].astype('category')
model.fit(X_train, y_train) # 自动检测
学习率 (learning_rate 或 eta)
树复杂度
max_depth (3-10)num_leaves (20-100)采样比例
subsample / bagging_fraction: 0.5-1.0colsample_bytree / feature_fraction: 0.5-1.0正则化
reg_alpha (L1): 0-10reg_lambda (L2): 0-10from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
# 定义参数网格
param_grid = {
'max_depth': [3, 5, 7],
'learning_rate': [0.01, 0.05, 0.1],
'n_estimators': [100, 200, 300],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0]
}
# 网格搜索
model = XGBClassifier(random_state=42)
grid_search = GridSearchCV(
model,
param_grid,
cv=5,
scoring='roc_auc',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
# 最佳参数
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# 使用最佳模型
best_model = grid_search.best_estimator_
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
def objective(trial):
    """Optuna objective: mean 5-fold ROC-AUC of a tuned XGBoost classifier."""
    # Sample one hyper-parameter configuration from the search space.
    search_space = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }
    candidate = XGBClassifier(**search_space, random_state=42)
    # Score the candidate by cross-validated ROC-AUC on the training data.
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='roc_auc')
    return fold_scores.mean()
# 运行优化
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"Best value: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
# 训练模型
model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# 获取特征重要性
importance = model.feature_importances_
feature_names = X_train.columns
# 按重要性排序
indices = importance.argsort()[::-1]
# 绘图
plt.figure(figsize=(10, 6))
plt.bar(range(len(importance)), importance[indices])
plt.xticks(range(len(importance)), feature_names[indices], rotation=45, ha='right')
plt.xlabel('特征')
plt.ylabel('重要性')
plt.title('特征重要性')
plt.tight_layout()
plt.show()
# 打印前 10 个
print("Top 10 features:")
for i in range(min(10, len(importance))):
print(f"{feature_names[indices[i]]}: {importance[indices[i]]:.4f}")
import shap
from xgboost import XGBClassifier
# 训练模型
model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# 创建 SHAP 解释器
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# 摘要图
shap.summary_plot(shap_values, X_test, plot_type="bar")
# 单个预测的力图
shap.force_plot(
explainer.expected_value,
shap_values[0],
X_test.iloc[0]
)
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
def kaggle_pipeline(X, y, X_test):
    """Stratified K-fold XGBoost pipeline producing OOF and test predictions.

    Parameters
    ----------
    X, y : training features and binary target (pandas objects; ``.iloc``
        is used for row selection).
    X_test : test features to predict.

    Returns
    -------
    (oof_predictions, test_predictions) : positive-class probabilities for
        each training row (out-of-fold) and for the test rows (averaged
        over the folds).
    """
    # 1. Cross-validation setup
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # 2. Prediction buffers
    oof_predictions = np.zeros(len(X))
    test_predictions = np.zeros(len(X_test))
    # 3. Train one model per fold
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nFold {fold + 1}/{n_folds}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # FIX: since XGBoost 2.0, early_stopping_rounds is a constructor
        # argument; passing it to fit() raises a TypeError.
        model = XGBClassifier(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            early_stopping_rounds=50,
            random_state=42
        )
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        # Out-of-fold predictions for this fold's validation rows
        oof_predictions[val_idx] = model.predict_proba(X_val)[:, 1]
        # Average the test predictions across folds
        test_predictions += model.predict_proba(X_test)[:, 1] / n_folds
    # 4. Overall out-of-fold score
    oof_score = roc_auc_score(y, oof_predictions)
    print(f"\nOOF AUC: {oof_score:.4f}")
    return oof_predictions, test_predictions
# 用法
# oof_preds, test_preds = kaggle_pipeline(X_train, y_train, X_test)
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
def train_imbalanced_classifier(X_train, y_train, X_val, y_val):
    """Train an XGBoost binary classifier on an imbalanced dataset.

    Uses ``scale_pos_weight = n_negative / n_positive`` so the minority
    (positive) class is up-weighted, and early stopping on (X_val, y_val).

    Returns
    -------
    The fitted XGBClassifier.

    Raises
    ------
    ValueError if ``y_train`` contains no positive samples (the weight
    ratio would be undefined).
    """
    # Compute the class-balance ratio used as scale_pos_weight
    n_pos = (y_train == 1).sum()
    n_neg = (y_train == 0).sum()
    if n_pos == 0:
        raise ValueError("y_train contains no positive samples")
    scale_pos_weight = n_neg / n_pos
    print(f"类别分布: {n_neg} 负例, {n_pos} 正例")
    print(f"Scale pos weight: {scale_pos_weight:.2f}")
    # FIX: since XGBoost 2.0, early_stopping_rounds belongs in the
    # constructor, not fit().
    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        scale_pos_weight=scale_pos_weight,
        early_stopping_rounds=10,
        random_state=42
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    return model
# 替代方案: sample_weight
sample_weights = compute_sample_weight('balanced', y_train)
model.fit(X_train, y_train, sample_weight=sample_weights)
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
def multiclass_pipeline(X_train, y_train, X_val, y_val):
    """Multi-class classification pipeline using LightGBM.

    Trains an LGBMClassifier with early stopping on (X_val, y_val),
    prints a classification report for the validation set, and returns
    the fitted model plus validation predictions and probabilities.
    """
    # FIX: this snippet only imported LGBMClassifier, so the bare `lgb`
    # alias previously used for the callback was undefined (NameError).
    from lightgbm import early_stopping
    model = LGBMClassifier(
        n_estimators=200,
        num_leaves=31,
        learning_rate=0.05,
        random_state=42
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[early_stopping(stopping_rounds=20)]
    )
    # Class labels and per-class probabilities for the validation set
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    # Evaluate
    print(classification_report(y_val, y_pred))
    return model, y_pred, y_pred_proba
# 用法
# model, preds, proba = multiclass_pipeline(X_train, y_train, X_val, y_val)
import pandas as pd
from xgboost import XGBRegressor
def time_series_features(df, target_col, date_col):
    """Create calendar, lag and rolling-window features for a time series.

    FIX: rows are sorted chronologically before the lags and rolling
    statistics are computed, so the features are correct even when the
    input rows are unordered (the original relied on the input order).

    Parameters
    ----------
    df : DataFrame containing ``target_col`` and ``date_col``.
    target_col : name of the numeric target column.
    date_col : name of the date column (parseable by ``pd.to_datetime``).

    Returns
    -------
    DataFrame with added feature columns; rows that lack a complete lag
    or rolling-window history are dropped.
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    # Lags/rolling statistics are only meaningful in chronological order.
    df = df.sort_values(date_col)
    # Calendar features
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month
    df['day'] = df[date_col].dt.day
    df['dayofweek'] = df[date_col].dt.dayofweek
    df['quarter'] = df[date_col].dt.quarter
    # Lag features
    for lag in [1, 7, 30]:
        df[f'lag_{lag}'] = df[target_col].shift(lag)
    # Rolling statistics
    for window in [7, 30]:
        df[f'rolling_mean_{window}'] = df[target_col].rolling(window).mean()
        df[f'rolling_std_{window}'] = df[target_col].rolling(window).std()
    # Drop the warm-up rows made incomplete by the longest lag/window.
    return df.dropna()
def train_time_series_model(df, target_col, feature_cols):
    """Train an XGBoost regressor on a chronologically ordered DataFrame.

    The data is split positionally (first 80% train, last 20% test)
    rather than shuffled, so temporal order is preserved and no future
    information leaks into training.

    Returns
    -------
    (model, y_pred) : the fitted XGBRegressor and its predictions for
        the held-out (most recent) 20% of rows.
    """
    # Split by time (never shuffle a time series!)
    split_idx = int(0.8 * len(df))
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]
    X_train = train[feature_cols]
    y_train = train[target_col]
    X_test = test[feature_cols]
    y_test = test[target_col]
    # FIX: since XGBoost 2.0, early_stopping_rounds is a constructor
    # argument; passing it to fit() raises a TypeError.
    model = XGBRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        early_stopping_rounds=20,
        random_state=42
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    # Predict the held-out tail of the series
    y_pred = model.predict(X_test)
    return model, y_pred
# 用法
# df = time_series_features(df, 'sales', 'date')
# model, predictions = train_time_series_model(df, 'sales', feature_cols)
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
def create_stacked_model(X_train, y_train, X_test):
    """Stack XGBoost and LightGBM under a logistic-regression meta-learner.

    Out-of-fold probabilities from each base learner become the
    meta-features; the meta-model is trained on them and applied to the
    base learners' test-set probabilities.
    """
    # Level-0 learners (order matters: it fixes the meta-feature columns)
    base_learners = [
        XGBClassifier(n_estimators=100, random_state=42),
        LGBMClassifier(n_estimators=100, random_state=42),
    ]
    train_columns = []
    test_columns = []
    for learner in base_learners:
        # Out-of-fold positive-class probabilities for the training rows
        oof = cross_val_predict(
            learner, X_train, y_train, cv=5, method='predict_proba'
        )[:, 1]
        train_columns.append(oof)
        # Refit on the full training set, then score the test rows
        learner.fit(X_train, y_train)
        test_columns.append(learner.predict_proba(X_test)[:, 1])
    # Assemble the meta-feature matrices (one column per base learner)
    meta_X_train = np.column_stack(train_columns)
    meta_X_test = np.column_stack(test_columns)
    # Level-1 meta-model
    meta_model = LogisticRegression()
    meta_model.fit(meta_X_train, y_train)
    # Final stacked predictions
    return meta_model.predict_proba(meta_X_test)[:, 1]
# 用法
# stacked_predictions = create_stacked_model(X_train, y_train, X_test)
# 使用 GPU 的 XGBoost
from xgboost import XGBClassifier
model = XGBClassifier(
tree_method='gpu_hist', # 使用 GPU
gpu_id=0,
n_estimators=100
)
model.fit(X_train, y_train)
# 使用 GPU 的 LightGBM
from lightgbm import LGBMClassifier
model = LGBMClassifier(
device='gpu',
gpu_platform_id=0,
gpu_device_id=0,
n_estimators=100
)
model.fit(X_train, y_train)
import lightgbm as lgb
# 使用 float32 而不是 float64
X_train = X_train.astype('float32')
# 对于非常大的数据集,使用 LightGBM 的 Dataset
train_data = lgb.Dataset(
X_train,
label=y_train,
free_raw_data=False # 如果您要重用数据,请将其保留在内存中
)
# 使用基于直方图的方法(LightGBM 已经为此优化)
params = {
'max_bin': 255, # 减少以节省内存,增加以提高准确性
'num_leaves': 31,
'learning_rate': 0.05
}
from xgboost import XGBClassifier
# 使用所有 CPU 核心
model = XGBClassifier(
n_estimators=100,
n_jobs=-1, # 使用所有核心
random_state=42
)
model.fit(X_train, y_train)
# 控制线程数
model = XGBClassifier(
n_estimators=100,
n_jobs=4, # 使用 4 个核心
random_state=42
)
当您基于验证性能调整超参数时,您间接地在验证集上过拟合了。
# ❌ 问题:在同一验证集上反复调优
# 这会导致过于乐观的性能估计
# ✅ 解决方案:使用嵌套交叉验证
from sklearn.model_selection import cross_val_score, GridSearchCV
# 外层循环:性能估计
# 内层循环:超参数调优
param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1]}
model = XGBClassifier()
grid_search = GridSearchCV(model, param_grid, cv=3) # 内层交叉验证
outer_scores = cross_val_score(grid_search, X, y, cv=5) # 外层交叉验证
print(f"无偏性能: {outer_scores.mean():.4f}")
XGBoost 不能原生处理分类特征(但 LightGBM 可以)。
# 对于 XGBoost:使用标签编码,而不是独热编码
from sklearn.preprocessing import LabelEncoder
# ❌ 对 XGBoost 不好:独热编码会创建太多稀疏特征
X_encoded = pd.get_dummies(X, columns=['category'])
# ✅ 对 XGBoost 好:标签编码
le = LabelEncoder()
X['category_encoded'] = le.fit_transform(X['category'])
# ✅ 最好:使用具有原生分类支持的 LightGBM
model = lgb.LGBMClassifier()
model.fit(X, y, categorical_feature=['category'])
较低的学习率需要更多的树,但能提供更好的结果。
# ❌ 问题:学习率太低而树太少
model = XGBClassifier(n_estimators=100, learning_rate=0.01)
# 模型不会收敛
# ✅ 解决方案:使用早停来找到最佳数量
model = XGBClassifier(n_estimators=5000, learning_rate=0.01)
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
early_stopping_rounds=50
)
# 当验证分数停止改善时会停止
XGBoost 使用 max_depth,LightGBM 使用 num_leaves。它们相关但不同!
# XGBoost: max_depth 控制树深度
model_xgb = XGBClassifier(max_depth=6) # 树可以有 2^6 = 64 个叶子
# LightGBM: num_leaves 直接控制叶子数量
model_lgb = LGBMClassifier(num_leaves=31) # 正好 31 个叶子
# ⚠️ 关系: num_leaves ≈ 2^max_depth - 1
# 但 LightGBM 以叶子方式生长树(更快,更准确)
# XGBoost 以层级方式生长树(更保守)
特征重要性可以揭示数据泄漏。
# ✅ 始终检查特征重要性
model.fit(X_train, y_train)
importance = pd.DataFrame({
'feature': X_train.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(importance.head(10))
# 🚨 数据泄漏的危险信号:
# 1. 一个特征的重要性 >>90%(可疑)
# 2. ID 列具有高重要性(泄漏!)
# 3. 目标变量衍生的特征(泄漏!)
# 4. 时间序列中的未来信息(泄漏!)
XGBoost 和 LightGBM 已经彻底改变了表格数据的机器学习。它们结合了速度、准确性和可解释性,使其成为结构化数据问题的首选。掌握这些库,您将拥有一个强大的工具来处理绝大多数现实世界的 ML 任务。
每周安装量
105
仓库
GitHub 星标数
6
首次出现
2026年2月8日
安全审计
安装在
gemini-cli96
opencode96
codex96
github-copilot93
cursor90
kimi-cli88
XGBoost (eXtreme Gradient Boosting) and LightGBM (Light Gradient Boosting Machine) are the de facto standard libraries for machine learning on tabular/structured data. They consistently win Kaggle competitions and are widely used in industry for their speed, accuracy, and robustness.
XGBoost Official : https://xgboost.readthedocs.io/
XGBoost GitHub : https://github.com/dmlc/xgboost
LightGBM Official : https://lightgbm.readthedocs.io/
LightGBM GitHub : https://github.com/microsoft/LightGBM
Search patterns : xgboost.XGBClassifier, lightgbm.LGBMRegressor, xgboost.train, lightgbm.cv
Both libraries build an ensemble of decision trees sequentially, where each new tree corrects errors from previous trees. This creates highly accurate models that capture complex non-linear patterns.
XGBoost : Slower but often slightly more accurate. Better for smaller datasets (<100k rows).
LightGBM : Faster, especially on large datasets (millions of rows). Uses histogram-based learning.
Both include L1/L2 regularization (alpha, lambda parameters) to prevent overfitting. This is crucial when you have many features.
LightGBM has native categorical feature support. XGBoost requires encoding (label encoding or one-hot).
# Install both
pip install xgboost lightgbm
# For GPU support
pip install xgboost[gpu] lightgbm[gpu]
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
# LightGBM
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
# 1. Prepare data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 2. Create and train model
model = XGBClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=6,
random_state=42
)
model.fit(X_train, y_train)
# 3. Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
from lightgbm import LGBMRegressor
# NOTE: mean_squared_error(squared=False) was deprecated in scikit-learn 1.4
# and removed in 1.6; use root_mean_squared_error instead.
from sklearn.metrics import root_mean_squared_error

# 1. Create model
model = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42
)
# 2. Train
model.fit(X_train, y_train)
# 3. Predict and report root-mean-squared error
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
- Use the eval_set parameter to track validation metrics during training.
- Handle class imbalance with scale_pos_weight (XGBoost) or class_weight (LightGBM).
- Use xgb.train() or lgb.train() instead of the sklearn wrappers for low-level control.
- Use the .save_model() and .load_model() methods, not pickle (more robust).
- max_depth (XGBoost) or num_leaves (LightGBM) are critical. Too deep = overfit.
- Lower learning_rate to 0.01-0.05 for datasets >1M rows.
- XGBoost uses max_depth, LightGBM uses num_leaves. They're different!

# ❌ BAD: Training without validation set or early stopping
model = XGBClassifier(n_estimators=1000)
model.fit(X_train, y_train) # Will likely overfit
# ✅ GOOD: Use early stopping with validation
model = XGBClassifier(n_estimators=1000, early_stopping_rounds=10)
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
verbose=False
)
# ❌ BAD: One-hot encoding categorical features for LightGBM
X_encoded = pd.get_dummies(X) # Creates many sparse columns
model = LGBMClassifier()
model.fit(X_encoded, y)
# ✅ GOOD: Use categorical_feature parameter
model = LGBMClassifier()
model.fit(
X, y,
categorical_feature=['category_col1', 'category_col2']
)
# ❌ BAD: Ignoring class imbalance
model = XGBClassifier()
model.fit(X_train, y_train) # Majority class dominates
# ✅ GOOD: Handle imbalance with scale_pos_weight
from sklearn.utils.class_weight import compute_sample_weight
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
model = XGBClassifier(scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)
from xgboost import XGBClassifier
import numpy as np
# Binary classification
model = XGBClassifier(
n_estimators=100, # Number of trees
max_depth=6, # Maximum tree depth
learning_rate=0.1, # Step size shrinkage (eta)
subsample=0.8, # Row sampling ratio
colsample_bytree=0.8, # Column sampling ratio
random_state=42
)
# Train with early stopping
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
early_stopping_rounds=10,
verbose=True
)
# Get best iteration
print(f"Best iteration: {model.best_iteration}")
print(f"Best score: {model.best_score}")
import xgboost as xgb
# 1. Create DMatrix (XGBoost's internal data structure)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
# 2. Set parameters
params = {
'objective': 'binary:logistic', # or 'reg:squarederror' for regression
'max_depth': 6,
'eta': 0.1, # learning_rate
'subsample': 0.8,
'colsample_bytree': 0.8,
'eval_metric': 'auc',
'seed': 42
}
# 3. Train with cross-validation monitoring
evals = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(
params,
dtrain,
num_boost_round=1000,
evals=evals,
early_stopping_rounds=10,
verbose_eval=50
)
# 4. Predict
dtest = xgb.DMatrix(X_test)
y_pred_proba = model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)
import xgboost as xgb
# Prepare data
dtrain = xgb.DMatrix(X_train, label=y_train)
# Parameters
params = {
'objective': 'binary:logistic',
'max_depth': 6,
'eta': 0.1,
'eval_metric': 'auc'
}
# Run CV
cv_results = xgb.cv(
params,
dtrain,
num_boost_round=1000,
nfold=5,
stratified=True,
early_stopping_rounds=10,
seed=42,
verbose_eval=50
)
# Best iteration
print(f"Best iteration: {cv_results.shape[0]}")
print(f"Best score: {cv_results['test-auc-mean'].max():.4f}")
from lightgbm import LGBMClassifier
# Binary classification
model = LGBMClassifier(
n_estimators=100,
num_leaves=31, # LightGBM uses leaves, not depth
learning_rate=0.1,
feature_fraction=0.8, # Same as colsample_bytree
bagging_fraction=0.8, # Same as subsample
bagging_freq=5,
random_state=42
)
# Train with early stopping
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
eval_metric='auc',
callbacks=[lgb.early_stopping(stopping_rounds=10)]
)
print(f"Best iteration: {model.best_iteration_}")
print(f"Best score: {model.best_score_}")
import lightgbm as lgb
# 1. Create Dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
# 2. Parameters
params = {
'objective': 'binary',
'metric': 'auc',
'num_leaves': 31,
'learning_rate': 0.1,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1
}
# 3. Train
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[train_data, val_data],
valid_names=['train', 'val'],
callbacks=[lgb.early_stopping(stopping_rounds=10)]
)
# 4. Predict
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)
import lightgbm as lgb
import pandas as pd
# Assume 'category' and 'group' are categorical columns
# DO NOT one-hot encode them!
# Method 1: Specify by name
model = lgb.LGBMClassifier()
model.fit(
X_train, y_train,
categorical_feature=['category', 'group']
)
# Method 2: Specify by index
model.fit(
X_train, y_train,
categorical_feature=[2, 5] # Indices of categorical columns
)
# Method 3: Convert to category dtype (automatic detection)
X_train['category'] = X_train['category'].astype('category')
X_train['group'] = X_train['group'].astype('category')
model.fit(X_train, y_train) # Automatically detects
Learning Rate (learning_rate or eta)
Tree Complexity
max_depth (3-10)num_leaves (20-100)Sampling Ratios
subsample / bagging_fraction: 0.5-1.0colsample_bytree / feature_fraction: 0.5-1.0Regularization
reg_alpha (L1): 0-10reg_lambda (L2): 0-10from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
# Define parameter grid
param_grid = {
'max_depth': [3, 5, 7],
'learning_rate': [0.01, 0.05, 0.1],
'n_estimators': [100, 200, 300],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0]
}
# Grid search
model = XGBClassifier(random_state=42)
grid_search = GridSearchCV(
model,
param_grid,
cv=5,
scoring='roc_auc',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
# Best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# Use best model
best_model = grid_search.best_estimator_
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
def objective(trial):
    """Optuna objective: mean 5-fold ROC-AUC of a tuned XGBoost classifier."""
    # Sample one hyper-parameter configuration from the search space.
    search_space = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }
    candidate = XGBClassifier(**search_space, random_state=42)
    # Score the candidate by cross-validated ROC-AUC on the training data.
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='roc_auc')
    return fold_scores.mean()
# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"Best value: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
# Train model
model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Get feature importance
importance = model.feature_importances_
feature_names = X_train.columns
# Sort by importance
indices = importance.argsort()[::-1]
# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(importance)), importance[indices])
plt.xticks(range(len(importance)), feature_names[indices], rotation=45, ha='right')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()
# Print top 10
print("Top 10 features:")
for i in range(min(10, len(importance))):
print(f"{feature_names[indices[i]]}: {importance[indices[i]]:.4f}")
import shap
from xgboost import XGBClassifier
# Train model
model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Create SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# Summary plot
shap.summary_plot(shap_values, X_test, plot_type="bar")
# Force plot for single prediction
shap.force_plot(
explainer.expected_value,
shap_values[0],
X_test.iloc[0]
)
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
def kaggle_pipeline(X, y, X_test):
    """Stratified K-fold XGBoost pipeline producing OOF and test predictions.

    Parameters
    ----------
    X, y : training features and binary target (pandas objects; ``.iloc``
        is used for row selection).
    X_test : test features to predict.

    Returns
    -------
    (oof_predictions, test_predictions) : positive-class probabilities for
        each training row (out-of-fold) and for the test rows (averaged
        over the folds).
    """
    # 1. Cross-validation setup
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # 2. Prediction buffers
    oof_predictions = np.zeros(len(X))
    test_predictions = np.zeros(len(X_test))
    # 3. Train one model per fold
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nFold {fold + 1}/{n_folds}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # FIX: since XGBoost 2.0, early_stopping_rounds is a constructor
        # argument; passing it to fit() raises a TypeError.
        model = XGBClassifier(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            early_stopping_rounds=50,
            random_state=42
        )
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        # Out-of-fold predictions for this fold's validation rows
        oof_predictions[val_idx] = model.predict_proba(X_val)[:, 1]
        # Average the test predictions across folds
        test_predictions += model.predict_proba(X_test)[:, 1] / n_folds
    # 4. Overall out-of-fold score
    oof_score = roc_auc_score(y, oof_predictions)
    print(f"\nOOF AUC: {oof_score:.4f}")
    return oof_predictions, test_predictions
# Usage
# oof_preds, test_preds = kaggle_pipeline(X_train, y_train, X_test)
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
def train_imbalanced_classifier(X_train, y_train, X_val, y_val):
    """Train an XGBoost binary classifier on an imbalanced dataset.

    Uses ``scale_pos_weight = n_negative / n_positive`` so the minority
    (positive) class is up-weighted, and early stopping on (X_val, y_val).

    Returns
    -------
    The fitted XGBClassifier.

    Raises
    ------
    ValueError if ``y_train`` contains no positive samples (the weight
    ratio would be undefined).
    """
    # Compute the class-balance ratio used as scale_pos_weight
    n_pos = (y_train == 1).sum()
    n_neg = (y_train == 0).sum()
    if n_pos == 0:
        raise ValueError("y_train contains no positive samples")
    scale_pos_weight = n_neg / n_pos
    print(f"Class distribution: {n_neg} negative, {n_pos} positive")
    print(f"Scale pos weight: {scale_pos_weight:.2f}")
    # FIX: since XGBoost 2.0, early_stopping_rounds belongs in the
    # constructor, not fit().
    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        scale_pos_weight=scale_pos_weight,
        early_stopping_rounds=10,
        random_state=42
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    return model
# Alternative: sample_weight
sample_weights = compute_sample_weight('balanced', y_train)
model.fit(X_train, y_train, sample_weight=sample_weights)
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
def multiclass_pipeline(X_train, y_train, X_val, y_val):
    """Multi-class classification pipeline using LightGBM.

    Trains an LGBMClassifier with early stopping on (X_val, y_val),
    prints a classification report for the validation set, and returns
    the fitted model plus validation predictions and probabilities.
    """
    # FIX: this snippet only imported LGBMClassifier, so the bare `lgb`
    # alias previously used for the callback was undefined (NameError).
    from lightgbm import early_stopping
    model = LGBMClassifier(
        n_estimators=200,
        num_leaves=31,
        learning_rate=0.05,
        random_state=42
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[early_stopping(stopping_rounds=20)]
    )
    # Class labels and per-class probabilities for the validation set
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    # Evaluate
    print(classification_report(y_val, y_pred))
    return model, y_pred, y_pred_proba
# Usage
# model, preds, proba = multiclass_pipeline(X_train, y_train, X_val, y_val)
import pandas as pd
from xgboost import XGBRegressor
def time_series_features(df, target_col, date_col):
    """Create calendar, lag and rolling-window features for a time series.

    FIX: rows are sorted chronologically before the lags and rolling
    statistics are computed, so the features are correct even when the
    input rows are unordered (the original relied on the input order).

    Parameters
    ----------
    df : DataFrame containing ``target_col`` and ``date_col``.
    target_col : name of the numeric target column.
    date_col : name of the date column (parseable by ``pd.to_datetime``).

    Returns
    -------
    DataFrame with added feature columns; rows that lack a complete lag
    or rolling-window history are dropped.
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    # Lags/rolling statistics are only meaningful in chronological order.
    df = df.sort_values(date_col)
    # Calendar features
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month
    df['day'] = df[date_col].dt.day
    df['dayofweek'] = df[date_col].dt.dayofweek
    df['quarter'] = df[date_col].dt.quarter
    # Lag features
    for lag in [1, 7, 30]:
        df[f'lag_{lag}'] = df[target_col].shift(lag)
    # Rolling statistics
    for window in [7, 30]:
        df[f'rolling_mean_{window}'] = df[target_col].rolling(window).mean()
        df[f'rolling_std_{window}'] = df[target_col].rolling(window).std()
    # Drop the warm-up rows made incomplete by the longest lag/window.
    return df.dropna()
def train_time_series_model(df, target_col, feature_cols):
    """Train an XGBoost regressor on a chronologically ordered DataFrame.

    The data is split positionally (first 80% train, last 20% test)
    rather than shuffled, so temporal order is preserved and no future
    information leaks into training.

    Returns
    -------
    (model, y_pred) : the fitted XGBRegressor and its predictions for
        the held-out (most recent) 20% of rows.
    """
    # Split by time (never shuffle a time series!)
    split_idx = int(0.8 * len(df))
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]
    X_train = train[feature_cols]
    y_train = train[target_col]
    X_test = test[feature_cols]
    y_test = test[target_col]
    # FIX: since XGBoost 2.0, early_stopping_rounds is a constructor
    # argument; passing it to fit() raises a TypeError.
    model = XGBRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        early_stopping_rounds=20,
        random_state=42
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    # Predict the held-out tail of the series
    y_pred = model.predict(X_test)
    return model, y_pred
# Usage
# df = time_series_features(df, 'sales', 'date')
# model, predictions = train_time_series_model(df, 'sales', feature_cols)
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
def create_stacked_model(X_train, y_train, X_test):
    """Two-level stacking: XGBoost + LightGBM bases, logistic meta-learner.

    Out-of-fold predicted probabilities from each base model become the
    meta-learner's training features, so the meta-model never trains on
    predictions a base made for rows it had already seen.

    Returns the stacked positive-class probabilities for ``X_test``.
    """
    base_models = [
        XGBClassifier(n_estimators=100, random_state=42),
        LGBMClassifier(n_estimators=100, random_state=42),
    ]

    # Out-of-fold positive-class probabilities on the training set.
    oof_columns = []
    for clf in base_models:
        oof = cross_val_predict(
            clf, X_train, y_train, cv=5, method='predict_proba'
        )
        oof_columns.append(oof[:, 1])

    # Refit each base on the full training set, then score the test set.
    test_columns = []
    for clf in base_models:
        clf.fit(X_train, y_train)
        test_columns.append(clf.predict_proba(X_test)[:, 1])

    # Stack per-model probability columns into meta-feature matrices.
    meta_train = np.column_stack(oof_columns)
    meta_test = np.column_stack(test_columns)

    # Simple linear blender on top of the base probabilities.
    blender = LogisticRegression()
    blender.fit(meta_train, y_train)
    return blender.predict_proba(meta_test)[:, 1]
# Usage
# stacked_predictions = create_stacked_model(X_train, y_train, X_test)
# XGBoost with GPU
from xgboost import XGBClassifier
# NOTE(review): tree_method='gpu_hist' was deprecated in XGBoost 2.0 in
# favor of tree_method='hist' together with device='cuda' — confirm the
# installed XGBoost version before relying on this form.
model = XGBClassifier(
tree_method='gpu_hist', # Use GPU
gpu_id=0,
n_estimators=100
)
model.fit(X_train, y_train)
# LightGBM with GPU
from lightgbm import LGBMClassifier
# device='gpu' requires a GPU-enabled LightGBM build; the platform/device
# ids select which OpenCL platform and device to run on.
model = LGBMClassifier(
device='gpu',
gpu_platform_id=0,
gpu_device_id=0,
n_estimators=100
)
model.fit(X_train, y_train)
import lightgbm as lgb
# Use float32 instead of float64
# Halves the feature matrix's memory footprint; LightGBM bins feature
# values into histograms anyway, so float32 precision is normally enough.
X_train = X_train.astype('float32')
# For very large datasets, use LightGBM's Dataset
# lgb.Dataset pre-bins the data into LightGBM's internal format once,
# instead of re-converting on every training run.
train_data = lgb.Dataset(
X_train,
label=y_train,
free_raw_data=False # Keep data in memory if you'll reuse it
)
# Use histogram-based approach (LightGBM is already optimized for this)
params = {
'max_bin': 255, # Reduce for less memory, increase for more accuracy
'num_leaves': 31,
'learning_rate': 0.05
}
from xgboost import XGBClassifier
# Use all CPU cores
model = XGBClassifier(
n_estimators=100,
n_jobs=-1, # Use all cores
random_state=42
)
model.fit(X_train, y_train)
# Control number of threads
# A fixed thread count is useful on shared machines, or when training
# several models in parallel — oversubscribing cores slows everything down.
model = XGBClassifier(
n_estimators=100,
n_jobs=4, # Use 4 cores
random_state=42
)
When you tune hyperparameters based on validation performance, you're indirectly overfitting to the validation set.
# ❌ Problem: Tuning on same validation set repeatedly
# This leads to overly optimistic performance estimates
# ✅ Solution: Use nested cross-validation
from sklearn.model_selection import cross_val_score, GridSearchCV
# Outer loop: performance estimation
# Inner loop: hyperparameter tuning
param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1]}
model = XGBClassifier()
grid_search = GridSearchCV(model, param_grid, cv=3) # Inner CV
# cross_val_score clones grid_search for each outer fold, so the inner
# hyperparameter search never sees that fold's held-out data.
outer_scores = cross_val_score(grid_search, X, y, cv=5) # Outer CV
print(f"Unbiased performance: {outer_scores.mean():.4f}")
Historically, XGBoost required categorical features to be encoded, while LightGBM handles them natively; XGBoost 1.5+ adds experimental native support via enable_categorical=True, but encoding remains the portable approach.
# For XGBoost: use label encoding, NOT one-hot
from sklearn.preprocessing import LabelEncoder

# ❌ BAD for XGBoost: One-hot encoding creates too many sparse features
X_encoded = pd.get_dummies(X, columns=['category'])

# ✅ GOOD for XGBoost: Label encoding
le = LabelEncoder()
X['category_encoded'] = le.fit_transform(X['category'])

# ✅ BEST: Use LightGBM with native categorical support.
# LightGBM accepts integer codes or the pandas 'category' dtype for
# categorical columns — raw string (object-dtype) columns raise an error,
# so cast the column first.
X['category'] = X['category'].astype('category')
model = lgb.LGBMClassifier()
model.fit(X, y, categorical_feature=['category'])
Lower learning rate needs more trees but gives better results.
# ❌ Problem: Too few trees with low learning rate
model = XGBClassifier(n_estimators=100, learning_rate=0.01)
# Model won't converge

# ✅ Solution: Use early stopping to find the optimal number of trees.
# Give the model a generous tree budget and let the validation metric
# decide when to stop. Note: early_stopping_rounds belongs in the
# constructor — passing it to fit() was deprecated in XGBoost 1.6 and
# removed in 2.0.
model = XGBClassifier(
    n_estimators=5000,
    learning_rate=0.01,
    early_stopping_rounds=50
)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)]
)
# Will stop when validation score stops improving
XGBoost uses max_depth, LightGBM uses num_leaves. They're related but different!
# XGBoost: max_depth controls tree depth
model_xgb = XGBClassifier(max_depth=6) # Tree can have up to 2^6 = 64 leaves
# LightGBM: num_leaves controls number of leaves directly
model_lgb = LGBMClassifier(num_leaves=31) # At most 31 leaves (upper bound, not exact)
# ⚠️ Relationship: a depth-d binary tree has at most 2^d leaves; LightGBM's
# docs recommend num_leaves < 2^max_depth to curb overfitting.
# But LightGBM grows trees leaf-wise (faster, more accurate)
# XGBoost grows trees level-wise (more conservative)
Feature importance can reveal data leakage.
# ✅ Always check feature importance
model.fit(X_train, y_train)
# Pair each column name with the model's importance score and sort so the
# most influential features come first. (The meaning of the score depends
# on the library's importance_type default — verify which one is in use.)
importance = pd.DataFrame({
'feature': X_train.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(importance.head(10))
# 🚨 Red flags for data leakage:
# 1. One feature has >>90% importance (suspicious)
# 2. ID columns have high importance (leakage!)
# 3. Target-derived features (leakage!)
# 4. Future information in time series (leakage!)
XGBoost and LightGBM have revolutionized machine learning on tabular data. Their combination of speed, accuracy, and interpretability makes them the go-to choice for structured data problems. Master these libraries, and you'll have a powerful tool for the vast majority of real-world ML tasks.
Weekly Installs
105
Repository
GitHub Stars
6
First Seen
Feb 8, 2026
Security Audits
Gen Agent Trust HubPassSocketPassSnykPass
Installed on
gemini-cli96
opencode96
codex96
github-copilot93
cursor90
kimi-cli88
AI Elements:基于shadcn/ui的AI原生应用组件库,快速构建对话界面
69,600 周安装