Feature Engineering by aj-geddes/useful-ai-prompts
npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'Feature Engineering'
特征工程通过领域知识和数学变换来创建和转换特征,以提高模型性能、可解释性和泛化能力。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures,
OneHotEncoder, OrdinalEncoder, LabelEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
# Create the sample dataset: three uniformly distributed numeric columns,
# two categorical columns and a binary `purchased` column, 1000 rows each.
# The seed is fixed so every run produces the same data.
np.random.seed(42)
df = pd.DataFrame({
    'age': np.random.uniform(18, 80, 1000),
    'income': np.random.uniform(20000, 150000, 1000),
    'experience_years': np.random.uniform(0, 50, 1000),
    'category': np.random.choice(['A', 'B', 'C'], 1000),
    'city': np.random.choice(['NYC', 'LA', 'Chicago'], 1000),
    'purchased': np.random.choice([0, 1], 1000),
})
print("原始数据:")
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()
# 1. Categorical encoding: three ways of turning the string columns into numbers.

# One-hot encoding; the first level of each column is dropped.
print("\n1. 独热编码:")
df_ohe = pd.get_dummies(df, columns=['category', 'city'], drop_first=True)
print(df_ohe.head())

# Ordinal encoding of `category` (the encoder takes a 2-D, single-column frame).
print("\n2. 序数编码:")
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(df[['category']])
df['category_ordinal'] = ordinal_encoder.transform(df[['category']])
print(df.loc[:, ['category', 'category_ordinal']].head())

# Label encoding of `city` (the encoder takes the 1-D column).
print("\n3. 标签编码:")
le = LabelEncoder()
le.fit(df['city'])
df['city_encoded'] = le.transform(df['city'])
print(df.loc[:, ['city', 'city_encoded']].head())
# 2. Feature scaling: fit three scalers on the same numeric columns.
print("\n4. 特征缩放:")
X = df[['age', 'income', 'experience_years']].copy()

# StandardScaler (mean=0, std=1)
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)

# MinMaxScaler [0, 1]
minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)

# RobustScaler (resistant to outliers)
robust_scaler = RobustScaler()
X_robust = robust_scaler.fit_transform(X)

# Visualization: one histogram of the age column (column 0 of each scaled
# array) per scaler, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
_age_panels = [
    (axes[0, 0], X['age'], '原始年龄'),
    (axes[0, 1], X_standard[:, 0], 'StandardScaler 年龄'),
    (axes[1, 0], X_minmax[:, 0], 'MinMaxScaler 年龄'),
    (axes[1, 1], X_robust[:, 0], 'RobustScaler 年龄'),
]
for _panel, _age_values, _panel_title in _age_panels:
    _panel.hist(_age_values, bins=30, edgecolor='black')
    _panel.set_title(_panel_title)
plt.tight_layout()
plt.show()
# 3. Polynomial features: expand `age` into degree-1 and degree-2 terms
# (no bias column).
print("\n5. 多项式特征:")
X_simple = df[['age']].copy()
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_simple)
X_poly = poly.transform(X_simple)
X_poly_df = pd.DataFrame(data=X_poly, columns=['age', 'age^2'])
print(X_poly_df.head())

# Visualization: scatter of age against income.
plt.figure(figsize=(12, 5))
plt.scatter(df['age'], df['income'], alpha=0.5)
plt.title('Age vs Income')
plt.xlabel('Age')
plt.ylabel('Income')
plt.grid(True, alpha=0.3)
plt.show()
# 4. Feature interactions: a scaled product and a ratio of existing columns.
print("\n6. 特征交互:")
df['age_income_interaction'] = df['age'].mul(df['income']).div(10000)
# The +1 in the denominator avoids dividing by zero years of experience.
df['age_experience_ratio'] = df['age'].div(df['experience_years'].add(1))
print(df[['age', 'income', 'age_income_interaction', 'age_experience_ratio']].head())

# 5. Domain-specific transformations: fixed age bands, income terciles,
# and log / square-root transforms.
print("\n7. 领域特定特征:")
_age_edges = [0, 30, 45, 60, 100]
_age_labels = ['Young', 'Middle', 'Senior', 'Retired']
df['age_group'] = pd.cut(df['age'], bins=_age_edges, labels=_age_labels)
_income_labels = ['Low', 'Medium', 'High']
df['income_level'] = pd.qcut(df['income'], q=3, labels=_income_labels)
df['log_income'] = np.log1p(df['income'])
df['sqrt_experience'] = np.sqrt(df['experience_years'])
print(df[['age', 'age_group', 'income', 'income_level', 'log_income']].head())
# 6. Temporal features (if date data is available): attach one consecutive
# daily date per row, then derive calendar parts from it.
print("\n8. 时间特征:")
dates = pd.date_range('2023-01-01', periods=df.shape[0])
df['date'] = dates
# (new column name, datetime accessor attribute), in column-creation order.
_calendar_parts = (
    ('year', 'year'),
    ('month', 'month'),
    ('day_of_week', 'dayofweek'),
    ('quarter', 'quarter'),
)
for _column_name, _dt_attribute in _calendar_parts:
    df[_column_name] = getattr(df['date'].dt, _dt_attribute)
# dayofweek values of 5 and above are flagged as weekend.
df['is_weekend'] = df['day_of_week'].ge(5)
print(df[['date', 'year', 'month', 'day_of_week', 'is_weekend']].head())
# 7. Feature standardization pipeline: scale the numeric columns and one-hot
# encode the categorical ones in a single ColumnTransformer.
print("\n9. 特征工程流水线:")

# Separate numerical and categorical features
numerical_features = ['age', 'income', 'experience_years']
categorical_features = ['category', 'city']

# Create the preprocessing pipeline, one (name, transformer, columns) step
# per feature type.
_numeric_step = ('num', StandardScaler(), numerical_features)
_categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[_numeric_step, _categorical_step])

_pipeline_columns = [*numerical_features, *categorical_features]
X_processed = preprocessor.fit_transform(df[_pipeline_columns])
print(f"处理后形状: {X_processed.shape}")
# 8. Feature statistics: summary table for the numeric columns plus two
# 0/1 indicator columns.
print("\n10. 特征统计:")
X_for_stats = df[numerical_features].copy()
X_for_stats['category_A'] = df['category'].eq('A').astype(int)
X_for_stats['city_NYC'] = df['city'].eq('NYC').astype(int)

# One entry per output column; each statistic is a Series indexed by
# feature name.
_summary_columns = {
    'Feature': X_for_stats.columns,
    'Mean': X_for_stats.mean(),
    'Std': X_for_stats.std(),
    'Min': X_for_stats.min(),
    'Max': X_for_stats.max(),
    'Skewness': X_for_stats.skew(),
    'Kurtosis': X_for_stats.kurt(),
}
feature_stats = pd.DataFrame(_summary_columns)
print(feature_stats)
# 9. Feature correlations: heatmap on the left, distribution of an
# engineered feature on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
_corr_ax, _hist_ax = axes

X_numeric = df[[*numerical_features, 'purchased']].copy()
corr_matrix = X_numeric.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=_corr_ax)
_corr_ax.set_title('特征相关性矩阵')

# Distribution of the engineered age-income interaction feature
_hist_ax.hist(df['age_income_interaction'], bins=30, edgecolor='black', alpha=0.7)
_hist_ax.set_title('年龄-收入交互项分布')
_hist_ax.set_xlabel('值')
_hist_ax.set_ylabel('频率')

plt.tight_layout()
plt.show()
# 10. Feature binning / discretization: equal-width and equal-frequency age
# bins, plus income bins with explicit edges.
print("\n11. 特征分箱:")
_age_column = df['age']
df['age_bin_equal'] = pd.cut(_age_column, bins=5)
df['age_bin_quantile'] = pd.qcut(_age_column, q=5)
_income_edges = [0, 50000, 100000, 150000]
df['income_bins'] = pd.cut(df['income'], bins=_income_edges)

# Print the per-bin row counts in bin order for both age binnings.
_bin_reports = (
    ("等宽分箱:", 'age_bin_equal'),
    ("\n等频分箱:", 'age_bin_quantile'),
)
for _heading, _bin_column in _bin_reports:
    print(_heading)
    print(df[_bin_column].value_counts().sort_index())
# 11. Missing value creation and handling: blank out 50 distinct rows of
# `age` in a copy of the data, then fill them back in three ways.
print("\n12. 缺失值插补:")
df_with_missing = df.copy()
missing_indices = np.random.choice(len(df), 50, replace=False)
df_with_missing.loc[missing_indices, 'age'] = np.nan

# Mean imputation
age_mean = df_with_missing['age'].mean()
df_with_missing['age_imputed_mean'] = df_with_missing['age'].fillna(age_mean)

# Median imputation
age_median = df_with_missing['age'].median()
df_with_missing['age_imputed_median'] = df_with_missing['age'].fillna(age_median)

# Forward fill. Series.ffill() is used instead of fillna(method='ffill'):
# the `method` keyword of fillna is deprecated since pandas 2.1.
# Note that forward fill leaves NaN in place if the first row(s) are missing.
df_with_missing['age_imputed_ffill'] = df_with_missing['age'].ffill()

print(df_with_missing[['age', 'age_imputed_mean', 'age_imputed_median']].head(10))

print("\n特征工程完成!")
# The raw dataset consists of exactly these six columns. Subtracting a fixed
# 5 from the final column count does not give the number of original columns.
original_columns = ['age', 'income', 'experience_years', 'category', 'city', 'purchased']
print(f"原始特征数量: {len(original_columns)}")
print(f"最终可用特征数量: {len(df.columns)}")
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
每周安装量
0
代码仓库
GitHub 星标数
116
首次出现
1970年1月1日
安全审计
Feature engineering creates and transforms features to improve model performance, interpretability, and generalization through domain knowledge and mathematical transformations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures,
OneHotEncoder, OrdinalEncoder, LabelEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
# Create the sample dataset: three uniformly distributed numeric columns,
# two categorical columns and a binary `purchased` column, 1000 rows each.
# The seed is fixed so every run produces the same data.
np.random.seed(42)
df = pd.DataFrame({
    'age': np.random.uniform(18, 80, 1000),
    'income': np.random.uniform(20000, 150000, 1000),
    'experience_years': np.random.uniform(0, 50, 1000),
    'category': np.random.choice(['A', 'B', 'C'], 1000),
    'city': np.random.choice(['NYC', 'LA', 'Chicago'], 1000),
    'purchased': np.random.choice([0, 1], 1000),
})
print("Original Data:")
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()
# 1. Categorical encoding: three ways of turning the string columns into numbers.

# One-hot encoding; the first level of each column is dropped.
print("\n1. One-Hot Encoding:")
df_ohe = pd.get_dummies(df, columns=['category', 'city'], drop_first=True)
print(df_ohe.head())

# Ordinal encoding of `category` (the encoder takes a 2-D, single-column frame).
print("\n2. Ordinal Encoding:")
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(df[['category']])
df['category_ordinal'] = ordinal_encoder.transform(df[['category']])
print(df.loc[:, ['category', 'category_ordinal']].head())

# Label encoding of `city` (the encoder takes the 1-D column).
print("\n3. Label Encoding:")
le = LabelEncoder()
le.fit(df['city'])
df['city_encoded'] = le.transform(df['city'])
print(df.loc[:, ['city', 'city_encoded']].head())
# 2. Feature scaling: fit three scalers on the same numeric columns.
print("\n4. Feature Scaling:")
X = df[['age', 'income', 'experience_years']].copy()

# StandardScaler (mean=0, std=1)
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)

# MinMaxScaler [0, 1]
minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)

# RobustScaler (resistant to outliers)
robust_scaler = RobustScaler()
X_robust = robust_scaler.fit_transform(X)

# Visualization: one histogram of the age column (column 0 of each scaled
# array) per scaler, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
_age_panels = [
    (axes[0, 0], X['age'], 'Original Age'),
    (axes[0, 1], X_standard[:, 0], 'StandardScaler Age'),
    (axes[1, 0], X_minmax[:, 0], 'MinMaxScaler Age'),
    (axes[1, 1], X_robust[:, 0], 'RobustScaler Age'),
]
for _panel, _age_values, _panel_title in _age_panels:
    _panel.hist(_age_values, bins=30, edgecolor='black')
    _panel.set_title(_panel_title)
plt.tight_layout()
plt.show()
# 3. Polynomial features: expand `age` into degree-1 and degree-2 terms
# (no bias column).
print("\n5. Polynomial Features:")
X_simple = df[['age']].copy()
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_simple)
X_poly = poly.transform(X_simple)
X_poly_df = pd.DataFrame(data=X_poly, columns=['age', 'age^2'])
print(X_poly_df.head())

# Visualization: scatter of age against income.
plt.figure(figsize=(12, 5))
plt.scatter(df['age'], df['income'], alpha=0.5)
plt.title('Age vs Income')
plt.xlabel('Age')
plt.ylabel('Income')
plt.grid(True, alpha=0.3)
plt.show()
# 4. Feature interactions: a scaled product and a ratio of existing columns.
print("\n6. Feature Interactions:")
df['age_income_interaction'] = df['age'].mul(df['income']).div(10000)
# The +1 in the denominator avoids dividing by zero years of experience.
df['age_experience_ratio'] = df['age'].div(df['experience_years'].add(1))
print(df[['age', 'income', 'age_income_interaction', 'age_experience_ratio']].head())

# 5. Domain-specific transformations: fixed age bands, income terciles,
# and log / square-root transforms.
print("\n7. Domain-specific Features:")
_age_edges = [0, 30, 45, 60, 100]
_age_labels = ['Young', 'Middle', 'Senior', 'Retired']
df['age_group'] = pd.cut(df['age'], bins=_age_edges, labels=_age_labels)
_income_labels = ['Low', 'Medium', 'High']
df['income_level'] = pd.qcut(df['income'], q=3, labels=_income_labels)
df['log_income'] = np.log1p(df['income'])
df['sqrt_experience'] = np.sqrt(df['experience_years'])
print(df[['age', 'age_group', 'income', 'income_level', 'log_income']].head())
# 6. Temporal features (if date data is available): attach one consecutive
# daily date per row, then derive calendar parts from it.
print("\n8. Temporal Features:")
dates = pd.date_range('2023-01-01', periods=df.shape[0])
df['date'] = dates
# (new column name, datetime accessor attribute), in column-creation order.
_calendar_parts = (
    ('year', 'year'),
    ('month', 'month'),
    ('day_of_week', 'dayofweek'),
    ('quarter', 'quarter'),
)
for _column_name, _dt_attribute in _calendar_parts:
    df[_column_name] = getattr(df['date'].dt, _dt_attribute)
# dayofweek values of 5 and above are flagged as weekend.
df['is_weekend'] = df['day_of_week'].ge(5)
print(df[['date', 'year', 'month', 'day_of_week', 'is_weekend']].head())
# 7. Feature standardization pipeline: scale the numeric columns and one-hot
# encode the categorical ones in a single ColumnTransformer.
print("\n9. Feature Engineering Pipeline:")

# Separate numerical and categorical features
numerical_features = ['age', 'income', 'experience_years']
categorical_features = ['category', 'city']

# Create the preprocessing pipeline, one (name, transformer, columns) step
# per feature type.
_numeric_step = ('num', StandardScaler(), numerical_features)
_categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[_numeric_step, _categorical_step])

_pipeline_columns = [*numerical_features, *categorical_features]
X_processed = preprocessor.fit_transform(df[_pipeline_columns])
print(f"Processed shape: {X_processed.shape}")
# 8. Feature statistics: summary table for the numeric columns plus two
# 0/1 indicator columns.
print("\n10. Feature Statistics:")
X_for_stats = df[numerical_features].copy()
X_for_stats['category_A'] = df['category'].eq('A').astype(int)
X_for_stats['city_NYC'] = df['city'].eq('NYC').astype(int)

# One entry per output column; each statistic is a Series indexed by
# feature name.
_summary_columns = {
    'Feature': X_for_stats.columns,
    'Mean': X_for_stats.mean(),
    'Std': X_for_stats.std(),
    'Min': X_for_stats.min(),
    'Max': X_for_stats.max(),
    'Skewness': X_for_stats.skew(),
    'Kurtosis': X_for_stats.kurt(),
}
feature_stats = pd.DataFrame(_summary_columns)
print(feature_stats)
# 9. Feature correlations: heatmap on the left, distribution of an
# engineered feature on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
_corr_ax, _hist_ax = axes

X_numeric = df[[*numerical_features, 'purchased']].copy()
corr_matrix = X_numeric.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=_corr_ax)
_corr_ax.set_title('Feature Correlation Matrix')

# Distribution of the engineered age-income interaction feature
_hist_ax.hist(df['age_income_interaction'], bins=30, edgecolor='black', alpha=0.7)
_hist_ax.set_title('Age-Income Interaction Distribution')
_hist_ax.set_xlabel('Value')
_hist_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()
# 10. Feature binning / discretization: equal-width and equal-frequency age
# bins, plus income bins with explicit edges.
print("\n11. Feature Binning:")
_age_column = df['age']
df['age_bin_equal'] = pd.cut(_age_column, bins=5)
df['age_bin_quantile'] = pd.qcut(_age_column, q=5)
_income_edges = [0, 50000, 100000, 150000]
df['income_bins'] = pd.cut(df['income'], bins=_income_edges)

# Print the per-bin row counts in bin order for both age binnings.
_bin_reports = (
    ("Equal Width Binning:", 'age_bin_equal'),
    ("\nEqual Frequency Binning:", 'age_bin_quantile'),
)
for _heading, _bin_column in _bin_reports:
    print(_heading)
    print(df[_bin_column].value_counts().sort_index())
# 11. Missing value creation and handling: blank out 50 distinct rows of
# `age` in a copy of the data, then fill them back in three ways.
print("\n12. Missing Value Imputation:")
df_with_missing = df.copy()
missing_indices = np.random.choice(len(df), 50, replace=False)
df_with_missing.loc[missing_indices, 'age'] = np.nan

# Mean imputation
age_mean = df_with_missing['age'].mean()
df_with_missing['age_imputed_mean'] = df_with_missing['age'].fillna(age_mean)

# Median imputation
age_median = df_with_missing['age'].median()
df_with_missing['age_imputed_median'] = df_with_missing['age'].fillna(age_median)

# Forward fill. Series.ffill() is used instead of fillna(method='ffill'):
# the `method` keyword of fillna is deprecated since pandas 2.1.
# Note that forward fill leaves NaN in place if the first row(s) are missing.
df_with_missing['age_imputed_ffill'] = df_with_missing['age'].ffill()

print(df_with_missing[['age', 'age_imputed_mean', 'age_imputed_median']].head(10))

print("\nFeature Engineering Complete!")
# The raw dataset consists of exactly these six columns. Subtracting a fixed
# 5 from the final column count does not give the number of original columns.
original_columns = ['age', 'income', 'experience_years', 'category', 'city', 'purchased']
print(f"Original features: {len(original_columns)}")
print(f"Final features available: {len(df.columns)}")
Weekly Installs
0
Repository
GitHub Stars
116
First Seen
Jan 1, 1970
Security Audits
专业SEO审计工具:全面网站诊断、技术SEO优化与页面分析指南
57,600 周安装