计算特征之间的相关性,并移除高度相关的特征(相关系数的绝对值大于 0.95)
import numpy as np
import pandas as pd

# Absolute Pearson correlation above which a feature is treated as redundant.
CORR_THRESHOLD = 0.95

# Build a small example data frame.
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [1, 1, 2, 2, 3],
    'D': [5, 3, 2, 4, 5],
    'E': [2, 4, 6, 8, 10],
}
X = pd.DataFrame(data)
print("原始数据框:")
print(X)

# Absolute correlation matrix: strong negative correlation counts as
# redundancy just like strong positive correlation.
corr_matrix = X.corr().abs()
print("\n相关矩阵:")
print(corr_matrix)

# Keep only the strict upper triangle (k=1 skips the diagonal). Every pair is
# then looked at once, and a column is only compared with the columns that
# come before it, so the first member of a correlated pair is kept.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
print("\n上三角矩阵:")
print(upper)

# Columns with at least one correlation above the threshold. NaN cells (the
# masked lower triangle and diagonal) compare as False and are ignored.
to_drop = [
    column for column in upper.columns
    if (upper[column] > CORR_THRESHOLD).any()
]
print("\n要移除的特征:")
print(to_drop)

# Drop the redundant features.
X_reduced = X.drop(to_drop, axis=1)
print("\n去除高度相关后的数据框:")
print(X_reduced)
提取并展示模型中各特征的重要性,并分别列出重要性为零的特征和重要性不为零的特征。
import lightgbm as lgb
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer data set into a DataFrame / Series pair.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Split into train and test sets; only the training part is used below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap the training data in a LightGBM dataset.
d_train = lgb.Dataset(X_train, label=y_train)

# Train a binary-classification LightGBM model.
model = lgb.train(
    params={'objective': 'binary'},
    train_set=d_train,
    num_boost_round=100,
)

# Pair every feature with its importance. 'split' is LightGBM's default
# importance type; it is spelled out here so the meaning of a zero is explicit.
# The importances are aligned with X.columns because the model was trained on
# X_train, which has the same column order.
fea_iptc = pd.DataFrame({
    'feature_name': X.columns,
    'importance': model.feature_importance(importance_type='split'),
})

# Show the feature-importance table.
print(fea_iptc)

# Compute the zero-importance mask once and reuse it for both lists.
is_zero = fea_iptc['importance'] == 0

# Features with zero importance.
fea_importance_0 = fea_iptc.loc[is_zero, 'feature_name'].tolist()
print('fea_importance_0: ', len(fea_importance_0), fea_importance_0)

# Features with non-zero importance.
fea_importance_not_0 = fea_iptc.loc[~is_zero, 'feature_name'].tolist()
print('fea_importance_not_0: ', len(fea_importance_not_0), fea_importance_not_0)
使用信息值(IV)和群体稳定性指标(PSI)筛选特征:保留 IV 不低于阈值且 PSI 低于阈值的特征
import pandas as pd

# Example data: feature names with their IV and PSI values.
data = {
    'name': ['feature1', 'feature2', 'feature3', 'feature4', 'feature5'],
    'iv': [0.02, 0.15, 0.03, 0.01, 0.5],     # IV values
    'psi': [0.04, 0.02, 0.06, 0.07, 0.03],   # PSI values
}

# Build the statistics table.
iv_df = pd.DataFrame(data)


def select_features_by_iv_psi(
    feature_stats: pd.DataFrame,
    iv_threshold: float = 0.01,
    psi_threshold: float = 0.05,
) -> list:
    """Select features by IV and PSI.

    A feature is kept when its IV is at least ``iv_threshold`` (inclusive)
    and its PSI is strictly below ``psi_threshold``.

    Args:
        feature_stats: Table with one row per feature and the columns
            ``'name'``, ``'iv'`` and ``'psi'``.
        iv_threshold: Minimum IV (inclusive) a feature must reach.
        psi_threshold: Upper PSI bound (exclusive) a feature must stay under.

    Returns:
        The names of the selected features, in their original row order.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    missing = {'name', 'iv', 'psi'} - set(feature_stats.columns)
    if missing:
        raise KeyError(f"feature_stats is missing columns: {sorted(missing)}")

    # One combined mask avoids the intermediate frame and chained indexing.
    # Rows with NaN in either column compare as False and are dropped.
    keep = (
        (feature_stats['iv'] >= iv_threshold)
        & (feature_stats['psi'] < psi_threshold)
    )
    return feature_stats.loc[keep, 'name'].tolist()


# Candidate features to consider.
selected_features_list = ['feature1', 'feature2', 'feature3']

# Restrict the statistics table to the candidates.
filtered_iv_df = iv_df[iv_df['name'].isin(selected_features_list)]

# Apply the IV / PSI filter.
selected_features = select_features_by_iv_psi(filtered_iv_df)

# Report the result.
print("Selected features:", len(selected_features), selected_features)