Isolation Forest is a classic anomaly detection algorithm that scales to large, high-dimensional datasets.
Isolation Forest assumes anomalies are "few and different": compared with normal samples they are rare, and their feature values deviate strongly, which makes them easier to isolate. The algorithm isolates samples by recursively partitioning the data with binary trees: because anomalies are easy to isolate, they end up close to the root, while normal samples sit deeper in the tree. Each such tree is called an isolation tree (iTree), and a collection of iTrees forms an iForest; anomalies are the instances with short average path lengths across the iTrees.
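To make the intuition concrete, here is a minimal, self-contained Python sketch (not sklearn's implementation; the isolation_depth helper, sample sizes, and test points are illustrative assumptions) that counts how many random axis-aligned splits it takes to isolate a point:

import numpy as np

rng = np.random.default_rng(0)

def isolation_depth(X, x, depth=0, max_depth=50):
    """Count the random splits needed to isolate point x within sample X."""
    if len(X) <= 1 or depth >= max_depth:
        return depth
    j = rng.integers(X.shape[1])           # pick a random feature
    lo, hi = X[:, j].min(), X[:, j].max()
    if lo == hi:
        return depth
    t = rng.uniform(lo, hi)                # pick a random split value
    mask = X[:, j] < t
    X_next = X[mask] if x[j] < t else X[~mask]
    return isolation_depth(X_next, x, depth + 1, max_depth)

X = rng.normal(0, 1, size=(256, 2))           # a cluster of normal samples
inlier, outlier = X[0], np.array([8.0, 8.0])  # the outlier is "few and different"
avg_depth = lambda p: np.mean([isolation_depth(X, p) for _ in range(100)])
print("average depth, inlier :", avg_depth(inlier))    # deep in the tree
print("average depth, outlier:", avg_depth(outlier))   # isolated near the root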
from sklearn.ensemble import IsolationForest
Note that IsolationForest flags whole samples: a row detected as an outlier cannot be turned into a missing value and re-imputed; it can only be dropped.
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif as MIC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import random
import warnings

warnings.filterwarnings("ignore")

# Load the raw training and validation sets
data_train = pd.read_csv("../原始数据集/train_10000.csv")
data_test = pd.read_csv("../原始数据集/validate_1000.csv")

# Features: every column except the first and the last; label: the last column
X_val_df = data_test.iloc[:, 1:-1]
y_val_df = data_test.iloc[:, -1:]
X_train_df = data_train.iloc[:, 1:-1]
y_train_df = data_train.iloc[:, -1:]
# Impute missing values with each column's median
df = X_train_df.fillna(X_train_df.median())
X_train_df = X_train_df.fillna(X_train_df.median())
from sklearn.ensemble import IsolationForest
import numpy as np

# Fit an isolation forest on the imputed training features;
# predict() returns +1 for inliers and -1 for outliers
clf = IsolationForest(contamination=0.05, n_estimators=200, random_state=42)
clf.fit(df.values)
y_pred = clf.predict(df.values)

print("Outlier predictions:")
print(y_pred)
Outlier predictions:
[1 1 1 ... 1 1 1]
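If you want a ranking instead of the hard ±1 labels, sklearn's IsolationForest also exposes continuous anomaly scores; a minimal sketch using the fitted clf from above:

# decision_function gives a shifted anomaly score (negative ≈ outlier);
# score_samples returns the opposite of the score s(x, n) from the iForest paper
scores = clf.decision_function(df.values)
print("ten most anomalous rows:", np.argsort(scores)[:10])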
# Save the ±1 labels; use a fresh name so the feature frame df is not overwritten
pred_df = pd.DataFrame(y_pred)
pred_df.to_csv('output.csv', index=False)
bool_array = y_pred == -1      # True for rows flagged as outliers
df_filtered = df[~bool_array]  # keep only the inlier rows
df_filtered
(Output of df_filtered: the filtered training features, columns feature0 … feature106 — 9500 rows × 107 columns)
y_train_flt = y_train_df[~bool_array]  # drop the labels of the filtered-out rows
X_train_flt = df_filtered.values
X_train = X_train_df.values
y_train = y_train_df.values
y_train_flt = y_train_flt.values
X_train.shape, X_train_flt.shape, y_train.shape, y_train_flt.shape
((10000, 107), (9500, 107), (10000, 1), (9500, 1))
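The shapes confirm the effect of contamination=0.05: exactly 500 of the 10,000 training rows (5%) were flagged as outliers and dropped, along with their labels.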
# clf1 trains on the full data, clf2 on the outlier-filtered data
clf1 = XGBClassifier()
clf2 = XGBClassifier()
clf1.fit(X_train, y_train)
clf2.fit(X_train_flt, y_train_flt)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
predictor='auto', random_state=0, reg_alpha=0, ...)
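The objective='multi:softprob' in the repr shows that XGBoost auto-detected a multi-class problem; as the reports below confirm, the label column has six classes (0–5).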
from sklearn.metrics import classification_report

X_val = X_val_df.values
y_val = y_val_df.values
print(classification_report(y_val, clf1.predict(X_val)))
print(classification_report(y_val, clf2.predict(X_val)))
precision recall f1-score support
0 0.58 0.74 0.65 176
1 0.48 0.67 0.56 166
2 0.64 0.40 0.49 171
3 0.96 0.89 0.92 169
4 0.89 0.74 0.81 156
5 1.00 0.93 0.96 162
accuracy 0.73 1000
macro avg 0.76 0.73 0.73 1000
weighted avg 0.75 0.73 0.73 1000
precision recall f1-score support
0 0.78 0.64 0.70 176
1 0.47 0.76 0.58 166
2 0.68 0.50 0.58 171
3 0.94 0.90 0.92 169
4 0.90 0.88 0.89 156
5 0.99 0.91 0.95 162
accuracy 0.76 1000
macro avg 0.79 0.76 0.77 1000
weighted avg 0.79 0.76 0.77 1000
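Dropping the 500 flagged rows lifts validation accuracy from 0.73 to 0.76 and macro-F1 from 0.73 to 0.77. The contamination=0.05 rate was picked by hand, though; a hedged sketch for sweeping it, reusing the variables defined above (the loop and the candidate values are illustrative assumptions, not part of the original experiment):

for c in (0.01, 0.03, 0.05, 0.10):
    # Hypothetical sweep: refit the filter and the downstream classifier
    # for each candidate contamination rate, compare validation macro-F1
    iso = IsolationForest(contamination=c, n_estimators=200, random_state=42)
    keep = iso.fit_predict(X_train) != -1   # True for inlier rows
    model = XGBClassifier()
    model.fit(X_train[keep], y_train[keep].ravel())
    print(f"contamination={c}:",
          f1_score(y_val.ravel(), model.predict(X_val), average="macro"))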