Isolation Forests

Isolation Forest is a classic anomaly detection algorithm that scales to large, high-dimensional datasets.

Isolation Forest assumes that anomalous samples are "few and different": compared with normal samples, they are rare and their feature values deviate strongly. As a result, anomalies are easier to isolate. Isolation Forest isolates samples by building binary trees — because anomalies are easy to isolate, they tend to end up closer to the root, while normal samples sit deeper in the tree. Each such tree is called an Isolation Tree (iTree), and a collection of iTrees forms an iForest. Anomalies are the instances with short average path lengths across the iTrees.

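In the original paper, the anomaly score of a point x is s(x, n) = 2^(-E[h(x)] / c(n)), where E[h(x)] is the average path length of x over the iTrees and c(n) is the average path length of an unsuccessful binary-search-tree lookup on n samples, used for normalisation. Below is a minimal, self-contained sketch of this formula with toy numbers; it is not tied to the dataset used later.

import numpy as np

EULER_GAMMA = 0.5772156649  # Euler–Mascheroni constant

def c(n):
    # Average path length of an unsuccessful BST search over n samples:
    # c(n) = 2 * H(n - 1) - 2 * (n - 1) / n, with H(i) ≈ ln(i) + gamma.
    if n <= 1:
        return 0.0
    return 2.0 * (np.log(n - 1) + EULER_GAMMA) - 2.0 * (n - 1) / n

def anomaly_score(avg_path_length, n):
    # s(x, n) = 2 ** (-E[h(x)] / c(n)):
    # scores close to 1 suggest anomalies, scores well below 0.5 suggest normal points.
    return 2.0 ** (-avg_path_length / c(n))

print(anomaly_score(3.0, 256))   # short average path  -> high score (likely anomaly)
print(anomaly_score(12.0, 256))  # long average path   -> score below 0.5 (likely normal)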
from sklearn.ensemble import IsolationForest

IsolationForest does not let you replace a detected outlier with a missing value; the only built-in option is to drop the sample.
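If imputing the flagged samples is preferred over dropping them, the mask has to be built by hand. A minimal sketch of that workaround (the toy data and variable names here are illustrative, not part of the pipeline below):

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Toy data just to illustrate the masking idea
X = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])

iso = IsolationForest(contamination=0.1, random_state=0)
outlier_mask = iso.fit_predict(X) == -1        # True for rows flagged as outliers

X_masked = X.copy()
X_masked.loc[outlier_mask] = np.nan            # set outlier rows to NaN instead of dropping them
X_imputed = X_masked.fillna(X_masked.median()) # then impute, e.g. with the column medians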

Code

import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif as MIC

from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import random
import warnings
warnings.filterwarnings("ignore")  # suppress warning messages
data_train = pd.read_csv("../原始数据集/train_10000.csv")
data_test = pd.read_csv("../原始数据集/validate_1000.csv")
# Split out features (columns 1 to second-to-last) and labels (last column)
X_val_df = data_test.iloc[:, 1:-1]
y_val_df = data_test.iloc[:, -1:]
X_train_df = data_train.iloc[:, 1:-1]
y_train_df = data_train.iloc[:, -1:]
# Impute missing values with the column medians (df is the copy used for outlier detection below)
df = X_train_df.fillna(X_train_df.median())
X_train_df = X_train_df.fillna(X_train_df.median())
from sklearn.ensemble import IsolationForest
import numpy as np

# Build the Isolation Forest model
# contamination specifies the expected proportion of outliers in the data
clf = IsolationForest(contamination=0.05, n_estimators=200, random_state=42)

# Fit the model on the imputed training data and predict outliers (1 = inlier, -1 = outlier)
clf.fit(df.values)
y_pred = clf.predict(df.values)

# Print the outlier predictions
print("Outlier predictions:")
print(y_pred)
Outlier predictions:
[1 1 1 ... 1 1 1]
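Besides the binary labels from predict, a fitted IsolationForest also exposes continuous scores through score_samples and decision_function, which are useful for ranking samples rather than hard-thresholding them. A small sketch reusing the clf and df from the cell above:

# Continuous anomaly scores from the fitted model (reuses clf and df from above)
scores = clf.score_samples(df.values)          # lower (more negative) = more anomalous
decisions = clf.decision_function(df.values)   # negative values are the samples predict() labels -1
print(scores[:5])
print((decisions < 0).sum())                   # roughly contamination * n_samples = 0.05 * 10000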
# Save the predictions without overwriting df (which still holds the imputed training features)
pred_df = pd.DataFrame(y_pred)
pred_df.to_csv('output.csv', index=False)
bool_array = y_pred == -1            # True for samples flagged as outliers
# Remove the flagged samples from the imputed training features
df_filtered = df[~bool_array]

# Show the DataFrame after removing the outliers
df_filtered

(Output: df_filtered, 9500 rows × 107 columns — feature0 … feature106 — after removing the 500 rows flagged as outliers.)

# Apply the same mask to the training labels so features and labels stay aligned
y_train_flt = y_train_df[~bool_array]
# Convert the full and the filtered training sets to NumPy arrays
X_train_flt = df_filtered.values
X_train = X_train_df.values
y_train = y_train_df.values
y_train_flt = y_train_flt.values
X_train.shape, X_train_flt.shape, y_train.shape, y_train_flt.shape
((10000, 107), (9500, 107), (10000, 1), (9500, 1))
# clf1 is trained on the full training set, clf2 on the outlier-filtered set
clf1 = XGBClassifier()
clf2 = XGBClassifier()
clf1.fit(X_train, y_train)
clf2.fit(X_train_flt, y_train_flt)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)
from sklearn.metrics import classification_report
X_val = X_val_df.values
y_val = y_val_df.values
# Compare the model trained on the full data (clf1) with the one trained on the filtered data (clf2)
print(classification_report(y_val, clf1.predict(X_val)))
print(classification_report(y_val, clf2.predict(X_val)))
              precision    recall  f1-score   support

           0       0.58      0.74      0.65       176
           1       0.48      0.67      0.56       166
           2       0.64      0.40      0.49       171
           3       0.96      0.89      0.92       169
           4       0.89      0.74      0.81       156
           5       1.00      0.93      0.96       162

    accuracy                           0.73      1000
   macro avg       0.76      0.73      0.73      1000
weighted avg       0.75      0.73      0.73      1000

              precision    recall  f1-score   support

           0       0.78      0.64      0.70       176
           1       0.47      0.76      0.58       166
           2       0.68      0.50      0.58       171
           3       0.94      0.90      0.92       169
           4       0.90      0.88      0.89       156
           5       0.99      0.91      0.95       162

    accuracy                           0.76      1000
   macro avg       0.79      0.76      0.77      1000
weighted avg       0.79      0.76      0.77      1000