特徵工程 特徵選擇 reliefF演算法


特徵工程 特徵選擇 reliefF演算法





return X[:, self.top_features[self.n_features_to_keep]]


return X[:, :self.top_features[self.n_features_to_keep]]


  1. 匯入環境
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time

# Report the version of every core library so the results below can be reproduced.
for module in (mpl, np, pd, sklearn):
    print(f"{module.__name__} {module.__version__}")

sys.version_info(major=3, minor=7, micro=7, releaselevel='final', serial=0)
matplotlib 3.3.1
numpy 1.19.1
pandas 1.1.1
sklearn 0.23.2

  1. 資料集
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
# Shared selection budget: the ReliefF default and the RFE call both read this value.
n_features_to_keep = 10

# Synthetic "Friedman #1" regression data: 5000 samples with 100 features each,
# generated with a fixed seed so every run sees the same matrix.
X, y = make_friedman1(random_state=0, n_features=100, n_samples=5000)
  1. reliefF
from sklearn.neighbors import KDTree

class ReliefF(object):
    """Feature selection using data-mined expert knowledge.

    Based on the ReliefF algorithm as introduced in:
    Kononenko, Igor et al. Overcoming the myopia of inductive learning
    algorithms with RELIEFF (1997), Applied Intelligence, 7(1), p39-55

    NOTE: feature values and labels are compared with exact equality
    (``==``). With continuous features or continuous targets exact matches
    between distinct samples are rare, so every feature tends to receive the
    same score and the resulting ranking carries little signal.
    """

    def __init__(self, n_neighbors=100, n_features_to_keep=n_features_to_keep):
        """Sets up ReliefF to perform feature selection.

        Parameters
        ----------
        n_neighbors: int (default: 100)
            The number of neighbors to consider when assigning feature importance scores.
            More neighbors results in more accurate scores, but takes longer.
        n_features_to_keep: int (default: the module-level ``n_features_to_keep``)
            The number of top-ranked features that ``transform`` retains.

        Returns
        -------
        None

        """
        self.feature_scores = None
        self.top_features = None
        self.tree = None
        self.n_neighbors = n_neighbors
        self.n_features_to_keep = n_features_to_keep

    def fit(self, X, y):
        """Computes the feature importance scores from the training data.

        Parameters
        ----------
        X: array-like {n_samples, n_features}
            Training instances to compute the feature importance scores from
        y: array-like {n_samples}
            Training labels

        Returns
        -------
        None
            The scores are stored in ``self.feature_scores`` and the feature
            indices, ordered from highest to lowest score, in
            ``self.top_features``.

        """
        X = np.asarray(X)
        y = np.asarray(y)

        self.feature_scores = np.zeros(X.shape[1])
        self.tree = KDTree(X)

        # One extra neighbor is requested because each sample is returned as
        # its own nearest match; clamp so that small datasets do not ask the
        # tree for more points than it holds.
        k = min(self.n_neighbors + 1, X.shape[0])

        for source_index in range(X.shape[0]):
            _, indices = self.tree.query(X[source_index].reshape(1, -1), k=k)

            # First match is self, so ignore it
            for neighbor_index in indices[0][1:]:
                similar_features = X[source_index] == X[neighbor_index]
                label_match = y[source_index] == y[neighbor_index]

                # If the labels match, then increment features that match and decrement features that do not match
                # Do the opposite if the labels do not match
                if label_match:
                    self.feature_scores[similar_features] += 1.
                    self.feature_scores[~similar_features] -= 1.
                else:
                    self.feature_scores[~similar_features] += 1.
                    self.feature_scores[similar_features] -= 1.

        # Highest-scoring feature first
        self.top_features = np.argsort(self.feature_scores)[::-1]

    def transform(self, X):
        """Reduces the feature set down to the top `n_features_to_keep` features.

        Parameters
        ----------
        X: array-like {n_samples, n_features}
            Feature matrix to perform feature selection on

        Returns
        -------
        X_reduced: array-like {n_samples, n_features_to_keep}
            Reduced feature matrix

        """
        return X[:, self.top_features[:self.n_features_to_keep]]

# Build the selector with its default settings (neighbor count and number of features to keep).
rel = ReliefF()
# Score every feature on the full dataset; fit() stores the per-feature scores
# and the descending feature ranking on the instance.
rel.fit(X, y)

[99 36 26 27 28 29 30 31 32 33 34 35 37 98 38 39 40 41 42 43 44 45 46 47
25 24 23 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 48 49 50 75 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 76 74 51 73 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
70 71 72 0]

  1. RFE
# Recursive feature elimination driven by a linear-kernel SVR: features are
# removed five at a time until `n_features_to_keep` of them remain.
estimator = SVR(kernel="linear")
rfe = RFE(estimator=estimator, step=5, n_features_to_select=n_features_to_keep).fit(X, y)

[ 1 1 19 1 1 13 12 4 14 7 1 10 8 18 7 19 14 5 10 4 13 14 7 17
3 16 18 8 3 8 6 6 13 18 2 5 12 17 12 2 17 10 9 11 7 15 9 16
9 2 8 4 18 5 15 4 2 6 3 9 10 2 1 1 4 16 12 11 13 11 8 11
5 17 6 1 1 16 19 13 19 19 15 3 11 1 14 12 10 6 17 7 14 18 3 15
16 9 15 5]
