Jupyter Snippet CB2nd 06_random_forest

Jupyter Snippet CB2nd 06_random_forest

8.6. Using a random forest to select important features for regression

import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.datasets as skd
import sklearn.ensemble as ske
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Boston housing dataset and split it into the feature matrix and
# the regression target.
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn release.
data = skd.load_boston()
X = data['data']
y = data['target']

# Fit a random forest regressor with its default hyperparameters.
reg = ske.RandomForestRegressor()
reg.fit(X, y)

# Rank the features from most to least important: argsort is ascending, so
# the index array is reversed.
fet_ind = np.argsort(reg.feature_importances_)[::-1]
fet_imp = reg.feature_importances_[fet_ind]
labels = data['feature_names'][fet_ind]

# Bar chart of the sorted importances, labelled with the feature names.
fig, ax = plt.subplots(1, 1, figsize=(8, 3))
# `kind` is passed by keyword: recent pandas versions raise a TypeError when
# `Series.plot` is called with positional arguments.
pd.Series(fet_imp, index=labels).plot(kind='bar', ax=ax)
ax.set_title('Features importance')

[Figure: bar chart of the feature importances, sorted in decreasing order]

# Scatter the target against the last column of X, which the axis label
# below identifies as the LSTAT indicator.
fig, ax = plt.subplots()
lstat = X[:, -1]
ax.scatter(lstat, y)
ax.set_xlabel('LSTAT indicator')
ax.set_ylabel('Value of houses (k$)')

[Figure: scatter plot of house values against the LSTAT indicator]

from sklearn import tree

# Write the first decision tree of the fitted forest to 'tree.dot' in
# Graphviz format.
first_tree = reg.estimators_[0]
tree.export_graphviz(first_tree, out_file='tree.dot')