forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 1
/
plot_adaboost_multiclass.py
101 lines (78 loc) · 3.04 KB
/
plot_adaboost_multiclass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
=====================================
Multi-class AdaBoosted Decision Trees
=====================================
This example reproduces Figure 1 of Zhu et al [1] and shows how boosting can
improve prediction accuracy on a multi-class problem. The classification
dataset is constructed by taking a ten-dimensional standard normal distribution
and defining three classes separated by nested concentric ten-dimensional
spheres such that roughly equal numbers of samples are in each class (quantiles
of the :math:`\Chi^2` distribution).
The performance of the SAMME and SAMME.R [1] algorithms are compared.
The error of each algorithm on the test set after each boosting iteration is
shown on the left, the classification error on the test set of each tree is
shown in the middle, and the boost weight of each tree is shown on the right.
.. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
"""
print __doc__
# Author: Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD
from itertools import izip
import pylab as pl
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets.samples_generator import make_gaussian_quantiles
from sklearn.metrics import accuracy_score
X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,
n_classes=3, random_state=1)
n_split = 3000
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]
bdt_real = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=2),
n_estimators=600,
learning_rate=1)
bdt_discrete = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=2),
n_estimators=600,
learning_rate=1.5,
algorithm="SAMME")
bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)
real_test_errors = []
discrete_test_errors = []
for real_test_predict, discrete_train_predict in izip(
bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
real_test_errors.append(
1. - accuracy_score(real_test_predict, y_test))
discrete_test_errors.append(
1. - accuracy_score(discrete_train_predict, y_test))
n_trees = xrange(1, len(bdt_discrete) + 1)
pl.figure(figsize=(15, 5))
pl.subplot(131)
pl.plot(n_trees, discrete_test_errors, c='black', label='SAMME')
pl.plot(n_trees, real_test_errors, c='black',
linestyle='dashed', label='SAMME.R')
pl.legend()
pl.ylim(0.18, 0.62)
pl.ylabel('Test Error')
pl.xlabel('Number of Trees')
pl.subplot(132)
pl.plot(n_trees, bdt_discrete.errors_, "b", label='SAMME')
pl.plot(n_trees, bdt_real.errors_, "r", label='SAMME.R')
pl.legend()
pl.ylabel('Error')
pl.xlabel('Tree')
pl.ylim((.2, max(bdt_real.errors_.max(), bdt_discrete.errors_.max()) * 1.2))
pl.xlim((-20, len(bdt_discrete) + 20))
pl.subplot(133)
pl.plot(n_trees, bdt_discrete.estimator_weights_, "b", label='SAMME')
pl.legend()
pl.ylabel('Weight')
pl.xlabel('Tree')
pl.ylim((0, bdt_discrete.estimator_weights_.max() * 1.2))
pl.xlim((-20, len(bdt_discrete) + 20))
# prevent overlapping y-axis labels
pl.subplots_adjust(wspace=0.25)
pl.show()