forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 2
/
document_classification_20newsgroups.py
172 lines (137 loc) · 5.09 KB
/
document_classification_20newsgroups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
======================================================
Classification of text documents using sparse features
======================================================
This is an example showing how the scikit-learn can be used to classify
documents by topics using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.
The dataset used in this example is the 20 newsgroups dataset which will be
automatically downloaded and then cached.
You can adjust the number of categories by giving there name to the dataset
loader or setting them to None to get the 20 of them.
This example demos various linear classifiers with different training
strategies.
To run this example use::
% python examples/document_classification_20newsgroups.py [options]
Options are:
--report
Print a detailed classification report.
--confusion-matrix
Print the confusion matrix.
"""
print __doc__
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Mathieu Blondel <mathieu@mblondel.org>
# License: Simplified BSD
from time import time
import os
import sys
from scikits.learn.datasets import load_files
from scikits.learn.feature_extraction.text.sparse import Vectorizer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.linear_model.stochastic_gradient.sparse import SGDClassifier
from scikits.learn import metrics
# parse commandline arguments
argv = sys.argv[1:]
if "--report" in argv:
print_report = True
else:
print_report = False
if "--confusion-matrix" in argv:
print_cm = True
else:
print_cm = False
################################################################################
# Download the data, if not already on disk
url = "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-18828.tar.gz"
archive_name = "20news-18828.tar.gz"
if not os.path.exists(archive_name[:-7]):
if not os.path.exists(archive_name):
import urllib
print "Downloading data, please Wait (14MB)..."
print url
opener = urllib.urlopen(url)
open(archive_name, 'wb').write(opener.read())
print
import tarfile
print "Decompressiong the archive: " + archive_name
tarfile.open(archive_name, "r:gz").extractall()
print
################################################################################
# Load some categories from the training set
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None
print "Loading 20 newsgroups dataset for categories:"
print categories
data = load_files('20news-18828', categories=categories, shuffle=True, rng=42)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print
# split a training set and a test set
filenames = data.filenames
y = data.target
n = filenames.shape[0]
filenames_train, filenames_test = filenames[:-n/2], filenames[-n/2:]
y_train, y_test = y[:-n/2], y[-n/2:]
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read() for f in filenames_train))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print
print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in filenames_test))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print
################################################################################
# Benchmark classifiers
def benchmark(clf):
print 80 * '_'
print "Training: "
print clf
t0 = time()
clf.fit(X_train, y_train)
train_time = time() - t0
print "train time: %0.3fs" % train_time
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print "test time: %0.3fs" % test_time
score = metrics.f1_score(y_test, pred)
print "f1-score: %0.3f" % score
nnz = clf.coef_.nonzero()[0].shape[0]
print "non-zero coef: %d" % nnz
print
if print_report:
print "classification report:"
print metrics.classification_report(y_test, pred,
class_names=categories)
if print_cm:
print "confusion matrix:"
print metrics.confusion_matrix(y_test, pred)
print
return score, train_time, test_time
for penalty in ["l2", "l1"]:
print 80*'='
print "%s penalty" % penalty.upper()
# Train Liblinear model
liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty, C=1000,
dual=False, eps=1e-3))
# Train SGD model
sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
penalty=penalty))
# Train SGD with Elastic Net penalty
print 80*'='
print "Elastic-Net penalty"
sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
penalty="elasticnet"))