20 Newsgroups: Classifier Comparison#

Quellen#

5.6.2. The 20 newsgroups text dataset#

Einstieg:

Dort:

Examples using sklearn.datasets.fetch_20newsgroups:

Weitere Beispiele:

Classification of text documents using sparse features#

Reproduktion von https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py:

  • ohne USE_HASHING, ohne SELECT_CHI2

  • reduziert um ein paar wenige Classifier, die trotz hoher Trainingszeit keinen guten Score liefern

# Optional feature selection: either False, or an integer: the number of
# features to select
#SELECT_CHI2 = False
from sklearn.datasets import fetch_20newsgroups

# Newsgroups included in the comparison.
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    # "sci.space",
]

# Both subsets are fetched with identical settings; only `subset` differs.
# The `remove` argument (headers, footers, quotes) was added by JB.
data_train, data_test = (
    fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=("headers", "footers", "quotes"),
    )
    for subset in ("train", "test")
)

# order of labels in `target_names` can be different from `categories`
target_names = data_train.target_names
target_names
['alt.atheism', 'comp.graphics', 'talk.religion.misc']
def size_mb(docs):
    """Return the combined UTF-8 encoded size of ``docs`` in MB (1e6 bytes)."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode("utf-8"))
    return total_bytes / 1e6


# Size of each corpus in MB of UTF-8 text; reused later for throughput figures.
data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

train_summary = "%d documents - %0.3fMB (training set)" % (
    len(data_train.data),
    data_train_size_mb,
)
print(train_summary)

test_summary = "%d documents - %0.3fMB (test set)" % (
    len(data_test.data),
    data_test_size_mb,
)
print(test_summary)

print("%d categories:" % len(target_names), target_names)
1441 documents - 1.655MB (training set)
959 documents - 1.376MB (test set)
3 categories: ['alt.atheism', 'comp.graphics', 'talk.religion.misc']
# Notebook display: the first three raw training documents.
data_train.data[:3]
['Those things,\n\twhich ye have both learned, and received,\n\tand heard, and seen in me,\n\tdo:\n\tand the God of peace shall be with you.',
 'Greetings all.\n\tAccording to a FAQ I read, on 30 July 1992, Joshua C. Jensen posted an \narticle on bitmap manipulation (specifically, scaling and perspective) to the \nnewsgroup rec.games.programmer. (article 7716)\n\tThe article included source code in Turbo Pascal with inline assembly \nlanguage.\n\n\tI have been unable to find an archive for this newsgroup, or a current \nemail address for Joshua C. Jensen.\n\tIf anyone has the above details, or a copy of the code, could they \nplease let me know.\tMany thanks.\n\t\t\t\t\tYours gratefully, etc.  Myles.\n',
 '\nAs many people have mentioned, there is no reason why insurers could not\noffer a contract without abortion services for a different premium.\nThe problem is that there is no guarantee that this premium would be\nlower for those who chose this type of contract.  Although you are\nremoving one service, that may have feedbacks into other types of covered\ncare which results in a net increase in actuarial costs.\n\nFor an illustrative example in the opposite direction, it may be possible\nto ADD services to an insurance contract and REDUCE the premium.  If you\nadd preventative services and this reduces acute care use, then the total\npremium may fall.']
# Notebook display: the distinct label values present in the training targets.
set(data_train.target)
{0, 1, 2}
# Label arrays for the training and test documents.
y_train, y_test = data_train.target, data_test.target
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer

# Start the clock before the vectorizer is built so that construction,
# fitting and transformation are all included in the measured duration.
t0 = time()

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
X_train = vectorizer.fit_transform(data_train.data)

duration = time() - t0
# Throughput is the UTF-8 size of the training corpus divided by the duration.
print("vectorizer.fit_transform() done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
vectorizer.fit_transform() done in 0.137556s at 12.031MB/s
n_samples: 1441, n_features: 20052
# Time the transformation of the test documents. The vectorizer fitted on the
# training data is reused (transform only, no refit).
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
done in 0.070050s at 19.638MB/s
n_samples: 959, n_features: 20052
# Feature names from the fitted vectorizer; benchmark() indexes this array
# with coefficient positions to print the top keywords per class.
feature_names = vectorizer.get_feature_names_out()
import numpy as np
from sklearn import metrics
from sklearn.utils.extmath import density


def trim(s):
    """Shorten ``s`` to fit an 80-column terminal, ending cut text with "..."."""
    max_width = 80
    if len(s) > max_width:
        # Keep room for the three-character ellipsis.
        return s[: max_width - 3] + "..."
    return s
def benchmark(clf):
    """Fit ``clf``, evaluate it on the test split and print a report.

    The function reads the module-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` data as well as ``feature_names`` and
    ``target_names``. It prints timings, accuracy, a classification report
    and a confusion matrix. For estimators that expose ``coef_`` it also
    prints the coefficient dimensionality, density and the ten
    highest-weighted features per class.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``"("``, ``score`` is the
        test accuracy and the two timings are wall-clock seconds.
    """
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, "coef_"):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if feature_names is not None:
            class_weights = clf.coef_
            if class_weights.shape[0] == 1 and len(target_names) == 2:
                # With two classes, scikit-learn linear models store a single
                # coefficient row that scores the second class. Its negation
                # ranks features for the first class. Without this, indexing
                # one row per target name would raise IndexError.
                class_weights = np.vstack((-class_weights, class_weights))

            print("top 10 keywords per class:")
            for label, weights in zip(target_names, class_weights):
                # argsort is ascending, so the last ten are the largest weights.
                top10 = np.argsort(weights)[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    # Short label for the results plot: class name without its parameters.
    clf_descr = str(clf).split("(")[0]
    return clf_descr, score, train_time, test_time
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier


# Each benchmark() call contributes one (name, score, train time, test time)
# row; the rows are plotted in the order they are appended here.
results = []

for name, clf in [
    # Alternative solver for the ridge classifier: solver="sag"
    ("Ridge Classifier", RidgeClassifier(tol=1e-2, solver="auto")),
    ("Perceptron", Perceptron(max_iter=50)),
    ("Passive-Aggressive", PassiveAggressiveClassifier(max_iter=50)),
    # Disabled: ("kNN", KNeighborsClassifier(n_neighbors=10)),
    # Disabled: ("Random forest", RandomForestClassifier()),
]:
    print("=" * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ("l2", "l1"):
    print("=" * 80)
    print("%s penalty" % penalty.upper())
    # Liblinear model first, then the SGD model with the same penalty.
    results.extend(
        benchmark(model)
        for model in (
            LinearSVC(penalty=penalty, dual=False, tol=1e-3),
            SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty),
        )
    )

# SGD with Elastic Net penalty
print("=" * 80)
print("Elastic-Net penalty")
elastic_net_sgd = SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet")
results.append(benchmark(elastic_net_sgd))

# NearestCentroid without threshold
print("=" * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Sparse Naive Bayes classifiers (disabled)
# print("=" * 80)
# print("Naive Bayes")
# results.append(benchmark(MultinomialNB(alpha=0.01)))
# results.append(benchmark(BernoulliNB(alpha=0.01)))
# results.append(benchmark(ComplementNB(alpha=0.1)))

print("=" * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
l1_selector = SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))
svc_on_selected_features = Pipeline(
    [
        ("feature_selection", l1_selector),
        ("classification", LinearSVC(penalty="l2")),
    ]
)
results.append(benchmark(svc_on_selected_features))
================================================================================
Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(tol=0.01)
train time: 0.013s
test time:  0.000s
accuracy:   0.779
dimensionality: 20052
density: 1.000000
top 10 keywords per class:
alt.atheism: motto islamic isn atheist deletion bobby religion islam atheists...
comp.graphics: files images 3d image computer looking file hi thanks graphics
talk.religion.misc: koresh blood rosicrucian order christ jesus children fbi ...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.73      0.68      0.70       319
     comp.graphics       0.86      0.97      0.91       389
talk.religion.misc       0.68      0.61      0.64       251

          accuracy                           0.78       959
         macro avg       0.76      0.75      0.75       959
      weighted avg       0.77      0.78      0.77       959

confusion matrix:
[[216  32  71]
 [  9 377   3]
 [ 70  27 154]]

================================================================================
Perceptron
________________________________________________________________________________
Training: 
Perceptron(max_iter=50)
train time: 0.006s
test time:  0.000s
accuracy:   0.765
dimensionality: 20052
density: 0.347114
top 10 keywords per class:
alt.atheism: imaginative motto enlightening keith atheist atheism position na...
comp.graphics: file 42 algorithm hello ftp code thanks keywords ditto graphics
talk.religion.misc: dead moment cult christians critus lunacy cockroaches chr...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.71      0.66      0.68       319
     comp.graphics       0.88      0.93      0.90       389
talk.religion.misc       0.64      0.65      0.64       251

          accuracy                           0.77       959
         macro avg       0.74      0.74      0.74       959
      weighted avg       0.76      0.77      0.76       959

confusion matrix:
[[209  31  79]
 [ 15 363  11]
 [ 69  20 162]]

================================================================================
Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(max_iter=50)
train time: 0.008s
test time:  0.000s
accuracy:   0.775
dimensionality: 20052
density: 0.722388
top 10 keywords per class:
alt.atheism: nanci religion motto claim deletion islam atheist bobby atheists...
comp.graphics: looking 3d computer file files hi thanks software image graphics
talk.religion.misc: cult quote wrong rosicrucian children order christ fbi ch...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.72      0.68      0.70       319
     comp.graphics       0.88      0.95      0.91       389
talk.religion.misc       0.66      0.62      0.64       251

          accuracy                           0.77       959
         macro avg       0.75      0.75      0.75       959
      weighted avg       0.77      0.77      0.77       959

confusion matrix:
[[216  27  76]
 [ 12 371   6]
 [ 70  25 156]]

================================================================================
L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(dual=False, tol=0.001)
train time: 0.019s
test time:  0.000s
accuracy:   0.777
dimensionality: 20052
density: 1.000000
top 10 keywords per class:
alt.atheism: isn perfect islamic deletion atheist religion bobby islam atheis...
comp.graphics: files images computer 3d looking hi file image thanks graphics
talk.religion.misc: cult jesus blood order rosicrucian christ children fbi ch...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.73      0.67      0.70       319
     comp.graphics       0.87      0.97      0.91       389
talk.religion.misc       0.67      0.62      0.64       251

          accuracy                           0.78       959
         macro avg       0.75      0.75      0.75       959
      weighted avg       0.77      0.78      0.77       959

confusion matrix:
[[214  32  73]
 [  9 376   4]
 [ 71  25 155]]

________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50)
train time: 0.006s
test time:  0.000s
accuracy:   0.778
dimensionality: 20052
density: 0.607770
top 10 keywords per class:
alt.atheism: coutesy broken atheist loans nanci keith religion deletion athei...
comp.graphics: images ditto software looking ftp hi file thanks image graphics
talk.religion.misc: creation children quote christ fbi cult rosicrucian wrong...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.75      0.65      0.70       319
     comp.graphics       0.87      0.96      0.91       389
talk.religion.misc       0.66      0.66      0.66       251

          accuracy                           0.78       959
         macro avg       0.76      0.76      0.75       959
      weighted avg       0.77      0.78      0.77       959

confusion matrix:
[[207  33  79]
 [  8 374   7]
 [ 61  25 165]]

================================================================================
L1 penalty
________________________________________________________________________________
Training: 
LinearSVC(dual=False, penalty='l1', tol=0.001)
train time: 0.030s
test time:  0.000s
accuracy:   0.754
dimensionality: 20052
density: 0.016025
top 10 keywords per class:
alt.atheism: motto policy sea religion atheist islam atheists atheism bobby risk
comp.graphics: file 3d ftp images image computer 68070 software hi graphics
talk.religion.misc: blood rosicrucian 666 christian christians cult thou chil...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.69      0.66      0.67       319
     comp.graphics       0.85      0.95      0.90       389
talk.religion.misc       0.65      0.58      0.61       251

          accuracy                           0.75       959
         macro avg       0.73      0.73      0.73       959
      weighted avg       0.75      0.75      0.75       959

confusion matrix:
[[209  37  73]
 [ 14 369   6]
 [ 79  27 145]]

________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50, penalty='l1')
train time: 0.025s
test time:  0.000s
accuracy:   0.767
dimensionality: 20052
density: 0.060094
top 10 keywords per class:
alt.atheism: loans idea satan atheist alternative islam risk atheists bobby a...
comp.graphics: week pc 3d package software looking image computer hi graphics
talk.religion.misc: lunacy critus fake cult rosicrucian order christian fbi c...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.74      0.66      0.69       319
     comp.graphics       0.85      0.96      0.90       389
talk.religion.misc       0.65      0.61      0.63       251

          accuracy                           0.77       959
         macro avg       0.75      0.74      0.74       959
      weighted avg       0.76      0.77      0.76       959

confusion matrix:
[[209  35  75]
 [  8 374   7]
 [ 67  31 153]]

================================================================================
Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50, penalty='elasticnet')
train time: 0.032s
test time:  0.000s
accuracy:   0.767
dimensionality: 20052
density: 0.286405
top 10 keywords per class:
alt.atheism: keith bobby risk atheist enlightening religion islam atheists na...
comp.graphics: 3d ftp looking computer image thanks software hi file graphics
talk.religion.misc: quote wrong blood rosicrucian order christ fbi cult chris...

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.73      0.63      0.68       319
     comp.graphics       0.86      0.95      0.91       389
talk.religion.misc       0.64      0.65      0.65       251

          accuracy                           0.77       959
         macro avg       0.75      0.75      0.74       959
      weighted avg       0.76      0.77      0.76       959

confusion matrix:
[[202  32  85]
 [ 12 370   7]
 [ 61  26 164]]

================================================================================
NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid()
train time: 0.003s
test time:  0.001s
accuracy:   0.764
classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.71      0.63      0.66       319
     comp.graphics       0.90      0.93      0.91       389
talk.religion.misc       0.63      0.69      0.66       251

          accuracy                           0.76       959
         macro avg       0.74      0.75      0.74       959
      weighted avg       0.76      0.76      0.76       959

confusion matrix:
[[200  23  96]
 [ 22 360   7]
 [ 61  17 173]]

================================================================================
LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, penalty='l1',
                                                     tol=0.001))),
                ('classification', LinearSVC())])
train time: 0.038s
test time:  0.001s
accuracy:   0.740
classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.69      0.61      0.65       319
     comp.graphics       0.84      0.95      0.89       389
talk.religion.misc       0.62      0.58      0.60       251

          accuracy                           0.74       959
         macro avg       0.72      0.71      0.71       959
      weighted avg       0.73      0.74      0.73       959

confusion matrix:
[[195  42  82]
 [ 13 369   7]
 [ 75  30 146]]
import matplotlib.pyplot as plt

# One bar group per benchmarked classifier, in the order they were run.
indices = np.arange(len(results))

# Split the (name, score, train time, test time) rows into four columns.
# `results` itself is left untouched, so this cell can be re-run without
# transposing already-transposed data.
clf_names, score, training_time, test_time = (
    [row[col] for row in results] for col in range(4)
)

# Express timings relative to the slowest classifier. `or 1.0` avoids a
# division by zero when every measured time is exactly 0.0.
training_time = np.array(training_time) / (np.max(training_time) or 1.0)
test_time = np.array(test_time) / (np.max(test_time) or 1.0)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, 0.2, label="score", color="navy")
plt.barh(indices + 0.3, training_time, 0.2, label="training time", color="c")
plt.barh(indices + 0.6, test_time, 0.2, label="test time", color="darkorange")
plt.yticks(())
plt.legend(loc="best")
plt.subplots_adjust(left=0.25, top=0.95, bottom=0.05)

# The y ticks are hidden; write each classifier name left of its bar group.
for i, c in zip(indices, clf_names):
    plt.text(-0.3, i, c)

plt.show()
_images/20ng-classifyer-comparison_16_0.png