# Natural Language Toolkit: Interface to scikit-learn classifiers # # Author: Lars Buitinck # URL: # For license information, see LICENSE.TXT """ scikit-learn (http://scikit-learn.org) is a machine learning library for Python, supporting most of the basic classification algorithms, including SVMs, Naive Bayes, logistic regression and decision trees. This package implement a wrapper around scikit-learn classifiers. To use this wrapper, construct a scikit-learn classifier, then use that to construct a SklearnClassifier. E.g., to wrap a linear SVM classifier with default settings, do >>> from sklearn.svm.sparse import LinearSVC >>> from nltk.classify.scikitlearn import SklearnClassifier >>> classif = SklearnClassifier(LinearSVC()) The scikit-learn classifier may be arbitrarily complex. E.g., the following constructs and wraps a Naive Bayes estimator with tf-idf weighting and chi-square feature selection: >>> from sklearn.feature_extraction.text import TfidfTransformer >>> from sklearn.feature_selection import SelectKBest, chi2 >>> from sklearn.naive_bayes import MultinomialNB >>> from sklearn.pipeline import Pipeline >>> pipeline = Pipeline([('tfidf', TfidfTransformer()), ... ('chi2', SelectKBest(chi2, k=1000)), ... ('nb', MultinomialNB())]) >>> classif = SklearnClassifier(pipeline) (Such a classifier could be trained on word counts for text classification.) """ from nltk.classify.api import ClassifierI from nltk.probability import DictionaryProbDist from scipy.sparse import coo_matrix try: import numpy as np except ImportError: pass class SklearnClassifier(ClassifierI): """Wrapper for scikit-learn classifiers.""" def __init__(self, estimator, dtype=float, sparse=True): """ :param estimator: scikit-learn classifier object. :param dtype: data type used when building feature array. scikit-learn estimators work exclusively on numeric data; use bool when all features are binary. :param sparse: Whether to use sparse matrices. The estimator must support these; not all scikit-learn classifiers do. The default value is True, since most NLP problems involve sparse feature sets. :type sparse: boolean. """ self._clf = estimator self._dtype = dtype self._sparse = sparse def __repr__(self): return "" % self._clf def batch_classify(self, featuresets): X = self._convert(featuresets) y = self._clf.predict(X) return [self._index_label[int(yi)] for yi in y] def batch_prob_classify(self, featuresets): X = self._convert(featuresets) y_proba = self._clf.predict_proba(X) return [self._make_probdist(y_proba[i]) for i in xrange(len(y_proba))] def labels(self): return self._label_index.keys() def train(self, labeled_featuresets): """ Train (fit) the scikit-learn estimator. :param labeled_featuresets: A list of classified featuresets, i.e., a list of tuples ``(featureset, label)``. """ self._feature_index = {} self._index_label = [] self._label_index = {} for fs, label in labeled_featuresets: for f in fs.iterkeys(): if f not in self._feature_index: self._feature_index[f] = len(self._feature_index) if label not in self._label_index: self._index_label.append(label) self._label_index[label] = len(self._label_index) featuresets, labels = zip(*labeled_featuresets) X = self._convert(featuresets) y = np.array([self._label_index[l] for l in labels]) self._clf.fit(X, y) return self def _convert(self, featuresets): if self._sparse: return self._featuresets_to_coo(featuresets) else: return self._featuresets_to_array(featuresets) def _featuresets_to_coo(self, featuresets): """Convert featuresets to sparse matrix (COO format).""" i_ind = [] j_ind = [] values = [] for i, fs in enumerate(featuresets): for f, v in fs.iteritems(): try: j = self._feature_index[f] i_ind.append(i) j_ind.append(j) values.append(self._dtype(v)) except KeyError: pass shape = (i + 1, len(self._feature_index)) return coo_matrix((values, (i_ind, j_ind)), shape=shape, dtype=self._dtype) def _featuresets_to_array(self, featuresets): """Convert featureset to Numpy array.""" X = np.zeros((len(featuresets), len(self._feature_index)), dtype=self._dtype) for i, fs in enumerate(featuresets): for f, v in fs.iteritems(): try: X[i, self._feature_index[f]] = self._dtype(v) except KeyError: # feature not seen in training pass return X def _make_probdist(self, y_proba): return DictionaryProbDist(dict((self._index_label[i], p) for i, p in enumerate(y_proba))) if __name__ == "__main__": from nltk.classify.util import names_demo, binary_names_demo_features try: from sklearn.linear_model.sparse import LogisticRegression except ImportError: # separate sparse LR to be removed in 0.12 from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import BernoulliNB print("scikit-learn Naive Bayes:") names_demo(SklearnClassifier(BernoulliNB(binarize=False), dtype=bool).train, features=binary_names_demo_features) print("scikit-learn logistic regression:") names_demo(SklearnClassifier(LogisticRegression(), dtype=np.float64).train, features=binary_names_demo_features)