# Natural Language Toolkit: SVM-based classifier # # Copyright (C) 2001-2012 NLTK Project # Author: Leon Derczynski # # URL: # For license information, see LICENSE.TXT """ A classifier based on a support vector machine. This code uses Thorsten Joachims' SVM^light implementation (http://svmlight.joachims.org/), wrapped using PySVMLight (https://bitbucket.org/wcauchois/pysvmlight). The default settings are to train a linear classification kernel, though through minor modification, full SVMlight capabilities should be accessible if needed. Only binary classification is possible at present. """ from nltk.probability import DictionaryProbDist from nltk.classify.api import ClassifierI # # Interface to Support Vector Machine # try: import svmlight except: raise LookupError("\n\n===========================================================================\n NLTK was unable to import SVMlight!\n\n For more information, see \n===========================================================================") # create a boolean feature name for the SVM from a feature/value pair, # that'll take on a 1.0 value if the original feature:value is asserted. def featurename(feature, value): """ :param feature: a string denoting a feature name :param value: the value of the feature """ return '|'.join([feature, type(value).__name__, str(value)]) # convert a set of NLTK classifier features to SVMlight format def map_features_to_svm(features, svmfeatureindex): """ :param features: a dict of features in the format {'feature':value} :param svmfeatureindex: a mapping from feature:value pairs to integer SVMlight feature labels """ instancefeatures = [] # svmlight supports sparse feature sets and so we simply omit features that we don't include for k,v in features.iteritems(): # each feature is represented as an (int, float) tuple where the int is the SVMlight feature label and the float is the value; as we either have or have not a feature, this is 1.0 # this does not support scalar features - rather, each value that a feature may take on is a discrete independent label # use 1.0 as the feature value to specify the presence of a feature:value couple svmfeaturename = featurename(k, v) if svmfeaturename not in svmfeatureindex: # skip over feature:value pairs that were not in the training data and so not included in our mappings continue instancefeatures.append( (svmfeatureindex[svmfeaturename], 1.0) ) return instancefeatures # convert a whole instance (including label) from NLTK to SVMlight format def map_instance_to_svm(instance, labelmapping, svmfeatureindex): """ :param instance: an NLTK format instance, which is in the tuple format (dict(), label), where the dict contains feature:value pairs, and the label signifies the target attribute's value for this instance (e.g. its class) :param labelmapping: a previously-defined dict mapping from text labels in the NLTK instance format to SVMlight labels of either +1 or -1 @svmfeatureindex: a mapping from feature:value pairs to integer SVMlight feature labels """ (features, label) = instance instancefeatures = map_features_to_svm(features, svmfeatureindex) return (labelmapping[label], instancefeatures) class SvmClassifier(ClassifierI): """ A Support Vector Machine classifier. To explain briefly, support vector machines (SVM) treat each feature as a dimension, and position features in n-dimensional feature space. An optimal hyperplane is then determined that best divides feature space into classes, and future instances classified based on which side of the hyperplane they lie on, and their proximity to it. This implementation is for a binary SVM - that is, only two classes are supported. You may achieve perform classification with more classes by training an SVM per class and then picking a best option for new instances given results from each binary class-SVM. """ def __init__(self, labels, labelmapping, svmfeatures, model=None): """ :param labels: a list of text labels for classes :param labelmapping: a mapping from labels to SVM classes (-1,+1) :param svmfeatures: a list of svm features, where the index is the integer feature number and the value an feature/value pair :param model: the model generated by svmlight.learn() """ self._labels = labels self._model = model self._labelmapping = labelmapping self._svmfeatures = svmfeatures # _svmfeatureindex is the inverse of svmfeatures, allowing us # to find an SVM feature name (int) given a feature/value self._svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures)))) self._verbose = False def labels(self): """ Return the list of class labels. """ return self._labels def svm_label_name(self, label): """ searches values of _labelmapping to resolve +1 or -1 to a string :param label: the string label to look up """ labelname = [k for k, v in self._labelmapping.iteritems() if v == label][0] return labelname def resolve_prediction(self, prediction): """ resolve a float (in this case, probably from svmlight.learn().classify()) to either -1 or +1, and then look up the label for that class in _labelmapping, and return the text label :param prediction: a signed float describing classifier confidence """ classification = cmp(prediction, 0) return self.svm_label_name(classification) def _get_svm_classification(self, featureset): """ given a set of features, classify them with our trained model and return a signed float :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance """ instance_to_classify = (0, map_features_to_svm(featureset, self._svmfeatureindex)) if self._verbose: print 'instance', instance_to_classify # svmlight.classify expects a list; this should be taken advantage of when writing SvmClassifier.batch_classify / .batch_prob_classify. # it returns a list of floats, too. [prediction] = svmlight.classify(self._model, [instance_to_classify]) return prediction def prob_classify(self, featureset): """ Return a probability distribution of classifications :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance """ if self._model is None: raise Exception('This classifier is not yet trained') return None # do the classification prediction = self._get_svm_classification(featureset) if self._verbose: print 'prediction', prediction # lump it into a boolean class, -1 or +1 predicted_label = cmp(prediction, 0) # sometimes the result is not within -1 ... +1; clip it so # that it is, and we get a sane-looking probability # distribution. this will upset some results with non-linear # partitioning where instance-hyperplane distance can be many # orders of magnitude larger; I don't have a fix for that if prediction < -1.0: prediction = -1.0 if prediction > 1.0: prediction = 1.0 # if the prediction is negative, then we will maximise the # value of the -1 class; otherwise, that of the 1 class will # be greater. if predicted_label == 1: distribution = {str(self.resolve_prediction(1)): prediction, str(self.resolve_prediction(-1)): 1 - prediction} else: distribution = {str(self.resolve_prediction(1)): prediction + 1, str(self.resolve_prediction(-1)): -prediction} return DictionaryProbDist(distribution) def classify(self, featureset): """ Use a trained SVM to predict a label given for an unlabelled instance :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance """ prediction = self._get_svm_classification(featureset) if self._verbose: print 'prediction', prediction return self.resolve_prediction(prediction) @staticmethod def train(featuresets): """ given a set of training instances in nltk format: [ ( {feature:value, ..}, str(label) ) ] train a support vector machine :param featuresets: training instances """ # build a unique list of labels labels = set() for (features, label) in featuresets: labels.add(label) # this is a binary classifier only if len(labels) > 2: raise ValueError('Can only do boolean classification (labels: '+ str(labels) + ')') return False # we need ordering, so a set's no good labels = list(labels) # next, assign -1 and 1 labelmapping = {labels[0]:-1, labels[1]:1} # now for feature conversion # iter through instances, building a set of feature:type:str(value) triples svmfeatures = set() for (features, label) in featuresets: for k,v in features.iteritems(): svmfeatures.add(featurename(k, v)) # svmfeatures is indexable by integer svm feature number # svmfeatureindex is the inverse (svm feature name -> number) svmfeatures = list(svmfeatures) svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures)))) # build svm feature set case by case svmfeatureset = [] for instance in featuresets: svmfeatureset.append(map_instance_to_svm(instance, labelmapping, svmfeatureindex)) # train the svm # TODO: implement passing of SVMlight parameters from train() to learn() return SvmClassifier(labels, labelmapping, svmfeatures, svmlight.learn(svmfeatureset, type='classification')) def demo(): def gender_features(word): return {'last_letter': word[-1], 'penultimate_letter': word[-2]} from nltk.classify import accuracy from nltk.corpus import names import random names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) import random random.seed(60221023) random.shuffle(names) featuresets = [(gender_features(n), g) for (n,g) in names] train_set, test_set = featuresets[500:], featuresets[:500] print '--- nltk.classify.svm demo ---' print 'Number of training examples:', len(train_set) classifier = SvmClassifier.train(train_set) print 'Total SVM dimensions:', len(classifier._svmfeatureindex) print 'Label mapping:', classifier._labelmapping print '--- Processing an example instance ---' print 'Reference instance:', names[0] print 'NLTK-format features:\n ' + str(test_set[0]) print 'SVMlight-format features:\n ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex)) distr = classifier.prob_classify(test_set[0][0]) print 'Instance classification and confidence:', distr.max(), distr.prob(distr.max()) print '--- Measuring classifier performance ---' print 'Overall accuracy:', accuracy(classifier, test_set) if __name__ == '__main__': demo()