# Natural Language Toolkit: Maximum Entropy Classifiers # # Copyright (C) 2001-2012 NLTK Project # Author: Edward Loper # Dmitry Chichkov (TypedMaxentFeatureEncoding) # URL: # For license information, see LICENSE.TXT """ A classifier model based on maximum entropy modeling framework. This framework considers all of the probability distributions that are empirically consistent with the training data; and chooses the distribution with the highest entropy. A probability distribution is "empirically consistent" with a set of training data if its estimated frequency with which a class and a feature vector value co-occur is equal to the actual frequency in the data. Terminology: 'feature' ====================== The term *feature* is usually used to refer to some property of an unlabeled token. For example, when performing word sense disambiguation, we might define a ``'prevword'`` feature whose value is the word preceding the target word. However, in the context of maxent modeling, the term *feature* is typically used to refer to a property of a "labeled" token. In order to prevent confusion, we will introduce two distinct terms to disambiguate these two different concepts: - An "input-feature" is a property of an unlabeled token. - A "joint-feature" is a property of a labeled token. In the rest of the ``nltk.classify`` module, the term "features" is used to refer to what we will call "input-features" in this module. In literature that describes and discusses maximum entropy models, input-features are typically called "contexts", and joint-features are simply referred to as "features". Converting Input-Features to Joint-Features ------------------------------------------- In maximum entropy models, joint-features are required to have numeric values. Typically, each input-feature ``input_feat`` is mapped to a set of joint-features of the form: | joint_feat(token, label) = { 1 if input_feat(token) == feat_val | { and label == some_label | { | { 0 otherwise For all values of ``feat_val`` and ``some_label``. This mapping is performed by classes that implement the ``MaxentFeatureEncodingI`` interface. """ __docformat__ = 'epytext en' try: import numpy except ImportError: pass import time import tempfile import os import gzip from collections import defaultdict from nltk.util import OrderedDict from nltk.probability import DictionaryProbDist from nltk.classify.api import ClassifierI from nltk.classify.util import attested_labels, CutoffChecker, accuracy, log_likelihood from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights ###################################################################### #{ Classifier Model ###################################################################### class MaxentClassifier(ClassifierI): """ A maximum entropy classifier (also known as a "conditional exponential classifier"). This classifier is parameterized by a set of "weights", which are used to combine the joint-features that are generated from a featureset by an "encoding". In particular, the encoding maps each ``(featureset, label)`` pair to a vector. 
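    For example, with a purely illustrative encoding and weight vector
    (the joint-feature ids and values below are made up), encoding a
    single featureset/label pair might look like this::

        >>> encoding.encode({'initial_letter': 'k'}, 'female')  # doctest: +SKIP
        [(3, 1), (12, 1)]

    i.e. joint-features 3 and 12 are active (with value 1) for this pair,
    so only ``weights[3]`` and ``weights[12]`` contribute to its score.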
    The probability of each label is then computed using the following
    equation::

                                  dotprod(weights, encode(fs,label))
      prob(label|fs) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    """
    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights: The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        #self._logarithmic = False
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.

        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        """
        self._weights = new_weights
        assert (self._encoding.length() == len(new_weights))

    def weights(self):
        """
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)
            if self._logarithmic:
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total
            else:
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic,
                                  normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in the
        given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        """
        descr_width = 50
        TEMPLATE = '  %-'+str(descr_width-2)+'s%s%8.3f'

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print '  Feature'.ljust(descr_width)+''.join(
            '%8s' % str(l)[:7] for l in labels)
        print '  '+'-'*(descr_width-2+8*len(labels))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(key=lambda (fid, _): abs(self._weights[fid]),
                                reverse=True)
            for (f_id, f_val) in feature_vector:
                if self._logarithmic:
                    score = self._weights[f_id] * f_val
                else:
                    score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(' and label is ')[0] # hack
                descr += ' (%s)' % f_val                 # hack
                if len(descr) > 47:
                    descr = descr[:44]+'...'
                print TEMPLATE % (descr, i*8*' ', score)
                sums[label] += score
        print '  '+'-'*(descr_width-1+8*len(labels))
        print '  TOTAL:'.ljust(descr_width)+''.join(
            '%8.3f' % sums[l] for l in labels)
        print '  PROBS:'.ljust(descr_width)+''.join(
            '%8.3f' % pdist.prob(l) for l in labels)

    def show_most_informative_features(self, n=10, show='all'):
        """
        :param show: all, neg, or pos (for negative-only or positive-only)
        """
        fids = sorted(range(len(self._weights)),
                      key=lambda fid: abs(self._weights[fid]),
                      reverse=True)
        if show == 'pos':
            fids = [fid for fid in fids if self._weights[fid] > 0]
        elif show == 'neg':
            fids = [fid for fid in fids if self._weights[fid] < 0]
        for fid in fids[:n]:
            print '%8.3f %s' % (self._weights[fid],
                                self._encoding.describe(fid))

    def __repr__(self):
        return ('<ConditionalExponentialClassifier: %d labels, %d features>' %
                (len(self._encoding.labels()), self._encoding.length()))

    #: A list of the algorithm names that are accepted for the
    #: ``train()`` method's ``algorithm`` parameter.
    ALGORITHMS = ['GIS', 'IIS', 'CG', 'BFGS', 'Powell', 'LBFGSB',
                  'Nelder-Mead', 'MEGAM', 'TADM']

    @classmethod
    def train(cls, train_toks, algorithm=None, trace=3, encoding=None,
              labels=None, sparse=True, gaussian_prior_sigma=0, **cutoffs):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling
              (``'GIS'``), Improved Iterative Scaling (``'IIS'``)
            - Optimization Methods (requiring scipy): conjugate gradient
              (``'CG'``), the Broyden-Fletcher-Goldfarb-Shanno algorithm
              (``'BFGS'``), Powell's algorithm (``'Powell'``), a
              limited-memory variant of BFGS (``'LBFGSB'``), and the
              Nelder-Mead algorithm (``'Nelder-Mead'``)
            - External Libraries (requiring megam): the LM-BFGS
              algorithm, with training performed by megam (``'megam'``)

            The default algorithm is ``'CG'`` if scipy is installed,
            and ``'IIS'`` otherwise.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.

        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given,
            then the set of all labels attested in the training data
            will be used instead.

        :param sparse: If True, then use sparse matrices instead of
            dense matrices.  Currently, this is only supported by the
            scipy (optimization method) algorithms.  For other
            algorithms, its value is ignored.

        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            the scipy (optimization method) algorithms and ``megam``.
            For other algorithms, its value is ignored.

        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)
- ``max_iter=v``: Terminate after ``v`` iterations. - ``min_ll=v``: Terminate after the negative average log-likelihood drops under ``v``. - ``min_lldelta=v``: Terminate if a single iteration improves log likelihood by less than ``v``. - ``tolerance=v``: Terminate a scipy optimization method when improvement drops below a tolerance level ``v``. The exact meaning of this tolerance depends on the scipy algorithm used. See ``scipy`` documentation for more info. Default values: 1e-3 for CG, 1e-5 for LBFGSB, and 1e-4 for other algorithms. (``scipy`` only) """ if algorithm is None: try: import scipy algorithm = 'cg' except ImportError: algorithm = 'iis' for key in cutoffs: if key not in ('max_iter', 'min_ll', 'min_lldelta', 'tolerance', 'max_acc', 'min_accdelta', 'count_cutoff', 'norm', 'explicit', 'bernoulli'): raise TypeError('Unexpected keyword arg %r' % key) algorithm = algorithm.lower() if algorithm == 'iis': return train_maxent_classifier_with_iis( train_toks, trace, encoding, labels, **cutoffs) elif algorithm == 'gis': return train_maxent_classifier_with_gis( train_toks, trace, encoding, labels, **cutoffs) elif algorithm in cls._SCIPY_ALGS: return train_maxent_classifier_with_scipy( train_toks, trace, encoding, labels, cls._SCIPY_ALGS[algorithm], sparse, gaussian_prior_sigma, **cutoffs) elif algorithm == 'megam': return train_maxent_classifier_with_megam( train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs) elif algorithm == 'tadm': kwargs = cutoffs kwargs['trace'] = trace kwargs['encoding'] = encoding kwargs['labels'] = labels kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma return TadmMaxentClassifier.train(train_toks, **kwargs) else: raise ValueError('Unknown algorithm %s' % algorithm) _SCIPY_ALGS = {'cg':'CG', 'bfgs':'BFGS', 'powell':'Powell', 'lbfgsb':'LBFGSB', 'nelder-mead':'Nelder-Mead'} #: Alias for MaxentClassifier. ConditionalExponentialClassifier = MaxentClassifier ###################################################################### #{ Feature Encodings ###################################################################### class MaxentFeatureEncodingI(object): """ A mapping that converts a set of input-feature values to a vector of joint-feature values, given a label. This conversion is necessary to translate featuresets into a format that can be used by maximum entropy models. The set of joint-features used by a given encoding is fixed, and each index in the generated joint-feature vectors corresponds to a single joint-feature. The length of the generated joint-feature vectors is therefore constant (for a given encoding). Because the joint-feature vectors generated by ``MaxentFeatureEncodingI`` are typically very sparse, they are represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature. Feature encodings are generally created using the ``train()`` method, which generates an appropriate encoding based on the input-feature values and labels that are present in a given corpus. """ def encode(self, featureset, label): """ Given a (featureset, label) pair, return the corresponding vector of joint-feature values. This vector is represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature. :type featureset: dict :rtype: list(tuple(int, int)) """ raise NotImplementedError() def length(self): """ :return: The size of the fixed-length joint-feature vectors that are generated by this encoding. 
:rtype: int """ raise NotImplementedError() def labels(self): """ :return: A list of the \"known labels\" -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. :rtype: list """ raise NotImplementedError() def describe(self, fid): """ :return: A string describing the value of the joint-feature whose index in the generated feature vectors is ``fid``. :rtype: str """ raise NotImplementedError() def train(cls, train_toks): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. """ raise NotImplementedError() class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that calls a user-supplied function to map a given featureset/label pair to a sparse joint-feature vector. """ def __init__(self, func, length, labels): """ Construct a new feature encoding based on the given function. :type func: (callable) :param func: A function that takes two arguments, a featureset and a label, and returns the sparse joint feature vector that encodes them: >>> func(featureset, label) -> feature_vector This sparse joint feature vector (``feature_vector``) is a list of ``(index,value)`` tuples. :type length: int :param length: The size of the fixed-length joint-feature vectors that are generated by this encoding. :type labels: list :param labels: A list of the \"known labels\" for this encoding -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. """ self._length = length self._func = func self._labels = labels def encode(self, featureset, label): return self._func(featureset, label) def length(self): return self._length def labels(self): return self._labels def describe(self, fid): return 'no description available' class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that generates vectors containing a binary joint-features of the form: | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) | { | { 0 otherwise Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label. Typically, these features are constructed based on a training corpus, using the ``train()`` method. This method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus. The ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form: | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) | { and l == label | { | { 0 otherwise Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``. The ``alwayson_features`` parameter can be used to add "always-on features", which have the form:: | joint_feat(fs, l) = { 1 if (l == label) | { | { 0 otherwise These always-on features allow the maxent model to directly model the prior probabilities of each label. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): """ :param labels: A list of the \"known labels\" for this encoding. 
:param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode(..., fname:fval, ..., label)[id]`` is 1; otherwise, it is 0. :param unseen_features: If true, then include unseen value features in the generated joint-feature vectors. :param alwayson_features: If true, then include always-on features in the generated joint-feature vectors. """ if set(mapping.values()) != set(range(len(mapping))): raise ValueError('Mapping values must be exactly the ' 'set of integers from 0...len(mapping)') self._labels = list(labels) """A list of attested labels.""" self._mapping = mapping """dict mapping from (fname,fval,label) -> fid""" self._length = len(mapping) """The length of generated joint feature vectors.""" self._alwayson = None """dict mapping from label -> fid""" self._unseen = None """dict mapping from fname -> fid""" if alwayson_features: self._alwayson = dict([(label,i+self._length) for (i,label) in enumerate(labels)]) self._length += len(self._alwayson) if unseen_features: fnames = set(fname for (fname, fval, label) in mapping) self._unseen = dict([(fname, i+self._length) for (i, fname) in enumerate(fnames)]) self._length += len(fnames) def encode(self, featureset, label): # Inherit docs. encoding = [] # Convert input-features to joint-features: for fname, fval in featureset.items(): # Known feature name & value: if (fname, fval, label) in self._mapping: encoding.append((self._mapping[fname, fval, label], 1)) # Otherwise, we might want to fire an "unseen-value feature". elif self._unseen: # Have we seen this fname/fval combination with any label? for label2 in self._labels: if (fname, fval, label2) in self._mapping: break # we've seen this fname/fval combo # We haven't -- fire the unseen-value feature else: if fname in self._unseen: encoding.append((self._unseen[fname], 1)) # Add always-on features: if self._alwayson and label in self._alwayson: encoding.append((self._alwayson[label], 1)) return encoding def describe(self, f_id): # Inherit docs. if not isinstance(f_id, (int, long)): raise TypeError('describe() expected an int') try: self._inv_mapping except AttributeError: self._inv_mapping = [-1]*len(self._mapping) for (info, i) in self._mapping.items(): self._inv_mapping[i] = info if f_id < len(self._mapping): (fname, fval, label) = self._inv_mapping[f_id] return '%s==%r and label is %r' % (fname, fval, label) elif self._alwayson and f_id in self._alwayson.values(): for (label, f_id2) in self._alwayson.items(): if f_id==f_id2: return 'label is %r' % label elif self._unseen and f_id in self._unseen.values(): for (fname, f_id2) in self._unseen.items(): if f_id==f_id2: return '%s is unseen' % fname else: raise ValueError('Bad feature id') def labels(self): # Inherit docs. return self._labels def length(self): # Inherit docs. return self._length @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``BinaryMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. 
:type count_cutoff: int :param count_cutoff: A cutoff value that is used to discard rare joint-features. If a joint-feature's value is 1 fewer than ``count_cutoff`` times in the training corpus, then that joint-feature is not included in the generated encoding. :type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used. :param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """ mapping = {} # maps (fname, fval, label) -> fid seen_labels = set() # The set of labels we've encountered count = defaultdict(int) # maps (fname, fval) -> count for (tok, label) in train_toks: if labels and label not in labels: raise ValueError('Unexpected label %s' % label) seen_labels.add(label) # Record each of the features. for (fname, fval) in tok.items(): # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff. count[fname,fval] += 1 if count[fname,fval] >= count_cutoff: if (fname, fval, label) not in mapping: mapping[fname, fval, label] = len(mapping) if labels is None: labels = seen_labels return cls(labels, mapping, **options) class GISEncoding(BinaryMaxentFeatureEncoding): """ A binary feature encoding which adds one new joint-feature to the joint-features defined by ``BinaryMaxentFeatureEncoding``: a correction feature, whose value is chosen to ensure that the sparse vector always sums to a constant non-negative number. This new feature is used to ensure two preconditions for the GIS training algorithm: - At least one feature vector index must be nonzero for every token. - The feature vector must sum to a constant non-negative number for every token. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False, C=None): """ :param C: The correction constant. The value of the correction feature is based on this value. In particular, its value is ``C - sum([v for (f,v) in encoding])``. :seealso: ``BinaryMaxentFeatureEncoding.__init__`` """ BinaryMaxentFeatureEncoding.__init__( self, labels, mapping, unseen_features, alwayson_features) if C is None: C = len(set([fname for (fname,fval,label) in mapping]))+1 self._C = C @property def C(self): """The non-negative constant that all encoded feature vectors will sum to.""" return self._C def encode(self, featureset, label): # Get the basic encoding. encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label) base_length = BinaryMaxentFeatureEncoding.length(self) # Add a correction feature. 
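        # Illustrative example (values made up): if C is 5 and the basic
        # encoding for this (featureset, label) pair sums to 3, then the
        # correction feature is appended with value 5 - 3 = 2, so every
        # encoded vector sums to the same constant C.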
total = sum([v for (f,v) in encoding]) if total >= self._C: raise ValueError('Correction feature is not high enough!') encoding.append( (base_length, self._C-total) ) # Return the result return encoding def length(self): return BinaryMaxentFeatureEncoding.length(self) + 1 def describe(self, f_id): if f_id == BinaryMaxentFeatureEncoding.length(self): return 'Correction feature (%s)' % self._C else: return BinaryMaxentFeatureEncoding.describe(self, f_id) class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding): def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): self._mapping = OrderedDict(mapping) self._label_mapping = OrderedDict() BinaryMaxentFeatureEncoding.__init__(self, labels, self._mapping, unseen_features, alwayson_features) def encode(self, featureset, label): encoding = [] for feature, value in featureset.items(): if (feature, label) not in self._mapping: self._mapping[(feature, label)] = len(self._mapping) if value not in self._label_mapping: if not isinstance(value, int): self._label_mapping[value] = len(self._label_mapping) else: self._label_mapping[value] = value encoding.append((self._mapping[(feature, label)], self._label_mapping[value])) return encoding def labels(self): return self._labels def describe(self, fid): for (feature, label) in self._mapping: if self._mapping[(feature, label)] == fid: return (feature, label) def length(self): return len(self._mapping) @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): mapping = OrderedDict() if not labels: labels = [] # This gets read twice, so compute the values in case it's lazy. train_toks = list(train_toks) for (featureset, label) in train_toks: if label not in labels: labels.append(label) for (featureset, label) in train_toks: for label in labels: for feature in featureset: if (feature, label) not in mapping: mapping[(feature, label)] = len(mapping) return cls(labels, mapping, **options) class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that generates vectors containing integer, float and binary joint-features of the form: Binary (for string and boolean features): | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) | { | { 0 otherwise Value (for integer and float features): | joint_feat(fs, l) = { fval if (fs[fname] == type(fval)) | { and (l == label) | { | { not encoded otherwise Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label. Typically, these features are constructed based on a training corpus, using the ``train()`` method. For string and boolean features [type(fval) not in (int, float)] this method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus. For integer and float features [type(fval) in (int, float)] this method will create one feature for each combination of ``fname`` and ``label`` that occurs at least once in the training corpus. For binary features the ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form: | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) | { and l == label | { | { 0 otherwise Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``. 
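    For example (with purely illustrative joint-feature indexes), a mixed
    featureset might be encoded as follows::

        >>> encoding.encode({'age': 25, 'initial_letter': 'k'}, 'female')  # doctest: +SKIP
        [(2, 25), (5, 1)]

    Here the integer-valued ``age`` feature contributes its value (25) as
    the joint-feature value, while the string-valued ``initial_letter``
    feature contributes a binary 1.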
The ``alwayson_features`` parameter can be used to add "always-on features", which have the form: | joint_feat(fs, l) = { 1 if (l == label) | { | { 0 otherwise These always-on features allow the maxent model to directly model the prior probabilities of each label. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): """ :param labels: A list of the \"known labels\" for this encoding. :param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode({..., fname:fval, ...``, label)[id]} is 1; otherwise, it is 0. :param unseen_features: If true, then include unseen value features in the generated joint-feature vectors. :param alwayson_features: If true, then include always-on features in the generated joint-feature vectors. """ if set(mapping.values()) != set(range(len(mapping))): raise ValueError('Mapping values must be exactly the ' 'set of integers from 0...len(mapping)') self._labels = list(labels) """A list of attested labels.""" self._mapping = mapping """dict mapping from (fname,fval,label) -> fid""" self._length = len(mapping) """The length of generated joint feature vectors.""" self._alwayson = None """dict mapping from label -> fid""" self._unseen = None """dict mapping from fname -> fid""" if alwayson_features: self._alwayson = dict([(label,i+self._length) for (i,label) in enumerate(labels)]) self._length += len(self._alwayson) if unseen_features: fnames = set(fname for (fname, fval, label) in mapping) self._unseen = dict([(fname, i+self._length) for (i, fname) in enumerate(fnames)]) self._length += len(fnames) def encode(self, featureset, label): # Inherit docs. encoding = [] # Convert input-features to joint-features: for fname, fval in featureset.items(): if(type(fval) in (int, float)): # Known feature name & value: if (fname, type(fval), label) in self._mapping: encoding.append((self._mapping[fname, type(fval), label], fval)) else: # Known feature name & value: if (fname, fval, label) in self._mapping: encoding.append((self._mapping[fname, fval, label], 1)) # Otherwise, we might want to fire an "unseen-value feature". elif self._unseen: # Have we seen this fname/fval combination with any label? for label2 in self._labels: if (fname, fval, label2) in self._mapping: break # we've seen this fname/fval combo # We haven't -- fire the unseen-value feature else: if fname in self._unseen: encoding.append((self._unseen[fname], 1)) # Add always-on features: if self._alwayson and label in self._alwayson: encoding.append((self._alwayson[label], 1)) return encoding def describe(self, f_id): # Inherit docs. if not isinstance(f_id, (int, long)): raise TypeError('describe() expected an int') try: self._inv_mapping except AttributeError: self._inv_mapping = [-1]*len(self._mapping) for (info, i) in self._mapping.items(): self._inv_mapping[i] = info if f_id < len(self._mapping): (fname, fval, label) = self._inv_mapping[f_id] return '%s==%r and label is %r' % (fname, fval, label) elif self._alwayson and f_id in self._alwayson.values(): for (label, f_id2) in self._alwayson.items(): if f_id==f_id2: return 'label is %r' % label elif self._unseen and f_id in self._unseen.values(): for (fname, f_id2) in self._unseen.items(): if f_id==f_id2: return '%s is unseen' % fname else: raise ValueError('Bad feature id') def labels(self): # Inherit docs. return self._labels def length(self): # Inherit docs. 
return self._length @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``TypedMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding. Note: recognized feature values types are (int, float), over types are interpreted as regular binary features. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type count_cutoff: int :param count_cutoff: A cutoff value that is used to discard rare joint-features. If a joint-feature's value is 1 fewer than ``count_cutoff`` times in the training corpus, then that joint-feature is not included in the generated encoding. :type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used. :param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """ mapping = {} # maps (fname, fval, label) -> fid seen_labels = set() # The set of labels we've encountered count = defaultdict(int) # maps (fname, fval) -> count for (tok, label) in train_toks: if labels and label not in labels: raise ValueError('Unexpected label %s' % label) seen_labels.add(label) # Record each of the features. for (fname, fval) in tok.items(): if(type(fval) in (int, float)): fval = type(fval) # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff. count[fname,fval] += 1 if count[fname,fval] >= count_cutoff: if (fname, fval, label) not in mapping: mapping[fname, fval, label] = len(mapping) if labels is None: labels = seen_labels return cls(labels, mapping, **options) ###################################################################### #{ Classifier Trainer: Generalized Iterative Scaling ###################################################################### def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None, labels=None, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Generalized Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. """ cutoffs.setdefault('max_iter', 100) cutoffchecker = CutoffChecker(cutoffs) # Construct an encoding from the training data. if encoding is None: encoding = GISEncoding.train(train_toks, labels=labels) if not hasattr(encoding, 'C'): raise TypeError('The GIS algorithm requires an encoding that ' 'defines C (e.g., GISEncoding).') # Cinv is the inverse of the sum of each joint feature vector. # This controls the learning rate: higher Cinv (or lower C) gives # faster learning. Cinv = 1.0/encoding.C # Count how many times each feature occurs in the training data. empirical_fcount = calculate_empirical_fcount(train_toks, encoding) # Check for any features that are not attested in train_toks. unattested = set(numpy.nonzero(empirical_fcount==0)[0]) # Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature. 
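    # (These are logarithmic weights, so -infinity corresponds to a
    # multiplicative weight of 0: an unattested joint-feature can never
    # contribute probability mass to any label.)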
weights = numpy.zeros(len(empirical_fcount), 'd') for fid in unattested: weights[fid] = numpy.NINF classifier = ConditionalExponentialClassifier(encoding, weights) # Take the log of the empirical fcount. log_empirical_fcount = numpy.log2(empirical_fcount) del empirical_fcount # Old log-likelihood and accuracy; used to check if the change # in log-likelihood or accuracy is sufficient to indicate convergence. ll_old = None acc_old = None if trace > 0: print ' ==> Training (%d iterations)' % cutoffs['max_iter'] if trace > 2: print print ' Iteration Log Likelihood Accuracy' print ' ---------------------------------------' # Train the classifier. try: while True: if trace > 2: ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) acc = cutoffchecker.acc or accuracy(classifier, train_toks) iternum = cutoffchecker.iter print ' %9d %14.5f %9.3f' % (iternum, ll, acc) # Use the model to estimate the number of times each # feature should occur in the training data. estimated_fcount = calculate_estimated_fcount( classifier, train_toks, encoding) # Take the log of estimated fcount (avoid taking log(0).) for fid in unattested: estimated_fcount[fid] += 1 log_estimated_fcount = numpy.log2(estimated_fcount) del estimated_fcount # Update the classifier weights weights = classifier.weights() weights += (log_empirical_fcount - log_estimated_fcount) * Cinv classifier.set_weights(weights) # Check the log-likelihood & accuracy cutoffs. if cutoffchecker.check(classifier, train_toks): break except KeyboardInterrupt: print ' Training stopped: keyboard interrupt' except: raise if trace > 2: ll = log_likelihood(classifier, train_toks) acc = accuracy(classifier, train_toks) print ' Final %14.5f %9.3f' % (ll, acc) # Return the classifier. return classifier def calculate_empirical_fcount(train_toks, encoding): fcount = numpy.zeros(encoding.length(), 'd') for tok, label in train_toks: for (index, val) in encoding.encode(tok, label): fcount[index] += val return fcount def calculate_estimated_fcount(classifier, train_toks, encoding): fcount = numpy.zeros(encoding.length(), 'd') for tok, label in train_toks: pdist = classifier.prob_classify(tok) for label in pdist.samples(): prob = pdist.prob(label) for (fid, fval) in encoding.encode(tok, label): fcount[fid] += prob*fval return fcount ###################################################################### #{ Classifier Trainer: Improved Iterative Scaling ###################################################################### def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None, labels=None, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Improved Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. """ cutoffs.setdefault('max_iter', 100) cutoffchecker = CutoffChecker(cutoffs) # Construct an encoding from the training data. if encoding is None: encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) # Count how many times each feature occurs in the training data. empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) / len(train_toks)) # Find the nf map, and related variables nfarray and nfident. # nf is the sum of the features for a given labeled text. # nfmap compresses this sparse set of values to a dense list. # nfarray performs the reverse operation. 
nfident is # nfarray multiplied by an identity matrix. nfmap = calculate_nfmap(train_toks, encoding) nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd') nftranspose = numpy.reshape(nfarray, (len(nfarray), 1)) # Check for any features that are not attested in train_toks. unattested = set(numpy.nonzero(empirical_ffreq==0)[0]) # Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature. weights = numpy.zeros(len(empirical_ffreq), 'd') for fid in unattested: weights[fid] = numpy.NINF classifier = ConditionalExponentialClassifier(encoding, weights) if trace > 0: print ' ==> Training (%d iterations)' % cutoffs['max_iter'] if trace > 2: print print ' Iteration Log Likelihood Accuracy' print ' ---------------------------------------' # Old log-likelihood and accuracy; used to check if the change # in log-likelihood or accuracy is sufficient to indicate convergence. ll_old = None acc_old = None # Train the classifier. try: while True: if trace > 2: ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) acc = cutoffchecker.acc or accuracy(classifier, train_toks) iternum = cutoffchecker.iter print ' %9d %14.5f %9.3f' % (iternum, ll, acc) # Calculate the deltas for this iteration, using Newton's method. deltas = calculate_deltas( train_toks, classifier, unattested, empirical_ffreq, nfmap, nfarray, nftranspose, encoding) # Use the deltas to update our weights. weights = classifier.weights() weights += deltas classifier.set_weights(weights) # Check the log-likelihood & accuracy cutoffs. if cutoffchecker.check(classifier, train_toks): break except KeyboardInterrupt: print ' Training stopped: keyboard interrupt' except: raise if trace > 2: ll = log_likelihood(classifier, train_toks) acc = accuracy(classifier, train_toks) print ' Final %14.5f %9.3f' % (ll, acc) # Return the classifier. return classifier def calculate_nfmap(train_toks, encoding): """ Construct a map that can be used to compress ``nf`` (which is typically sparse). *nf(feature_vector)* is the sum of the feature values for *feature_vector*. This represents the number of features that are active for a given labeled text. This method finds all values of *nf(t)* that are attested for at least one token in the given list of training tokens; and constructs a dictionary mapping these attested values to a continuous range *0...N*. For example, if the only values of *nf()* that were attested were 3, 5, and 7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``. :return: A map that can be used to compress ``nf`` to a dense vector. :rtype: dict(int -> int) """ # Map from nf to indices. This allows us to use smaller arrays. nfset = set() for tok, _ in train_toks: for label in encoding.labels(): nfset.add(sum([val for (id,val) in encoding.encode(tok,label)])) return dict([(nf, i) for (i, nf) in enumerate(nfset)]) def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical, nfmap, nfarray, nftranspose, encoding): """ Calculate the update values for the classifier weights for this iteration of IIS. 
These update weights are the value of ``delta`` that solves the equation:: ffreq_empirical[i] = SUM[fs,l] (classifier.prob_classify(fs).prob(l) * feature_vector(fs,l)[i] * exp(delta[i] * nf(feature_vector(fs,l)))) Where: - *(fs,l)* is a (featureset, label) tuple from ``train_toks`` - *feature_vector(fs,l)* = ``encoding.encode(fs,l)`` - *nf(vector)* = ``sum([val for (id,val) in vector])`` This method uses Newton's method to solve this equation for *delta[i]*. In particular, it starts with a guess of ``delta[i]`` = 1; and iteratively updates ``delta`` with: | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i]) until convergence, where *sum1* and *sum2* are defined as: | sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta) | sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l))) | f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) . | feature_vector(fs,l)[i] . | exp(delta[i] . nf(feature_vector(fs,l)))) Note that *sum1* and *sum2* depend on ``delta``; so they need to be re-computed each iteration. The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are used to generate a dense encoding for *nf(ltext)*. This allows ``_deltas`` to calculate *sum1* and *sum2* using matrices, which yields a significant performance improvement. :param train_toks: The set of training tokens. :type train_toks: list(tuple(dict, str)) :param classifier: The current classifier. :type classifier: ClassifierI :param ffreq_empirical: An array containing the empirical frequency for each feature. The *i*\ th element of this array is the empirical frequency for feature *i*. :type ffreq_empirical: sequence of float :param unattested: An array that is 1 for features that are not attested in the training data; and 0 for features that are attested. In other words, ``unattested[i]==0`` iff ``ffreq_empirical[i]==0``. :type unattested: sequence of int :param nfmap: A map that can be used to compress ``nf`` to a dense vector. :type nfmap: dict(int -> int) :param nfarray: An array that can be used to uncompress ``nf`` from a dense vector. :type nfarray: array(float) :param nftranspose: The transpose of ``nfarray`` :type nftranspose: array(float) """ # These parameters control when we decide that we've # converged. It probably should be possible to set these # manually, via keyword arguments to train. NEWTON_CONVERGE = 1e-12 MAX_NEWTON = 300 deltas = numpy.ones(encoding.length(), 'd') # Precompute the A matrix: # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) # over all label,fs s.t. num_features[label,fs]=nf A = numpy.zeros((len(nfmap), encoding.length()), 'd') for tok, label in train_toks: dist = classifier.prob_classify(tok) for label in encoding.labels(): # Generate the feature vector feature_vector = encoding.encode(tok,label) # Find the number of active features nf = sum([val for (id, val) in feature_vector]) # Update the A matrix for (id, val) in feature_vector: A[nfmap[nf], id] += dist.prob(label) * val A /= len(train_toks) # Iteratively solve for delta. 
Use the following variables: # - nf_delta[x][y] = nfarray[x] * delta[y] # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # exp(delta[i]nf) # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # nf exp(delta[i]nf) for rangenum in range(MAX_NEWTON): nf_delta = numpy.outer(nfarray, deltas) exp_nf_delta = 2 ** nf_delta nf_exp_nf_delta = nftranspose * exp_nf_delta sum1 = numpy.sum(exp_nf_delta * A, axis=0) sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) # Avoid division by zero. for fid in unattested: sum2[fid] += 1 # Update the deltas. deltas -= (ffreq_empirical - sum1) / -sum2 # We can stop once we converge. n_error = (numpy.sum(abs((ffreq_empirical-sum1)))/ numpy.sum(abs(deltas))) if n_error < NEWTON_CONVERGE: return deltas return deltas ###################################################################### #{ Classifier Trainer: scipy algorithms (GC, LBFGSB, etc.) ###################################################################### # [xx] n.b.: it's possible to supply custom trace functions, which # could be used to make trace output consistent with iis/gis. def train_maxent_classifier_with_scipy(train_toks, trace=3, encoding=None, labels=None, algorithm='CG', sparse=True, gaussian_prior_sigma=0, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the specified ``scipy`` optimization algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. :require: The ``scipy`` package must be installed. """ try: import scipy except ImportError, e: raise ValueError('The maxent training algorithm %r requires ' 'that the scipy package be installed. See ' 'http://www.scipy.org/' % algorithm) try: # E.g., if libgfortran.2.dylib is not found. import scipy.sparse, scipy.maxentropy except ImportError, e: raise ValueError('Import of scipy package failed: %s' % e) # Construct an encoding from the training data. if encoding is None: encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) elif labels is not None: raise ValueError('Specify encoding or labels, not both') labels = encoding.labels() labelnum = dict([(label, i) for (i, label) in enumerate(labels)]) num_features = encoding.length() num_toks = len(train_toks) num_labels = len(labels) # Decide whether to use a sparse matrix or a dense one. Very # limited testing has shown that the lil matrix format # (list-of-lists) performs better than csr and csc formats. # Limited testing also suggests that the sparse matrix format # doesn't save much memory over the dense format in practice # (in terms of max memory usage). if sparse: zeros = scipy.sparse.lil_matrix else: zeros = numpy.zeros # Construct the 'F' matrix, which lists the feature values for # each training instance. F[i, j*len(labels)+k] is equal to the # value of the i'th feature for the feature vector corresponding # to (tok[j], label[k]). F = zeros((num_features, num_toks*num_labels)) # Construct the 'N' matrix, which specifies the correct label for # each training instance. N[0, j*len(labels)+k] is equal to one # iff label[k] is the correct label for tok[j]. N = zeros((1, num_toks*num_labels)) # Fill in the 'F' and 'N' matrices (just make one pass through the # training tokens.) 
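    # Illustrative column layout: with two training tokens and labels
    # ['x', 'y'], the columns are ordered
    # [tok0/'x', tok0/'y', tok1/'x', tok1/'y']; if tok1's correct label
    # is 'y', then N[0, 1*2+1] is set to 1.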
for toknum, (featureset, label) in enumerate(train_toks): N[0, toknum*len(labels) + labelnum[label]] += 1 for label2 in labels: for (fid, fval) in encoding.encode(featureset, label2): F[fid, toknum*len(labels) + labelnum[label2]] = fval # Set up the scipy model, based on the matrices F and N. model = scipy.maxentropy.conditionalmodel(F, N, num_toks) # note -- model.setsmooth() is buggy. if gaussian_prior_sigma: model.sigma2 = gaussian_prior_sigma**2 if algorithm == 'LBFGSB': model.log = None if trace >= 3: model.verbose = True if 'max_iter' in cutoffs: model.maxiter = cutoffs['max_iter'] if 'tolerance' in cutoffs: if algorithm == 'CG': model.avegtol = cutoffs['tolerance'] elif algorithm == 'LBFGSB': model.maxgtol = cutoffs['tolerance'] else: model.tol = cutoffs['tolerance'] # Train the model. model.fit(algorithm=algorithm) # Convert the model's weights from base-e to base-2 weights. weights = model.params * numpy.log2(numpy.e) # Build the classifier return MaxentClassifier(encoding, weights) ###################################################################### #{ Classifier Trainer: megam ###################################################################### # [xx] possible extension: add support for using implicit file format; # this would need to put requirements on what encoding is used. But # we may need this for other maxent classifier trainers that require # implicit formats anyway. def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the external ``megam`` library. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. :see: ``nltk.classify.megam`` """ explicit = True bernoulli = True if 'explicit' in kwargs: explicit = kwargs['explicit'] if 'bernoulli' in kwargs: bernoulli = kwargs['bernoulli'] # Construct an encoding from the training data. if encoding is None: # Count cutoff can also be controlled by megam with the -minfc # option. Not sure where the best place for it is. count_cutoff = kwargs.get('count_cutoff', 0) encoding = BinaryMaxentFeatureEncoding.train(train_toks, count_cutoff, labels=labels, alwayson_features=True) elif labels is not None: raise ValueError('Specify encoding or labels, not both') # Write a training file for megam. try: fd, trainfile_name = tempfile.mkstemp(prefix='nltk-', suffix='.gz') trainfile = gzip.open(trainfile_name, 'wb') write_megam_file(train_toks, encoding, trainfile, \ explicit=explicit, bernoulli=bernoulli) trainfile.close() except (OSError, IOError, ValueError), e: raise ValueError('Error while creating megam training file: %s' % e) # Run megam on the training file. options = [] options += ['-nobias', '-repeat', '10'] if explicit: options += ['-explicit'] if not bernoulli: options += ['-fvals'] if gaussian_prior_sigma: # Lambda is just the precision of the Gaussian prior, i.e. it's the # inverse variance, so the parameter conversion is 1.0/sigma**2. # See http://www.cs.utah.edu/~hal/docs/daume04cg-bfgs.pdf. 
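        # For example, gaussian_prior_sigma=1.0 gives -lambda 1.00 (a
        # strong prior), while gaussian_prior_sigma=10.0 gives -lambda
        # 0.01 (a weak prior).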
inv_variance = 1.0 / gaussian_prior_sigma**2 else: inv_variance = 0 options += ['-lambda', '%.2f' % inv_variance, '-tune'] if trace < 3: options += ['-quiet'] if 'max_iter' in kwargs: options += ['-maxi', '%s' % kwargs['max_iter']] if 'll_delta' in kwargs: # [xx] this is actually a perplexity delta, not a log # likelihood delta options += ['-dpp', '%s' % abs(kwargs['ll_delta'])] if hasattr(encoding, 'cost'): options += ['-multilabel'] # each possible la options += ['multiclass', trainfile_name] stdout = call_megam(options) # print './megam_i686.opt ', ' '.join(options) # Delete the training file try: os.remove(trainfile_name) except (OSError, IOError), e: print 'Warning: unable to delete %s: %s' % (trainfile_name, e) # Parse the generated weight vector. weights = parse_megam_weights(stdout, encoding.length(), explicit) # Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e) # Build the classifier return MaxentClassifier(encoding, weights) ###################################################################### #{ Classifier Trainer: tadm ###################################################################### class TadmMaxentClassifier(MaxentClassifier): @classmethod def train(cls, train_toks, **kwargs): algorithm = kwargs.get('algorithm', 'tao_lmvm') trace = kwargs.get('trace', 3) encoding = kwargs.get('encoding', None) labels = kwargs.get('labels', None) sigma = kwargs.get('gaussian_prior_sigma', 0) count_cutoff = kwargs.get('count_cutoff', 0) max_iter = kwargs.get('max_iter') ll_delta = kwargs.get('min_lldelta') # Construct an encoding from the training data. if not encoding: encoding = TadmEventMaxentFeatureEncoding.train(train_toks, count_cutoff, labels=labels) trainfile_fd, trainfile_name = \ tempfile.mkstemp(prefix='nltk-tadm-events-', suffix='.gz') weightfile_fd, weightfile_name = \ tempfile.mkstemp(prefix='nltk-tadm-weights-') trainfile = gzip.open(trainfile_name, 'wb') write_tadm_file(train_toks, encoding, trainfile) trainfile.close() options = [] options.extend(['-monitor']) options.extend(['-method', algorithm]) if sigma: options.extend(['-l2', '%.6f' % sigma**2]) if max_iter: options.extend(['-max_it', '%d' % max_iter]) if ll_delta: options.extend(['-fatol', '%.6f' % abs(ll_delta)]) options.extend(['-events_in', trainfile_name]) options.extend(['-params_out', weightfile_name]) if trace < 3: options.extend(['2>&1']) else: options.extend(['-summary']) call_tadm(options) weightfile = open(weightfile_name, 'rb') weights = parse_tadm_weights(weightfile) weightfile.close() os.remove(trainfile_name) os.remove(weightfile_name) # Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e) # Build the classifier return cls(encoding, weights) ###################################################################### #{ Demo ###################################################################### def demo(): from nltk.classify.util import names_demo classifier = names_demo(MaxentClassifier.train) if __name__ == '__main__': demo()
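# The following is a minimal, self-contained usage sketch (not part of the
# original demo above).  It trains on a tiny made-up corpus with the
# pure-Python IIS trainer, so it needs only numpy -- no scipy, megam, or
# tadm.  The featuresets, labels, and iteration count are illustrative.
def toy_demo():
    train_toks = [
        ({'initial_letter': 'k', 'length': 'long'},  'female'),
        ({'initial_letter': 'k', 'length': 'short'}, 'female'),
        ({'initial_letter': 'd', 'length': 'short'}, 'male'),
        ({'initial_letter': 'd', 'length': 'long'},  'male'),
    ]
    classifier = MaxentClassifier.train(train_toks, algorithm='iis',
                                        trace=0, max_iter=10)
    # Classify an unseen featureset and inspect the label distribution.
    test_fs = {'initial_letter': 'k', 'length': 'short'}
    pdist = classifier.prob_classify(test_fs)
    for label in classifier.labels():
        print('%7s: %.3f' % (label, pdist.prob(label)))
    # Show which joint-features carry the largest (absolute) weights.
    classifier.show_most_informative_features(n=5)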