python 2.7 - My Maxent Classifier works fine with gis algorithm but does not work with iis algorithm. It is not throwing any error, just some warnings -
I am trying to implement a MaxEnt classifier, but I am facing a problem while using the IIS algorithm. The following code works fine with the GIS algorithm.
import nltk nltk.classify import maxentclassifier, accuracy featx import split_label_feats, label_feats_from_corpus nltk.corpus import movie_reviews nltk.classify import megam openpyxl import load_workbook featx import bag_of_non_words nltk.tokenize import word_tokenize movie_reviews.categories() lfeats = label_feats_from_corpus(movie_reviews) lfeats.keys() train_feats, test_feats = split_label_feats(lfeats) me_classifier = nltk.maxentclassifier.train(train_feats, algorithm='iis', trace=0, max_iter=3) print accuracy(me_classifier, test_feats)
I am working on a Win32 machine, and the above code is from the NLTK book by Jacob Perkins. The warnings thrown are:
c:\python27\lib\site-packages\nltk\classify\maxent.py:1308: runtimewarning: invalid value encountered in multiply sum1 = numpy.sum(exp_nf_delta * a, axis=0) c:\python27\lib\site-packages\nltk\classify\maxent.py:1309: runtimewarning: invalid value encountered in multiply sum2 = numpy.sum(nf_exp_nf_delta * a, axis=0) c:\python27\lib\site-packages\nltk\classify\maxent.py:1315: runtimewarning: invalid value encountered in divide deltas -= (ffreq_empirical - sum1) / -sum2
and the computer hangs, so I have to stop the execution.
.
Firstly, the way you're importing libraries is unsorted and confusing, and there are a lot of unused imports. After some googling, let's cut down the imports and stick to this:
from collections import defaultdict

import nltk
from nltk.classify import MaxentClassifier, accuracy
from nltk.corpus import movie_reviews
Then I found that `featx` is an example module that Jacob Perkins uses in his book; this is a better source (https://github.com/sophist114/python/blob/master/emotionanalysis.py). Here's a documented version with some explanation of what the functions are doing:
def bag_of_words(words):
    """Change a document into a bag-of-words (BOW) feature dict.

    :param words: iterable of the tokens that make up one document.
    :return: dict mapping every distinct token to ``True``.
    """
    return {word: True for word in words}


def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """Change the corpus into per-label lists of feature dicts.

    This process is sometimes also known as vectorization. The default
    is to use BOW features.

    :param corp: corpus object providing ``categories()``,
        ``fileids(categories=...)`` and ``words(fileids=...)``.
    :param feature_detector: callable that turns the words of one
        document into a feature dict.
    :return: ``defaultdict(list)`` mapping each label to the list of
        feature dicts of the documents carrying that label.
    """
    label_feats = defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats


def split_label_feats(lfeats, split=0.75):
    """Split the corpus into a train and a test portion.

    This function is used after ``label_feats_from_corpus``.

    :param lfeats: mapping of label -> list of feature dicts.
    :param split: fraction of each label's documents (taken from the
        front of the list) that goes into the training portion.
    :return: ``(train_feats, test_feats)``, two lists of
        ``(features, label)`` tuples.
    """
    train_feats = []
    test_feats = []
    # items() rather than iteritems() so this runs on Python 2 and 3 alike.
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats
Now let's go through the process of training a model and testing it. First, the feature extraction:
# Extract features from the corpus and label each document with the appropriate label.
label_feats = label_feats_from_corpus(movie_reviews)
Let's see what we get after calling `label_feats_from_corpus`:
for label in label_feats: document in label_feats[label]: print label, document break break
[out]:
neg {u'all': true, u'concept': true, u'skip': true, u'go': true, u'seemed': true, u'suits': true, u'presents': true, u'to': true, u'sitting': true, u'very': true, u'horror': true, u'continues': true, u'every': true, u'exact': true, u'cool': true, u'entire': true, u'did': true, u'dig': true, u'flick': true, u'neighborhood': true, u'crow': true, u'street': true, u'video': true, u'further': true, u'even': true, u'what': true, u'hide': true, u'giving': true, u'new': true, u'ever': true, u'here': true, u'understanding': true, u'entertain': true, u'studio': true, u'others': true, u'kudos': true, u'weird': true, u'makes': true, u'explained': true, u'rarely': true, u'plot': true, u'fed': true, u'disappearances': true, u'from': true, u'would': true, u'&': true, u'two': true, u'music': true, u'films': true, u'themselves': true, u'until': true, u'more': true, u'teen': true, u'clue': true, u'stick': true, u'given': true, u'me': true, u'this': true, u'package': true, u'movies': true, u'making': true, u'my': true, u'give': true, u'fuck': true, u'want': true, u'sense': true, u'!': true, u'holds': true, u'write': true, u'how': true, u'hot': true, u'stir': true, u'okay': true, u'beauty': true, u'mess': true, u'overall': true, u'after': true, u'coming': true, u'such': true, u'guys': true, u'types': true, u'a': true, u'downshifts': true, u'chasing': true, u'redundant': true, u'so': true, u'enter': true, u'playing': true, u'executed': true, u'over': true, u'insight': true, u'years': true, u'still': true, u'its': true, u'before': true, u'thrilling': true, u'somewhere': true, u',': true, u'actually': true, u'meantime': true, u'production': true, u'main': true, u'might': true, u'then': true, u'good': true, u'break': true, u'they': true, u'half': true, u'not': true, u'now': true, u'always': true, u'didn': true, u'arrow': true, u'mean': true, u'bentley': true, u'generation': true, u'idea': true, u'engaging': true, u'happen': true, u'out': true, u"'": true, u'since': true, u'7': true, 
u'got': true, u'highway': true, u'shows': true, u'blair': true, u'turning': true, u'little': true, u'completely': true, u'shelves': true, u'starts': true, u'terribly': true, u'american': true, u'jumbled': true, u'chopped': true, u'one': true, u'fantasy': true, u'visions': true, u'guess': true, u'"': true, u'2': true, u'too': true, u'wrapped': true, u'final': true, u'slasher': true, u'that': true, u'explanation': true, u'took': true, u'part': true, u'attempt': true, u'10': true, u'kind': true, u'scenes': true, u'feeling': true, u'and': true, u'mind': true, u'sad': true, u'have': true, u'need': true, u'seem': true, u'apparently': true, u'-': true, u'also': true, u'which': true, u'sure': true, u'normal': true, u'who': true, u'most': true, u'don': true, u'drive': true, u'ways': true, u'entertaining': true, u'review': true, u'came': true, u'ending': true, u'find': true, u'touches': true, u'craziness': true, u'(': true, u'should': true, u'only': true, u'going': true, u'pretty': true, u'joblo': true, u'folks': true, u'8': true, u'do': true, u'his': true, u'get': true, u'watch': true, u'feels': true, u'despite': true, u'him': true, u'bad': true, u'where': true, u'lazy': true, u'see': true, u'decided': true, u'are': true, u'sorta': true, u'movie': true, u'nightmare': true, u'3': true, u'unravel': true, u'melissa': true, u'correctly': true, u'flicks': true, u'we': true, u'packaged': true, u'nightmares': true, u'genre': true, u'20': true, u'memento': true, u'both': true, u'accident': true, u's': true, u'witch': true, u'point': true, u'character': true, u'whatever': true, u'tons': true, u'simply': true, u'church': true, u'throughout': true, u'decent': true, u'been': true, u'.': true, u'secret': true, u'life': true, u'kids': true, u'personally': true, u'look': true, u'these': true, u'plain': true, u'harder': true, u'apparitions': true, u'while': true, u'neat': true, u've': true, u'is': true, u'it': true, u'couples': true, u'someone': true, u'in': true, u'chase': true, 
u'different': true, u')': true, u'things': true, u'make': true, u'same': true, u'member': true, u'strange': true, u'9': true, u'party': true, u'applaud': true, u'drink': true, u'director': true, u'running': true, u'characters': true, u'off': true, u'i': true, u'salvation': true, u'well': true, u'obviously': true, u'edge': true, u'echoes': true, u'the': true, u'away': true, u'just': true, u'generally': true, u'elm': true, u'excites': true, u'seems': true, u'snag': true, u'wes': true, u'4': true, u'has': true, u'big': true, u'showing': true, u'five': true, u'know': true, u'world': true, u'bit': true, u'password': true, u'dreams': true, u'like': true, u'lost': true, u'audience': true, u't': true, u'looooot': true, u'because': true, u'deal': true, u'people': true, u'back': true, u'dead': true, u'unraveling': true, u'critique': true, u'confusing': true, u'for': true, u'bottom': true, u'/': true, u'does': true, u'assuming': true, u'?': true, u'be': true, u'although': true, u'by': true, u'on': true, u'about': true, u'oh': true, u'of': true, u'runtime': true, u'or': true, u'own': true, u'strangeness': true, u'into': true, u'down': true, u'your': true, u'her': true, u'there': true, u'start': true, u'way': true, u'biggest': true, u':': true, u'head': true, u'offering': true, u'but': true, u'taken': true, u'line': true, u'trying': true, u'with': true, u'he': true, u'up': true, u'us': true, u'problem': true, u'minutes': true, u'figured': true, u'doesn': true, u'an': true, u'as': true, u'girlfriend': true, u'mold': true, u'sagemiller': true, u'film': true, u'again': true, u'no': true, u'when': true, u'actors': true, u'you': true, u'really': true, u'dies': true, u'problems': true, u'ago': true}
So this is a document with the `neg` label, and for each word in our document we see that the word maps to `True`. Each document's dict contains only the features (i.e. the words) that it has.
let's move on:
# Let's split the data up into train and test.
train_documents, test_documents = split_label_feats(label_feats)
Now we see that `split_label_feats` changes the key-value structure such that each iteration of the returned `train_feats` list gives us a document as a tuple of (features, label):
for features, label in train_documents: label, features break print len(train_documents) print len(test_documents) # number of documents in movie_review corpus num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) cat in movie_reviews.categories()]))) print len(train_documents) + len(test_documents) == num_docs_in_corpus
[out]:
1500 500 true
So it seems that the error can only be caused by the last 2 lines of your code. When you run this line:
# Train the classifier.
me_classifier = nltk.MaxentClassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3)
You get these warnings, but do note that the code is still building the model! They are just warnings due to underflow; see "What are arithmetic underflow and overflow in C?".
It takes a while to build the classifier, but fear not: just wait until it finishes, and don't press Ctrl + C
to end the Python process. If you kill the process, you will see this:
training stopped: keyboard interrupt
So let's understand why the warnings occur. There are 4 warnings given:
/usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1306: runtimewarning: overflow encountered in power exp_nf_delta = 2 ** nf_delta /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1308: runtimewarning: invalid value encountered in multiply sum1 = numpy.sum(exp_nf_delta * a, axis=0) /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1309: runtimewarning: invalid value encountered in multiply sum2 = numpy.sum(nf_exp_nf_delta * a, axis=0) /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1315: runtimewarning: invalid value encountered in divide deltas -= (ffreq_empirical - sum1) / -sum2
All of them point to the same function used to calculate the delta in NLTK's MaxEnt implementation, i.e. https://github.com/nltk/nltk/blob/develop/nltk/classify/maxent.py#L1208. And we find out that this delta calculation is specific to the IIS (Improved Iterative Scaling) algorithm.
At this point, you need to learn about machine learning and supervised learning: https://en.wikipedia.org/wiki/supervised_learning
To answer your question: the warning is merely an indication that the delta is hard to calculate at some point, but it's still reasonable to deal with, possibly because of super small values when calculating the delta. The algorithm is working. It's not hanging, it's training.
In order to appreciate the neat implementation of MaxEnt in NLTK, I suggest you go through this course, https://www.youtube.com/playlist?list=pl6397e4b26d00a269, or for a more hardcore machine learning course, go to https://www.coursera.org/course/ml
Training a classifier takes time and computing juice, and after you wait long enough, you should see that it does finish:
print accuracy(me_classifier, test_feats)
[out]:
0.5
You can see that the accuracy is bad, as expected since the delta calculation is going too far; 0.5 is your baseline. Go through the courses listed above and you should be able to produce better classifiers after knowing how they come about and how to tune them.
BTW, remember to pickle your classifier so that you don't have to retrain it next time; see "Save Naive Bayes Trained Classifier in NLTK" and "Pickling a trained classifier yields different results from the results obtained directly from a newly but identically trained classifier".
here's full code:
from itertools import chain collections import defaultdict import nltk nltk.classify import maxentclassifier, accuracy nltk.corpus import movie_reviews def bag_of_words(words): """ change document bow feature vector represented dict object. """ return dict([(word, true) word in words]) def label_feats_from_corpus(corp, feature_detector=bag_of_words): """ change corpus feature matrix. proceess known vectorization. default use bow features. """ label_feats = defaultdict(list) label in corp.categories(): fileid in corp.fileids(categories=[label]): feats = feature_detector(corp.words(fileids=[fileid])) label_feats[label].append(feats) return label_feats def split_label_feats(lfeats, split=0.75): """ splits corpus train , test portion. module used after using `label_feats_from_corpus`. """ train_feats = [] test_feats = [] label, feats in lfeats.iteritems(): cutoff = int(len(feats) * split) train_feats.extend([(feat, label) feat in feats[:cutoff]]) test_feats.extend([(feat, label) feat in feats[cutoff:]]) return train_feats, test_feats # extract features corpus , each document label appropriate labels. label_feats = label_feats_from_corpus(movie_reviews) ''' label in label_feats: document in label_feats[label]: print label, document break break ''' # let's split data train , test. train_documents, test_documents = split_label_feats(label_feats) ''' # see `split_label_feats` change key value structure such each iteration of train_feats gives document tuple of (features, label) features, label in train_documents: print label, features break print len(train_documents) print len(test_documents) # number of documents in movie_review corpus num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) cat in movie_reviews.categories()]))) print len(train_documents) + len(test_documents) == num_docs_in_corpus ''' # train tagger. 
me_classifier = nltk.maxentclassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3) print accuracy(me_classifier, test_feats)
Comments
Post a Comment