python 2.7 - My Maxent Classifier works fine with gis algorithm but does not work with iis algorithm. It is not throwing any error, just some warnings -


I am trying to implement a MaxEnt classifier, but I am facing a problem while using the IIS algorithm. The following code works fine with the GIS algorithm.

import nltk nltk.classify import maxentclassifier, accuracy featx import split_label_feats, label_feats_from_corpus nltk.corpus import movie_reviews nltk.classify import megam openpyxl import load_workbook featx import bag_of_non_words   nltk.tokenize import word_tokenize movie_reviews.categories() lfeats = label_feats_from_corpus(movie_reviews)  lfeats.keys() train_feats, test_feats = split_label_feats(lfeats) me_classifier = nltk.maxentclassifier.train(train_feats, algorithm='iis', trace=0, max_iter=3) print accuracy(me_classifier, test_feats) 

I am working on a Win32 machine, and the above code is from the NLTK book by Jacob Perkins. This is the warning thrown:

c:\python27\lib\site-packages\nltk\classify\maxent.py:1308: runtimewarning: invalid value encountered in multiply   sum1 = numpy.sum(exp_nf_delta * a, axis=0) c:\python27\lib\site-packages\nltk\classify\maxent.py:1309: runtimewarning: invalid value encountered in multiply   sum2 = numpy.sum(nf_exp_nf_delta * a, axis=0) c:\python27\lib\site-packages\nltk\classify\maxent.py:1315: runtimewarning: invalid value encountered in divide   deltas -= (ffreq_empirical - sum1) / -sum2 

And the computer hangs, so I have to stop the execution.

.

Firstly, the way you're importing libraries is unsorted and confusing. There are also a lot of unused imports. After some googling, let's cut down the imports and stick to this:

from collections import defaultdict  import nltk nltk.classify import maxentclassifier, accuracy nltk.corpus import movie_reviews 

Then I found that featx is an example module that Jacob Perkins uses in his book; this is a better source (https://github.com/sophist114/python/blob/master/emotionanalysis.py). So here's a documented version with some explanation of what the functions are doing:

def bag_of_words(words):     """     change document bow feature vector represented dict object.     """     return dict([(word, true) word in words])   def label_feats_from_corpus(corp, feature_detector=bag_of_words):     """     change corpus feature matrix. proceess      known vectorization. default use bow features.     """     label_feats = defaultdict(list)     label in corp.categories():         fileid in corp.fileids(categories=[label]):             feats = feature_detector(corp.words(fileids=[fileid]))             label_feats[label].append(feats)     return label_feats   def split_label_feats(lfeats, split=0.75):     """     splits corpus train , test portion.     module used after using `label_feats_from_corpus`.     """     train_feats = []     test_feats = []     label, feats in lfeats.iteritems():         cutoff = int(len(feats) * split)         train_feats.extend([(feat, label) feat in feats[:cutoff]])         test_feats.extend([(feat, label) feat in feats[cutoff:]])     return train_feats, test_feats 

Now let's go through the process of training a model and testing it. First, the feature extraction:

# extract features corpus , each document label appropriate labels.  label_feats = label_feats_from_corpus(movie_reviews) 

Let's see what we get after calling label_feats_from_corpus:

for label in label_feats:     document in label_feats[label]:          print label, document         break     break 

[out]:

neg {u'all': true, u'concept': true, u'skip': true, u'go': true, u'seemed': true, u'suits': true, u'presents': true, u'to': true, u'sitting': true, u'very': true, u'horror': true, u'continues': true, u'every': true, u'exact': true, u'cool': true, u'entire': true, u'did': true, u'dig': true, u'flick': true, u'neighborhood': true, u'crow': true, u'street': true, u'video': true, u'further': true, u'even': true, u'what': true, u'hide': true, u'giving': true, u'new': true, u'ever': true, u'here': true, u'understanding': true, u'entertain': true, u'studio': true, u'others': true, u'kudos': true, u'weird': true, u'makes': true, u'explained': true, u'rarely': true, u'plot': true, u'fed': true, u'disappearances': true, u'from': true, u'would': true, u'&': true, u'two': true, u'music': true, u'films': true, u'themselves': true, u'until': true, u'more': true, u'teen': true, u'clue': true, u'stick': true, u'given': true, u'me': true, u'this': true, u'package': true, u'movies': true, u'making': true, u'my': true, u'give': true, u'fuck': true, u'want': true, u'sense': true, u'!': true, u'holds': true, u'write': true, u'how': true, u'hot': true, u'stir': true, u'okay': true, u'beauty': true, u'mess': true, u'overall': true, u'after': true, u'coming': true, u'such': true, u'guys': true, u'types': true, u'a': true, u'downshifts': true, u'chasing': true, u'redundant': true, u'so': true, u'enter': true, u'playing': true, u'executed': true, u'over': true, u'insight': true, u'years': true, u'still': true, u'its': true, u'before': true, u'thrilling': true, u'somewhere': true, u',': true, u'actually': true, u'meantime': true, u'production': true, u'main': true, u'might': true, u'then': true, u'good': true, u'break': true, u'they': true, u'half': true, u'not': true, u'now': true, u'always': true, u'didn': true, u'arrow': true, u'mean': true, u'bentley': true, u'generation': true, u'idea': true, u'engaging': true, u'happen': true, u'out': true, u"'": true, u'since': true, u'7': true, 
u'got': true, u'highway': true, u'shows': true, u'blair': true, u'turning': true, u'little': true, u'completely': true, u'shelves': true, u'starts': true, u'terribly': true, u'american': true, u'jumbled': true, u'chopped': true, u'one': true, u'fantasy': true, u'visions': true, u'guess': true, u'"': true, u'2': true, u'too': true, u'wrapped': true, u'final': true, u'slasher': true, u'that': true, u'explanation': true, u'took': true, u'part': true, u'attempt': true, u'10': true, u'kind': true, u'scenes': true, u'feeling': true, u'and': true, u'mind': true, u'sad': true, u'have': true, u'need': true, u'seem': true, u'apparently': true, u'-': true, u'also': true, u'which': true, u'sure': true, u'normal': true, u'who': true, u'most': true, u'don': true, u'drive': true, u'ways': true, u'entertaining': true, u'review': true, u'came': true, u'ending': true, u'find': true, u'touches': true, u'craziness': true, u'(': true, u'should': true, u'only': true, u'going': true, u'pretty': true, u'joblo': true, u'folks': true, u'8': true, u'do': true, u'his': true, u'get': true, u'watch': true, u'feels': true, u'despite': true, u'him': true, u'bad': true, u'where': true, u'lazy': true, u'see': true, u'decided': true, u'are': true, u'sorta': true, u'movie': true, u'nightmare': true, u'3': true, u'unravel': true, u'melissa': true, u'correctly': true, u'flicks': true, u'we': true, u'packaged': true, u'nightmares': true, u'genre': true, u'20': true, u'memento': true, u'both': true, u'accident': true, u's': true, u'witch': true, u'point': true, u'character': true, u'whatever': true, u'tons': true, u'simply': true, u'church': true, u'throughout': true, u'decent': true, u'been': true, u'.': true, u'secret': true, u'life': true, u'kids': true, u'personally': true, u'look': true, u'these': true, u'plain': true, u'harder': true, u'apparitions': true, u'while': true, u'neat': true, u've': true, u'is': true, u'it': true, u'couples': true, u'someone': true, u'in': true, u'chase': true, 
u'different': true, u')': true, u'things': true, u'make': true, u'same': true, u'member': true, u'strange': true, u'9': true, u'party': true, u'applaud': true, u'drink': true, u'director': true, u'running': true, u'characters': true, u'off': true, u'i': true, u'salvation': true, u'well': true, u'obviously': true, u'edge': true, u'echoes': true, u'the': true, u'away': true, u'just': true, u'generally': true, u'elm': true, u'excites': true, u'seems': true, u'snag': true, u'wes': true, u'4': true, u'has': true, u'big': true, u'showing': true, u'five': true, u'know': true, u'world': true, u'bit': true, u'password': true, u'dreams': true, u'like': true, u'lost': true, u'audience': true, u't': true, u'looooot': true, u'because': true, u'deal': true, u'people': true, u'back': true, u'dead': true, u'unraveling': true, u'critique': true, u'confusing': true, u'for': true, u'bottom': true, u'/': true, u'does': true, u'assuming': true, u'?': true, u'be': true, u'although': true, u'by': true, u'on': true, u'about': true, u'oh': true, u'of': true, u'runtime': true, u'or': true, u'own': true, u'strangeness': true, u'into': true, u'down': true, u'your': true, u'her': true, u'there': true, u'start': true, u'way': true, u'biggest': true, u':': true, u'head': true, u'offering': true, u'but': true, u'taken': true, u'line': true, u'trying': true, u'with': true, u'he': true, u'up': true, u'us': true, u'problem': true, u'minutes': true, u'figured': true, u'doesn': true, u'an': true, u'as': true, u'girlfriend': true, u'mold': true, u'sagemiller': true, u'film': true, u'again': true, u'no': true, u'when': true, u'actors': true, u'you': true, u'really': true, u'dies': true, u'problems': true, u'ago': true} 

So this is a document with the neg label, and for each word in our document, we see that the word is marked as true. Each document contains only the features (i.e. words) that it has.

let's move on:

# let's split data train , test. train_feats, test_feats = split_label_feats(label_feat)  

Now we see that split_label_feats changes the key-value structure such that each iteration over train_feats gives us a document as a tuple of (features, label):

for features, label in train_documents:     label, features     break  print len(train_documents) print len(test_documents) # number of documents in movie_review corpus num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) cat in movie_reviews.categories()]))) print len(train_documents) + len(test_documents) == num_docs_in_corpus 

[out]:

1500 500 true 

So it seems that the error can only be caused by the last 2 lines of your code. When you run this line:

# train tagger. me_classifier = nltk.maxentclassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3) 

You get these warnings, but do note that the code is still building the model! They are just warnings due to underflow; see "What are arithmetic underflow and overflow in C?"

It takes a while to build the classifier, but fear not: just wait until it has finished, and don't press Ctrl+C to end the Python process. If you kill the process, you will see this:

training stopped: keyboard interrupt 

So let's understand why the warnings occur. There are 4 warnings given:

/usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1306: runtimewarning: overflow encountered in power   exp_nf_delta = 2 ** nf_delta /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1308: runtimewarning: invalid value encountered in multiply   sum1 = numpy.sum(exp_nf_delta * a, axis=0) /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1309: runtimewarning: invalid value encountered in multiply   sum2 = numpy.sum(nf_exp_nf_delta * a, axis=0) /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1315: runtimewarning: invalid value encountered in divide   deltas -= (ffreq_empirical - sum1) / -sum2 

All of them point to the same function used to calculate the delta in NLTK's MaxEnt implementation, i.e. https://github.com/nltk/nltk/blob/develop/nltk/classify/maxent.py#l1208 . And you will find out that this delta calculation is specific to the IIS (Improved Iterative Scaling) algorithm.

At this point, you need to learn about machine learning and supervised learning: https://en.wikipedia.org/wiki/supervised_learning

To answer your question: the warning is merely an indication that the delta is hard to calculate at some point, but it is still reasonable to deal with, possibly because of some super small values when calculating the delta. The algorithm is working. It's not hanging, it's training.

In order to appreciate the neat implementation of MaxEnt in NLTK, I suggest that you go through this course: https://www.youtube.com/playlist?list=pl6397e4b26d00a269 or, for a more hardcore machine learning course, go to https://www.coursera.org/course/ml

Training a classifier takes time and computing juice, and after you wait long enough, you should see that it does finish:

print accuracy(me_classifier, test_feats) 

[out]:

0.5 

You can see that the accuracy is bad, as expected, since the delta calculation is going too far; 0.5 is your baseline. Go through the courses listed above, and you should be able to produce better classifiers after knowing how they come about and how to tune them.

BTW, remember to pickle your classifier so that you don't have to retrain it next time; see "Save Naive Bayes Trained Classifier in NLTK" and "Pickling a trained classifier yields different results from the results obtained directly from a newly but identically trained classifier".

here's full code:

from itertools import chain collections import defaultdict  import nltk nltk.classify import maxentclassifier, accuracy nltk.corpus import movie_reviews  def bag_of_words(words):     """     change document bow feature vector represented dict object.     """     return dict([(word, true) word in words])   def label_feats_from_corpus(corp, feature_detector=bag_of_words):     """     change corpus feature matrix. proceess      known vectorization. default use bow features.     """     label_feats = defaultdict(list)     label in corp.categories():         fileid in corp.fileids(categories=[label]):             feats = feature_detector(corp.words(fileids=[fileid]))             label_feats[label].append(feats)     return label_feats   def split_label_feats(lfeats, split=0.75):     """     splits corpus train , test portion.     module used after using `label_feats_from_corpus`.     """     train_feats = []     test_feats = []     label, feats in lfeats.iteritems():         cutoff = int(len(feats) * split)         train_feats.extend([(feat, label) feat in feats[:cutoff]])         test_feats.extend([(feat, label) feat in feats[cutoff:]])     return train_feats, test_feats   # extract features corpus , each document label appropriate labels.  label_feats = label_feats_from_corpus(movie_reviews) ''' label in label_feats:     document in label_feats[label]:          print label, document         break     break '''  # let's split data train , test. 
train_documents, test_documents = split_label_feats(label_feats)  ''' # see `split_label_feats` change key value structure such each iteration of train_feats gives document tuple of (features, label) features, label in train_documents:     print label, features     break  print len(train_documents) print len(test_documents) # number of documents in movie_review corpus num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) cat in movie_reviews.categories()]))) print len(train_documents) + len(test_documents) == num_docs_in_corpus '''  # train tagger. me_classifier = nltk.maxentclassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3) print accuracy(me_classifier, test_feats) 

Comments

Popular posts from this blog

c++ - OpenMP unpredictable overhead -

ruby on rails - RuntimeError: Circular dependency detected while autoloading constant - ActiveAdmin.register Role -

javascript - Wordpress slider, not displayed 100% width -