def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    The located binary is stored in the module-level ``_megam_bin``
    variable.

    :param bin: The full path to the ``megam`` binary.  If not
        specified, then nltk will search the system for a ``megam``
        binary; and if one is not found, it will raise a
        ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin

    # Executable names that are tried when looking for megam.
    candidate_names = ['megam.opt', 'megam', 'megam_686', 'megam_i686.opt']
    _megam_bin = find_binary(
        'megam',
        bin,
        env_vars=['MEGAM'],
        binary_names=candidate_names,
        url='http://www.umiacs.umd.edu/~hal/megam/index.html',
    )


######################################################################
#{ Megam Interface Functions
######################################################################

def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens.  One line is written to ``stream`` per training
    instance.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  May optionally implement a ``cost()``
        method in order to assign different costs to different class
        predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    # Map every label to its position in the encoding's label list.
    labels = encoding.labels()
    label_index = {name: position for position, name in enumerate(labels)}

    # Each training instance becomes exactly one line of output.
    for featureset, gold in train_toks:
        # The line starts with the instance's label number or, in the
        # weighted multiclass case, with the cost of every label.
        if hasattr(encoding, 'cost'):
            costs = [
                str(encoding.cost(featureset, gold, candidate))
                for candidate in labels
            ]
            stream.write(':'.join(costs))
        else:
            stream.write('%d' % label_index[gold])

        if explicit:
            # Explicit format: one '#'-introduced group per possible
            # label, listing the features that would fire for it.
            for candidate in labels:
                stream.write(' #')
                _write_megam_features(
                    encoding.encode(featureset, candidate), stream, bernoulli
                )
        else:
            # Implicit format: only the features that fire for the
            # instance's actual label.
            _write_megam_features(
                encoding.encode(featureset, gold), stream, bernoulli
            )

        # Terminate the instance.
        stream.write('\n')

def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    Every non-blank line of ``s`` is expected to hold exactly two
    whitespace-separated fields: an integer feature id and its weight.

    :param s: The text that ``megam`` wrote to stdout.
    :type s: str
    :param features_count: The length of the weight vector to build;
        ids that do not occur in ``s`` keep a weight of zero.
    :type features_count: int
    :param explicit: Must be true; output from non-explicit training
        is not supported yet.
    :return: An array of ``features_count`` doubles.
    :raise ValueError: If numpy is not installed, or if a non-blank
        line does not consist of an integer id followed by a float.
    :raise AssertionError: If ``explicit`` is false.
    """
    if numpy is None:
        raise ValueError('This function requires that numpy be installed')
    if not explicit:
        # Raised explicitly instead of using an ``assert`` statement, so
        # that the check is not stripped when Python runs with ``-O``.
        raise AssertionError('non-explicit not supported yet')

    weights = numpy.zeros(features_count, 'd')
    for line in s.strip().split('\n'):
        fields = line.split()
        if not fields:
            # Blank (or whitespace-only) line: nothing to record.
            continue
        fid, weight = fields
        weights[int(fid)] = float(weight)
    return weights

def _write_megam_features(vector, stream, bernoulli):
    """
    Write the features of a single encoded feature vector to ``stream``
    in megam's input syntax.  Every item written is preceded by a
    single space; no newline is written.

    :param vector: The encoded feature vector: ``(fid, fval)`` pairs.
    :param stream: The stream to write to.
    :param bernoulli: If true, write only the ids of features whose
        value is 1 (features with value 0 are skipped).  Otherwise
        write every feature as an id followed by its value.
    :raise ValueError: If ``vector`` is empty, or if ``bernoulli`` is
        true and some feature value is neither 0 nor 1.
    """
    if not vector:
        raise ValueError(
            'MEGAM classifier requires the use of an ' 'always-on feature.'
        )
    for (fid, fval) in vector:
        if bernoulli:
            if fval == 1:
                stream.write(' %s' % fid)
            elif fval != 0:
                raise ValueError(
                    'If bernoulli=True, then all features must be binary.'
                )
        else:
            # The id and its value must be separate tokens; without the
            # space, id 12 with value 3.5 would be written as "123.5".
            stream.write(' %s %s' % (fid, fval))

def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    If no binary has been configured yet (``_megam_bin`` is None),
    ``config_megam()`` is called first.

    :param args: The command-line arguments to pass to ``megam``, as a
        sequence of strings (not a single string).
    :return: Everything ``megam`` wrote to stdout, as text; byte output
        is decoded as UTF-8.
    :raise TypeError: If ``args`` is a single string.
    :raise OSError: If ``megam`` exits with a non-zero return code.
    """
    if isinstance(args, compat.string_types):
        raise TypeError('args should be a list of strings')
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess.  ``list(args)`` lets tuples and other
    # sequences be passed as well as lists.
    cmd = [_megam_bin] + list(args)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # Only stdout is piped, so the second element returned by
    # communicate() is always None; megam's stderr goes directly to this
    # process's stderr instead.
    stdout, _ = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        # There is no captured stderr to show (printing it would only
        # emit "None"); just separate the error from megam's own output.
        print()
        raise OSError('megam command failed!')

    if isinstance(stdout, compat.string_types):
        return stdout
    else:
        return stdout.decode('utf-8')