def is_infinite(self):
    """
    Tell whether the length of this data source is indeterminate.

    The base implementation always answers ``False`` (i.e., the source is
    treated as finite).  Sources of indeterminate length (e.g., MQ) are
    expected to answer ``True`` instead.
    """
    return False

def preprocess(self, ent):
    """
    Hook that converts a loaded dict-like record into another dict-like
    record.

    This default implementation hands `ent` back unchanged.  Override it
    to apply custom processing; returning ``None`` causes the record to be
    skipped during iteration.
    """
    return ent

def __iter__(self):
    """
    Iterate over the records of the data source.

    Every entry produced by ``rows()`` is passed through ``preprocess()``;
    entries for which ``preprocess()`` gives ``None`` are dropped.
    """
    for raw in self.rows():
        record = self.preprocess(raw)
        if record is None:
            # Skipped record.
            continue
        yield record

def rows(self):
    """
    Produce each row of the data source as a flat dict-like object.

    This is an abstract hook: subclasses must override it.  An overriding
    implementation may yield ``None`` to have a record skipped.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()

class BaseSchema(object):
    """
    Schema defines data types for each key of the data.

    BaseSchema defines the fundamental 3 data types.

    - IGNORE: ignores the key (mainly intended for fallback)
    - AUTO: use the type of the key as its data type
    - INFER: guess the type of the key from its value; note that this is
      discouraged as it may result in unstable result.

    The Datum-related helpers below additionally refer to ``STRING``,
    ``NUMBER`` and ``BINARY`` type names that are not defined in this class
    body.  NOTE(review): presumably they are supplied by subclasses --
    confirm.
    """

    # Data Types: single-character type names are reserved by jubakit.
    # External subclasses must use 2+ characters for type names.
    IGNORE = '_'
    AUTO = '.'
    INFER = '?'

    def __init__(self, mapping, fallback=None):
        """
        Defines a Schema.  Schema is an immutable object and cannot be
        modified.

        `mapping` is a dict-like object that maps row keys to the data type.
        A value may also be a ``(type, name)`` tuple/list, which assigns an
        alias name to the key; this helps handling different loaders with
        the same configuration.

        `fallback` is the data type applied to row keys that are missing
        from `mapping` (``None`` means such keys are rejected on transform).
        """
        self._fallback = fallback
        self._key2type, self._key2name = BaseSchema._normalize_mapping(mapping)

    @staticmethod
    def _normalize_mapping(mapping):
        """
        Normalizes the schema mapping.

        Returns a ``(key2type, key2name)`` pair of dicts.  Keys declared
        without an alias are named after themselves.
        """
        key2type = {}
        key2name = {}
        for key, ent in mapping.items():
            if isinstance(ent, (tuple, list)):
                key_type, key_name = ent
            else:
                key_type, key_name = ent, key
            key2type[key] = key_type
            key2name[key] = key_name
        return key2type, key2name

    @staticmethod
    def _get_unique_mapping(mapping, fallback, key_type, name, optional=False):
        """
        Validates the schema key uniqueness and returns the single key whose
        type is `key_type`.  This is an utility method for subclasses.

        `name` is only used to build error messages.  When `optional` is
        true, ``None`` is returned if no key has the type.

        Raises RuntimeError when `key_type` is used as `fallback`, when a
        required key is missing, or when more than one key has the type.
        """
        if fallback == key_type:
            raise RuntimeError('{0} key cannot be specified as fallback in schema'.format(name))

        key2type, _ = BaseSchema._normalize_mapping(mapping)
        matches = [key for key, declared in key2type.items() if declared == key_type]
        if not matches:
            if optional:
                return None
            raise RuntimeError('{0} key must be specified in schema'.format(name))
        if len(matches) > 1:
            raise RuntimeError('{0} key must be an unique key in schema'.format(name))
        return matches[0]

    def __repr__(self):
        summary = {
            'keys': self._key2name,
            'types': self._key2type,
            'fallback_type': self._fallback,
        }
        return '<jubakit: Schema {0}>'.format(str(summary))

    def transform(self, row):
        """
        Transforms the row (represented in dict-like object) as Datum.
        Subclasses that define their own data types should override this
        method and handle them.
        """
        return self._transform_as_datum(row)

    def _add_to_datum(self, d, t, k, v):
        """
        Add value `v` whose type and name are `t` and `k` resp. to Datum `d`.

        ``None`` values are silently skipped.  AUTO / INFER types are
        resolved through `_predict_type` and then added recursively.

        Raises RuntimeError when `t` is not a known type.
        """
        if v is None:
            return

        if t == self.STRING:
            if isinstance(v, bytes):
                v = v.decode()
            if isinstance(v, bool):
                # We avoid unicode_t(v), which results in string constant
                # "True" / "False", as the default configuration of STRING
                # features is set to unigram.
                v = '1' if v else '0'
            d.add_string(k, unicode_t(v))
        elif t == self.NUMBER:
            # Empty unicode/bytes values cannot be cast to float; treat them as NA.
            if isinstance(v, (unicode_t, bytes)) and len(v) == 0:
                return
            d.add_number(k, float(v))
        elif t == self.BINARY:
            d.add_binary(k, v)
        elif t == self.AUTO or t == self.INFER:
            # AUTO trusts the Python type as-is; INFER may also convert.
            pred_type, pred_v = self._predict_type(v, t == self.AUTO)
            _logger.debug('key %s predicted as type %s', k, pred_type)
            self._add_to_datum(d, pred_type, k, pred_v)
        elif t == self.IGNORE:
            pass
        else:
            raise RuntimeError('invalid type {0} for key {1}'.format(t, k))

    def _transform_as_datum(self, row, d=None, skip_keys=()):
        """
        Transforms the row as Datum.  If the original Datum `d` is
        specified, feature vectors will be added to it.

        Keys contained in `skip_keys` are not added.  The default is an
        immutable empty tuple so that no mutable object is shared between
        calls.

        Raises RuntimeError when a key is neither declared in the schema
        nor covered by a fallback type.
        """
        if d is None:
            d = jubatus.common.Datum()

        for key, value in row.items():
            if key in skip_keys:
                continue
            key_type = self._key2type.get(key, self._fallback)
            key_name = self._key2name.get(key, key)
            if key_type is None:
                raise RuntimeError('schema does not match: unknown key {0}'.format(key))
            self._add_to_datum(d, key_type, key_name, value)

        return d

    @classmethod
    def _predict_type(cls, v, typed):
        """
        Predicts a data type for the given data and returns a
        ``(type, value)`` pair.  If `typed` is True, no type conversion
        will be tried against `v`.

        Raises ValueError when the type of `v` is not supported.
        """
        if isinstance(v, bool):
            # isinstance(True, int) returns True; so it should be checked first.
            return (cls.STRING, '1' if v else '0')

        if isinstance(v, (int, long_t, float)):
            return (cls.NUMBER, v)

        if isinstance(v, unicode_t):
            if not typed:
                try:
                    return (cls.NUMBER, float(v))
                except ValueError:
                    pass
            return (cls.STRING, v)

        if isinstance(v, bytes):
            if not typed:
                try:
                    return (cls.NUMBER, float(v))
                except ValueError:
                    pass
            try:
                return (cls.STRING, v.decode())
            except UnicodeDecodeError:
                pass
            # Bytes that cannot be decoded are kept as binary.
            return (cls.BINARY, v)

        raise ValueError('cannot detect data type of {0}: {1}'.format(type(v), v))

class BaseDataset(object):
    """
    Dataset is an abstract representation of set of data.
    """

    def __init__(self, loader, schema=None, static=None, _data=None):
        """
        Defines a new dataset.  Datasets are immutable and cannot be
        modified.

        Data will be loaded from the given `loader` using `schema`.
        When `static` is set to True (which is the default for non-infinite
        loaders), data will be loaded on memory immediately; otherwise data
        will be loaded one-by-one from `loader`, which may be better when
        processing a large dataset.  For "infinite" loaders (like MQ and
        Twitter stream), `static` cannot be set to True.

        Note that some features (e.g., index access) are not available for
        non-static datasets, which may be needed for some features like
        cross-validation etc.

        `_data` is for internal use: when given, it is adopted as the
        already-loaded records and nothing is read from `loader`.

        Raises RuntimeError when an infinite loader is requested to be
        static.
        """
        self._loader = loader
        self._schema = schema

        # ``_index`` and ``_buffer`` hold the current cursor position and
        # the current "raw" (i.e. value loaded from Loader) row content
        # currently being iterated.  ``_index`` is -1 while not iterating.
        self._index = -1
        self._buffer = None

        if static is None:
            # Non-infinite loaders are static by default.
            static = not loader.is_infinite()
        self._static = static

        # `_data` is internally used to create a shallow subset of the Dataset.
        if _data is not None:
            self._data = _data
            return  # the data is already loaded
        self._data = []

        if static:
            if loader.is_infinite():
                # Infinite data sources (e.g., MQ) cannot be loaded statically on memory.
                raise RuntimeError('infinite loaders cannot be staticized')

            # Load all data entries.
            _logger.info('loading all records from loader %s', loader)
            for row in loader:
                # Predict schema.
                if self._schema is None:
                    self._schema = self._predict(row)
                self._data.append(row)
            _logger.info('records loaded (%d entries)', len(self._data))

            # Don't hold a ref to the loader for static datasets.
            self._loader = None

    @staticmethod
    def _is_iterable(obj):
        """
        Returns True if `obj` is an iterable.

        The ``Iterable`` ABC lives in ``collections.abc``; its alias in
        ``collections`` was removed in Python 3.10, so the alias is only
        used as a fallback for interpreters without ``collections.abc``.
        """
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable
        return isinstance(obj, Iterable)

    @classmethod
    def _predict(cls, row):
        """
        Predict the Schema for the given row using the corresponding Schema
        class.  Subclasses must override this method.
        """
        # e.g.: return GenericSchema.predict(row, False)
        raise NotImplementedError()

    def convert(self, func):
        """
        Applies the given callable (which is expected to perform batch
        pre-processing like `shuffle`) to the whole data entries and
        returns a new immutable Dataset.

        Raises RuntimeError for non-static datasets or when `func` returns
        a non-iterable.
        """
        if not self._static:
            raise RuntimeError('non-static datasets cannot be converted')
        new_data = func(self._data)
        if not self._is_iterable(new_data):
            raise RuntimeError('convert function returned non-iterable: {0}'.format(new_data.__class__))
        return self.__class__(self._loader, self._schema, True, new_data)

    def get(self, idx):
        """
        Returns the raw entry loaded by Loader.

        Raises RuntimeError when a non-static dataset is accessed by an
        index other than the one currently being iterated.
        """
        # For convenience, even non-static datasets can access the raw
        # record for the index that is currently being iterated.  The
        # cursor is -1 while idle, so it must not be mistaken for a
        # request for the last record (idx == -1).
        if self._index >= 0 and idx == self._index:
            return self._buffer
        if not self._static:
            raise RuntimeError('non-static datasets cannot be random accessed by index')
        return self._data[idx]

    def __len__(self):
        """
        Returns the number of entries.

        Raises RuntimeError for non-static datasets.
        """
        if not self._static:
            raise RuntimeError('length of non-static datasets cannot be retrieved')
        return len(self._data)

    def __getitem__(self, index):
        """
        Returns row(s) at the position `index`.  `index` can be an iterable
        (like numpy array) or just an int.

        An int gives the transformed row; a slice or an iterable of
        indices gives a new Dataset that shares the selected raw rows.
        """
        # For convenience, even non-static datasets can access the record
        # for the index that is currently being iterated (never matches
        # while idle, see ``get``).
        if isinstance(index, int) and self._index >= 0 and index == self._index:
            return self._schema.transform(self._buffer)

        if not self._static:
            raise RuntimeError('non-static datasets cannot be random accessed by index')

        if isinstance(index, slice):
            return self.__class__(self._loader, self._schema, True, self._data[index])
        if self._is_iterable(index):
            subdata = [self._data[i] for i in index]
            return self.__class__(self._loader, self._schema, True, subdata)
        return self._schema.transform(self._data[index])

    def __repr__(self):
        if self._static:
            return '<jubakit: Static Dataset {0} records>'.format(len(self._data))
        return '<jubakit: Non-static Dataset>'

    def __iter__(self):
        """
        Iteratively access each transformed rows, yielding
        ``(index, transformed_row)`` pairs.  ``None`` rows are skipped and
        do not consume an index.
        """
        try:
            source = self._data if self._static else self._loader
            self._index = 0
            for row in source:
                if row is None:
                    # May contain None in self._data if Dataset.convert is used.
                    continue

                # Predict schema (for non-static Datasets)
                if self._schema is None:
                    self._schema = self._predict(row)

                self._buffer = row
                yield (self._index, self._schema.transform(row))
                self._index += 1
        finally:
            # Reset the cursor; the loader reference is released as well
            # once the iteration ends.
            self._index = -1
            self._buffer = None
            self._loader = None

class BaseService(object):
    """
    Service provides an interface to machine learning features.
    """

    def __init__(self, host='127.0.0.1', port=9199, cluster='', timeout=0):
        """
        Creates a new service that connects to the existing server
        reachable at `host`:`port`.  `cluster` and `timeout` are handed to
        the client (and shell) when they are created.
        """
        self._host, self._port = host, port
        self._cluster, self._timeout = cluster, timeout
        # Set by ``run``: embedded flag and the backend started by jubakit.
        self._embedded = False
        self._backend = None

    def __del__(self):
        # Invoke the backend destructor as fast as possible.
        self._backend = None

    @classmethod
    def _client_class(cls):
        """
        Subclasses must override this method and return the client class.
        """
        # e.g.: return jubatus.classifier.client.Classifier
        raise NotImplementedError()

    @classmethod
    def _embedded_class(cls):
        """
        Subclasses must override this method and return the embedded class.
        """
        # e.g.: return jubatus.embedded.Classifier
        raise NotImplementedError()

    @classmethod
    def run(cls, config, port=None, embedded=False):
        """
        Runs a new standalone server or embedded instance and returns the
        service instance bound to it.
        """
        if embedded:
            engine = _ServiceBackendEmbedded(cls._embedded_class(), config)
            instance = cls()
            instance._backend = engine
            instance._embedded = True
            return instance

        server = _ServiceBackend(cls.name(), config, port)
        _logger.info('service %s started on port %d', cls.name(), server.port)
        instance = cls('127.0.0.1', server.port)
        instance._backend = server
        return instance

    def _client(self):
        # Embedded services talk to the in-process model directly;
        # otherwise a client is built from the connection settings.
        if not self._embedded:
            client_factory = self._client_class()
            return client_factory(self._host, self._port, self._cluster, self._timeout)
        return self._backend.model

    def _shell(self, **kwargs):
        # Builds a JubaShell bound to this service; extra keyword
        # arguments are forwarded to JubaShell.
        if self._embedded:
            raise RuntimeError('embedded service does not support shell')
        return JubaShell(
            host=self._host,
            port=self._port,
            cluster=self._cluster,
            service=self.name(),
            timeout=self._timeout,
            keepalive=True,
            **kwargs
        )

    def save(self, name, path=None):
        """
        Saves the model using `name`.

        `path` is meant to copy the saved model file to a local path, but
        it is currently not used (see the TODO below).
        """
        client = self._client()
        client.save(name)
        _logger.info('model saved: %s', name)

        # TODO copy source from `jubafetch` and make path option work.

    def load(self, name, path=None):
        """
        Loads the model using `name`.

        `path` is meant to copy the model file from a local path to the
        remote location, but it is currently not used (see the TODO below).

        Raises RuntimeError when the client reports that loading failed.
        """
        succeeded = self._client().load(name)
        if not succeeded:
            raise RuntimeError('failed to load model: {0}'.format(name))
        _logger.info('model loaded: %s', name)

        # TODO copy source from `jubafetch` and make path option work.

    def get_status(self):
        """
        Returns the status of this server.
        In distributed mode, returns statuses of all members.
        For embedded services, the backend status is reported under the
        single key ``127.0.0.1_0``.
        """
        if not self._embedded:
            return self._client().get_status()
        return {'127.0.0.1_0': self._backend.get_status()}

    def shell(self, **kwargs):
        """
        Starts an interactive shell session for this service.
        """
        session = self._shell(**kwargs)
        session.interact()

    def __repr__(self):
        if self._embedded:
            return '<jubakit: Embedded Service ({0})>'.format(self.name())
        origin = ', started by jubakit' if self._backend else ''
        return '<jubakit: RPC Service ({0}) [{1}@{2}:{3}]{4}>'.format(
            self.name(), self._cluster, self._host, self._port, origin)

class _ServiceBackendEmbedded(object):
    """
    Backend that holds an in-process (embedded) model instance.
    """

    def __init__(self, clazz, config):
        # The model is created by calling `clazz` with `config`.
        model = clazz(config)
        self.model = model

    def stop(self):
        # Nothing to do for this backend.
        return None

    def get_status(self):
        """
        get_status API is not supported in embedded service; an empty dict
        is always returned.
        """
        return dict()

class BaseConfig(dict):
    """
    Config is a convenient class to build new config.
    """

    def __init__(self, *args, **kwargs):
        """
        Creates a new Config filled with the default configuration.

        Positional and keyword arguments are accepted but are not used by
        this implementation.
        """
        parent = super(BaseConfig, self)
        parent.__init__(self)
        # Let the (sub)class populate this instance with its defaults.
        self._default(self)

    @classmethod
    def _default(cls, cfg):
        """
        Initializes the given config (dict-like) with the default
        configuration.  Subclasses must override this method.
        """
        raise NotImplementedError()

class GenericConfig(BaseConfig):
    """
    GenericConfig is a base Config class for generic services that have
    `converter`, `method` and `parameter` in its config data.
    """

    # Empty skeleton of the `converter` section.
    _CONVERTER_TEMPLATE = {
        'string_filter_types': {},
        'string_filter_rules': [],
        'num_filter_types': {},
        'num_filter_rules': [],
        'string_types': {},
        'string_rules': [],
        'num_types': {},
        'num_rules': [],
        'binary_types': {},
        'binary_rules': [],
    }

    @classmethod
    def _default(cls, cfg):
        # Reset `cfg`, then store every default section that is not None.
        cfg.clear()
        chosen_method = cls._default_method()
        sections = (
            ('method', chosen_method),
            ('parameter', cls._default_parameter(chosen_method)),
            ('converter', cls._default_converter()),
        )
        for section, content in sections:
            if content is not None:
                cfg[section] = content

    @classmethod
    def _default_method(cls):
        """
        Subclasses must override this method and return the preferred
        default method.
        """
        # e.g.: return 'AROW'
        raise NotImplementedError()

    @classmethod
    def _default_parameter(cls, method):
        """
        Subclasses must override this method and return the preferred
        default parameter set for the specified method.  Return `None` if
        the method does not require `parameter` block.
        """
        # e.g.: return {'regularization_weight': 0.1}
        raise NotImplementedError()

    @classmethod
    def _default_converter(cls):
        """
        Returns a default converter contents: a copy of the template with
        uni/bi/tri-gram string types, a catch-all unigram string rule and
        a catch-all numeric rule.
        """
        converter = copy.deepcopy(cls._CONVERTER_TEMPLATE)

        ngram_names = ('unigram', 'bigram', 'trigram')
        converter['string_types'] = dict(
            (label, {'method': 'ngram', 'char_num': str(width)})
            for width, label in enumerate(ngram_names, 1)
        )

        catch_all_string = {
            'key': '*',
            'type': 'unigram',
            'sample_weight': 'tf',
            'global_weight': 'idf',
        }
        converter['string_rules'] = [catch_all_string]
        converter['num_rules'] = [{'key': '*', 'type': 'num'}]
        return converter

    @classmethod
    def methods(cls):
        """
        Subclasses must override this method and return methods available
        for this service.
        """
        # e.g.: return ['perceptron', 'PA', 'AROW']
        raise NotImplementedError()

    def clear_converter(self):
        """
        Initialize the `converter` section of the config with an empty
        template.
        """
        blank = copy.deepcopy(self._CONVERTER_TEMPLATE)
        self['converter'] = blank