def set_field(self, field_name, data):
    """Set property into the Dataset.

    Parameters
    ----------
    field_name : string
        The field name of the information.
    data : list, numpy array or None
        The array of data to be set.
    """
    if self.handle is None:
        raise Exception("Cannot set %s before construct dataset" % field_name)
    if data is None:
        # Passing a NULL pointer clears the field on the C side.
        _safe_call(_LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            None,
            ctypes.c_int(0),
            ctypes.c_int(FIELD_TYPE_MAPPER[field_name])))
        return
    # Choose the dtype expected by the C API for this field:
    # 'group' boundaries are int32, 'init_score' is float64, everything else float32.
    dtype = np.float32
    if field_name == 'group':
        dtype = np.int32
    elif field_name == 'init_score':
        dtype = np.float64
    data = list_to_1d_numpy(data, dtype, name=field_name)
    if data.dtype == np.float32 or data.dtype == np.float64:
        ptr_data, type_data, _ = c_float_array(data)
    elif data.dtype == np.int32:
        ptr_data, type_data, _ = c_int_array(data)
    else:
        # fixed wording: "Excepted ... meet" -> "Expected ... met"
        raise TypeError("Expected np.float32/64 or np.int32, met type({})".format(data.dtype))
    if type_data != FIELD_TYPE_MAPPER[field_name]:
        raise TypeError("Input type error for set_field")
    _safe_call(_LIB.LGBM_DatasetSetField(
        self.handle,
        c_str(field_name),
        ptr_data,
        ctypes.c_int(len(data)),
        ctypes.c_int(type_data)))

def get_field(self, field_name):
    """Get property from the Dataset.

    Parameters
    ----------
    field_name : string
        The field name of the information.

    Returns
    -------
    info : numpy array
        A numpy array with information from the Dataset.
    """
    if self.handle is None:
        raise Exception("Cannot get %s before construct Dataset" % field_name)
    out_len = ctypes.c_int()
    out_type = ctypes.c_int()
    field_ptr = ctypes.POINTER(ctypes.c_void_p)()
    _safe_call(_LIB.LGBM_DatasetGetField(
        self.handle,
        c_str(field_name),
        ctypes.byref(out_len),
        ctypes.byref(field_ptr),
        ctypes.byref(out_type)))
    # sanity-check that the C side reports the dtype we expect for this field
    if out_type.value != FIELD_TYPE_MAPPER[field_name]:
        raise TypeError("Return type error for get_field")
    if out_len.value == 0:
        return None
    # copy the C buffer into a numpy array of the matching dtype
    if out_type.value == C_API_DTYPE_INT32:
        return cint32_array_to_numpy(
            ctypes.cast(field_ptr, ctypes.POINTER(ctypes.c_int32)), out_len.value)
    if out_type.value == C_API_DTYPE_FLOAT32:
        return cfloat32_array_to_numpy(
            ctypes.cast(field_ptr, ctypes.POINTER(ctypes.c_float)), out_len.value)
    if out_type.value == C_API_DTYPE_FLOAT64:
        return cfloat64_array_to_numpy(
            ctypes.cast(field_ptr, ctypes.POINTER(ctypes.c_double)), out_len.value)
    raise TypeError("Unknown type")

def set_categorical_feature(self, categorical_feature):
    """Set categorical features.

    Parameters
    ----------
    categorical_feature : list of int or strings
        Names or indices of categorical features.
    """
    if self.categorical_feature == categorical_feature:
        return
    if self.data is not None:
        if self.categorical_feature is None:
            # nothing was set before; take the new value and rebuild the handle
            self.categorical_feature = categorical_feature
            self._free_handle()
        elif categorical_feature == 'auto':
            # keep the explicitly-set features; 'auto' does not override them
            warnings.warn('Using categorical_feature in Dataset.')
        else:
            # fixed wording: "overrided" -> "overridden"
            warnings.warn('categorical_feature in Dataset is overridden. New categorical_feature is {}'.format(sorted(list(categorical_feature))))
            self.categorical_feature = categorical_feature
            self._free_handle()
    else:
        raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")

def _set_predictor(self, predictor):
    """Set predictor for continued training.

    Not recommended for user to call this function.
    Please set ``init_model`` in ``engine.train`` or ``engine.cv``.
    """
    if predictor is self._predictor:
        return
    if self.data is None:
        raise LightGBMError("Cannot set predictor after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
    # raw data still available: store the predictor and force a re-construct
    self._predictor = predictor
    self._free_handle()

def set_reference(self, reference):
    """Set reference Dataset.

    Parameters
    ----------
    reference : Dataset
        Reference that is used as a template to construct the current Dataset.
    """
    # propagate the reference's settings first
    self.set_categorical_feature(reference.categorical_feature)
    self.set_feature_name(reference.feature_name)
    self._set_predictor(reference._predictor)
    # we're done if self and reference share a common upstream reference
    if self.get_ref_chain().intersection(reference.get_ref_chain()):
        return
    if self.data is None:
        raise LightGBMError("Cannot set reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
    self.reference = reference
    self._free_handle()

def set_label(self, label):
    """Set label of Dataset.

    Parameters
    ----------
    label : list, numpy array or None
        The label information to be set into Dataset.
    """
    self.label = label
    # push into the constructed C Dataset only when a handle exists
    if self.handle is not None:
        label = list_to_1d_numpy(label, name='label')
        self.set_field('label', label)

def set_weight(self, weight):
    """Set weight of each instance.

    Parameters
    ----------
    weight : list, numpy array or None
        Weight to be set for each data point.
    """
    # all-ones weights are the default; drop them to avoid storing a no-op field
    if weight is not None and np.all(weight == 1):
        weight = None
    self.weight = weight
    if self.handle is not None and weight is not None:
        weight = list_to_1d_numpy(weight, name='weight')
        self.set_field('weight', weight)

def set_group(self, group):
    """Set group size of Dataset (used for ranking).

    Parameters
    ----------
    group : list, numpy array or None
        Group size of each group.
    """
    self.group = group
    if self.handle is not None and group is not None:
        # group sizes are int32 on the C side
        group = list_to_1d_numpy(group, np.int32, name='group')
        self.set_field('group', group)

def get_label(self):
    """Get the label of the Dataset.

    Returns
    -------
    label : numpy array
        The label information from the Dataset.
    """
    # lazily fetch from the C Dataset and cache on first access
    if self.label is None:
        self.label = self.get_field('label')
    return self.label

def get_weight(self):
    """Get the weight of the Dataset.

    Returns
    -------
    weight : numpy array
        Weight for each data point from the Dataset.
    """
    # lazily fetch from the C Dataset and cache on first access
    if self.weight is None:
        self.weight = self.get_field('weight')
    return self.weight

def get_group(self):
    """Get the group of the Dataset.

    Returns
    -------
    group : numpy array
        Group size of each group.
    """
    if self.group is None:
        self.group = self.get_field('group')
        if self.group is not None:
            # LightGBM returns cumulative group *boundaries*;
            # convert them to per-group sizes via adjacent differences
            self.group = [self.group[i + 1] - self.group[i]
                          for i in range_(len(self.group) - 1)]
    return self.group

def num_data(self):
    """Get the number of rows in the Dataset.

    Returns
    -------
    number_of_rows : int
        The number of rows in the Dataset.
    """
    if self.handle is None:
        raise LightGBMError("Cannot get num_data before construct dataset")
    out = ctypes.c_int()
    _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
                                           ctypes.byref(out)))
    return out.value

def num_feature(self):
    """Get the number of columns (features) in the Dataset.

    Returns
    -------
    number_of_columns : int
        The number of columns (features) in the Dataset.
    """
    if self.handle is None:
        raise LightGBMError("Cannot get num_feature before construct dataset")
    out = ctypes.c_int()
    _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
                                              ctypes.byref(out)))
    return out.value

def get_ref_chain(self, ref_limit=100):
    """Get a chain of Dataset objects.

    Starts with r, then goes to r.reference (if exists),
    then to r.reference.reference, etc.
    until we hit ``ref_limit`` or a reference loop.

    Parameters
    ----------
    ref_limit : int, optional (default=100)
        The limit number of references.

    Returns
    -------
    ref_chain : set of Dataset
        Chain of references of the Datasets.
    """
    ref_chain = set()
    head = self
    while len(ref_chain) < ref_limit:
        # stop on anything that is not a Dataset (e.g. a None reference)
        if not isinstance(head, Dataset):
            break
        ref_chain.add(head)
        nxt = head.reference
        # stop at the end of the chain or when a loop is detected
        if nxt is None or nxt in ref_chain:
            break
        head = nxt
    return ref_chain

def update(self, train_set=None, fobj=None):
    """Update Booster for one iteration.

    Parameters
    ----------
    train_set : Dataset or None, optional (default=None)
        Training data.
        If None, last training data is used.
    fobj : callable or None, optional (default=None)
        Customized objective function.
        For multi-class task, the score is grouped by class_id first, then by row_id.
        If you want to get i-th row score in j-th class, the access way is
        score[j * num_data + i] and you should group grad and hess in this way as well.

    Returns
    -------
    is_finished : bool
        Whether the update was successfully finished.
    """
    # need reset training data
    if train_set is not None and train_set is not self.train_set:
        if not isinstance(train_set, Dataset):
            raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
        # continued-training models must keep using the predictor the
        # Booster was initialized with, otherwise scores are inconsistent
        if train_set._predictor is not self.__init_predictor:
            raise LightGBMError("Replace training data failed, you should use same predictor for these data")
        self.train_set = train_set
        _safe_call(_LIB.LGBM_BoosterResetTrainingData(
            self.handle,
            self.train_set.construct().handle))
        # cached predictions for the old training data are now stale
        self.__inner_predict_buffer[0] = None
    is_finished = ctypes.c_int(0)
    if fobj is None:
        # built-in objective path: the objective must not have been nulled out
        if self.__set_objective_to_none:
            raise ValueError('Cannot update due to null objective function.')
        _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
            self.handle,
            ctypes.byref(is_finished)))
        # new trees were added, so all per-dataset prediction caches are stale
        self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
        return is_finished.value == 1
    else:
        # custom objective path: switch the C objective to "none" once,
        # then boost with user-supplied gradient/hessian
        if not self.__set_objective_to_none:
            self.reset_parameter({"objective": "none"})
            self.__set_objective_to_none = True
        grad, hess = fobj(self.__inner_predict(0), self.train_set)
        return self.__boost(grad, hess)

def __boost(self, grad, hess):
    """Boost the Booster for one iteration with customized gradient statistics.

    Note: for multi-class task, the score is grouped by class_id first, then by row_id.
    If you want to get i-th row score in j-th class, the access way is
    score[j * num_data + i] and you should group grad and hess in this way as well.

    Parameters
    ----------
    grad : 1d numpy array or 1d list
        The first order of gradient.
    hess : 1d numpy array or 1d list
        The second order of gradient.

    Returns
    -------
    is_finished : bool
        Whether the boost was successfully finished.
    """
    grad = list_to_1d_numpy(grad, name='gradient')
    hess = list_to_1d_numpy(hess, name='hessian')
    # the C API reads the raw buffers, so they must be contiguous
    assert grad.flags.c_contiguous
    assert hess.flags.c_contiguous
    if len(grad) != len(hess):
        raise ValueError("Lengths of gradient({}) and hessian({}) don't match".format(len(grad), len(hess)))
    is_finished = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
        self.handle,
        grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.byref(is_finished)))
    # a tree was added; invalidate all cached per-dataset predictions
    self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
    return is_finished.value == 1

def rollback_one_iter(self):
    """Rollback one iteration."""
    _safe_call(_LIB.LGBM_BoosterRollbackOneIter(self.handle))
    # the model changed, so cached predictions are no longer valid
    self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]

def current_iteration(self):
    """Get the index of the current iteration.

    Returns
    -------
    cur_iter : int
        The index of the current iteration.
    """
    cur_iter = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(self.handle,
                                                    ctypes.byref(cur_iter)))
    return cur_iter.value

def save_model(self, filename, num_iteration=-1):
    """Save Booster to file.

    Parameters
    ----------
    filename : string
        Filename to save Booster.
    num_iteration : int, optional (default=-1)
        Index of the iteration that should be saved.
        If <0, the best iteration (if exists) is saved.
    """
    if num_iteration <= 0:
        num_iteration = self.best_iteration
    _safe_call(_LIB.LGBM_BoosterSaveModel(
        self.handle,
        ctypes.c_int(num_iteration),
        c_str(filename)))
    # persist pandas categorical mappings next to the model file
    _save_pandas_categorical(filename, self.pandas_categorical)

def _load_model_from_string(self, model_str, verbose=True):
    """[Private] Load model from string."""
    # release any existing C booster and its buffers before reloading
    if self.handle is not None:
        _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        self._free_buffer()
    self.handle = ctypes.c_void_p()
    out_num_iterations = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
        c_str(model_str),
        ctypes.byref(out_num_iterations),
        ctypes.byref(self.handle)))
    out_num_class = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterGetNumClasses(
        self.handle,
        ctypes.byref(out_num_class)))
    if verbose:
        print('Finished loading model, total used %d iterations' % (int(out_num_iterations.value)))
    self.__num_class = out_num_class.value

def _save_model_to_string(self, num_iteration=-1):
    """[Private] Save model to string."""
    if num_iteration <= 0:
        num_iteration = self.best_iteration
    # first attempt with a 1 MiB buffer; the call reports the needed size
    buffer_len = 1 << 20
    tmp_out_len = ctypes.c_int64(0)
    string_buffer = ctypes.create_string_buffer(buffer_len)
    ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer))
    _safe_call(_LIB.LGBM_BoosterSaveModelToString(
        self.handle,
        ctypes.c_int(num_iteration),
        ctypes.c_int64(buffer_len),
        ctypes.byref(tmp_out_len),
        ptr_string_buffer))
    actual_len = tmp_out_len.value
    # if the buffer was not long enough, re-allocate and call again
    if actual_len > buffer_len:
        string_buffer = ctypes.create_string_buffer(actual_len)
        ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer))
        _safe_call(_LIB.LGBM_BoosterSaveModelToString(
            self.handle,
            ctypes.c_int(num_iteration),
            ctypes.c_int64(actual_len),
            ctypes.byref(tmp_out_len),
            ptr_string_buffer))
    return string_buffer.value.decode()

def dump_model(self, num_iteration=-1):
    """Dump Booster to json format.

    Parameters
    ----------
    num_iteration : int, optional (default=-1)
        Index of the iteration that should be dumped.
        If <0, the best iteration (if exists) is dumped.

    Returns
    -------
    json_repr : dict
        Json format of Booster.
    """
    if num_iteration <= 0:
        num_iteration = self.best_iteration
    # first attempt with a 1 MiB buffer; the call reports the needed size
    buffer_len = 1 << 20
    tmp_out_len = ctypes.c_int64(0)
    string_buffer = ctypes.create_string_buffer(buffer_len)
    ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer))
    _safe_call(_LIB.LGBM_BoosterDumpModel(
        self.handle,
        ctypes.c_int(num_iteration),
        ctypes.c_int64(buffer_len),
        ctypes.byref(tmp_out_len),
        ptr_string_buffer))
    actual_len = tmp_out_len.value
    # if the buffer was not long enough, re-allocate and call again
    if actual_len > buffer_len:
        string_buffer = ctypes.create_string_buffer(actual_len)
        ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer))
        _safe_call(_LIB.LGBM_BoosterDumpModel(
            self.handle,
            ctypes.c_int(num_iteration),
            ctypes.c_int64(actual_len),
            ctypes.byref(tmp_out_len),
            ptr_string_buffer))
    return json.loads(string_buffer.value.decode())

def get_leaf_output(self, tree_id, leaf_id):
    """Get the output of a leaf.

    Parameters
    ----------
    tree_id : int
        The index of the tree.
    leaf_id : int
        The index of the leaf in the tree.

    Returns
    -------
    result : float
        The output of the leaf.
    """
    leaf_value = ctypes.c_double(0)
    _safe_call(_LIB.LGBM_BoosterGetLeafValue(
        self.handle,
        ctypes.c_int(tree_id),
        ctypes.c_int(leaf_id),
        ctypes.byref(leaf_value)))
    return leaf_value.value

def _to_predictor(self, pred_parameter=None):
    """Convert this Booster to a predictor sharing the same C handle."""
    predictor = _InnerPredictor(booster_handle=self.handle,
                                pred_parameter=pred_parameter)
    # carry over pandas categorical mappings so predictions decode correctly
    predictor.pandas_categorical = self.pandas_categorical
    return predictor

def num_feature(self):
    """Get number of features.

    Returns
    -------
    num_feature : int
        The number of features.
    """
    out = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterGetNumFeature(self.handle,
                                              ctypes.byref(out)))
    return out.value

def feature_name(self):
    """Get names of features.

    Returns
    -------
    result : list
        List with names of features.
    """
    num_feature = self.num_feature()
    # Get name of features: one fixed-size C string buffer per feature
    out_len = ctypes.c_int(0)
    buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)]
    ptrs = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, buffers))
    _safe_call(_LIB.LGBM_BoosterGetFeatureNames(
        self.handle,
        ctypes.byref(out_len),
        ptrs))
    if num_feature != out_len.value:
        raise ValueError("Length of feature names doesn't equal with num_feature")
    return [buffers[i].value.decode() for i in range_(num_feature)]

def feature_importance(self, importance_type='split', iteration=-1):
    """Get feature importances.

    Parameters
    ----------
    importance_type : string, optional (default="split")
        How the importance is calculated.
        If "split", result contains numbers of times the feature is used in a model.
        If "gain", result contains total gains of splits which use the feature.

    Returns
    -------
    result : numpy array
        Array with feature importances.
    """
    # map the name to the C API's enum; unknown names fall through as -1
    if importance_type == "split":
        importance_type_int = 0
    elif importance_type == "gain":
        importance_type_int = 1
    else:
        importance_type_int = -1
    num_feature = self.num_feature()
    result = np.zeros(num_feature, dtype=np.float64)
    _safe_call(_LIB.LGBM_BoosterFeatureImportance(
        self.handle,
        ctypes.c_int(iteration),
        ctypes.c_int(importance_type_int),
        result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
    # split counts are whole numbers; return them as ints
    if importance_type_int == 0:
        return result.astype(int)
    return result

def __inner_eval(self, data_name, data_idx, feval=None):
    """Evaluate training or validation data."""
    if data_idx >= self.__num_dataset:
        raise ValueError("Data_idx should be smaller than number of dataset")
    self.__get_eval_info()
    ret = []
    # built-in metrics first
    if self.__num_inner_eval > 0:
        result = np.zeros(self.__num_inner_eval, dtype=np.float64)
        out_len = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetEval(
            self.handle,
            ctypes.c_int(data_idx),
            ctypes.byref(out_len),
            result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        if out_len.value != self.__num_inner_eval:
            raise ValueError("Wrong length of eval results")
        for i in range_(self.__num_inner_eval):
            ret.append((data_name,
                        self.__name_inner_eval[i],
                        result[i],
                        self.__higher_better_inner_eval[i]))
    # then the user-supplied metric, if any
    if feval is not None:
        cur_data = self.train_set if data_idx == 0 else self.valid_sets[data_idx - 1]
        feval_ret = feval(self.__inner_predict(data_idx), cur_data)
        if isinstance(feval_ret, list):
            for eval_name, val, is_higher_better in feval_ret:
                ret.append((data_name, eval_name, val, is_higher_better))
        else:
            eval_name, val, is_higher_better = feval_ret
            ret.append((data_name, eval_name, val, is_higher_better))
    return ret

def __inner_predict(self, data_idx):
    """Predict for training and validation dataset."""
    if data_idx >= self.__num_dataset:
        raise ValueError("Data_idx should be smaller than number of dataset")
    # lazily allocate one float64 buffer per dataset
    if self.__inner_predict_buffer[data_idx] is None:
        if data_idx == 0:
            n_preds = self.train_set.num_data() * self.__num_class
        else:
            n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
        self.__inner_predict_buffer[data_idx] = np.zeros(n_preds, dtype=np.float64)
    # avoid predicting many times in one iteration
    if not self.__is_predicted_cur_iter[data_idx]:
        out_len = ctypes.c_int64(0)
        data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(
            ctypes.POINTER(ctypes.c_double))
        _safe_call(_LIB.LGBM_BoosterGetPredict(
            self.handle,
            ctypes.c_int(data_idx),
            ctypes.byref(out_len),
            data_ptr))
        if out_len.value != len(self.__inner_predict_buffer[data_idx]):
            raise ValueError("Wrong length of predict results for data %d" % (data_idx))
        self.__is_predicted_cur_iter[data_idx] = True
    return self.__inner_predict_buffer[data_idx]

def __get_eval_info(self):
    """Get inner evaluation count and names."""
    if self.__need_reload_eval_info:
        self.__need_reload_eval_info = False
        # Get num of inner evals
        out_num_eval = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
            self.handle,
            ctypes.byref(out_num_eval)))
        self.__num_inner_eval = out_num_eval.value
        if self.__num_inner_eval > 0:
            # Get name of evals
            out_len = ctypes.c_int(0)
            buffers = [ctypes.create_string_buffer(255)
                       for i in range_(self.__num_inner_eval)]
            ptrs = (ctypes.c_char_p * self.__num_inner_eval)(
                *map(ctypes.addressof, buffers))
            _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                self.handle,
                ctypes.byref(out_len),
                ptrs))
            if self.__num_inner_eval != out_len.value:
                raise ValueError("Length of eval names doesn't equal with num_evals")
            self.__name_inner_eval = [buffers[i].value.decode()
                                      for i in range_(self.__num_inner_eval)]
            # metrics whose names start with these prefixes are higher-is-better
            self.__higher_better_inner_eval = [
                name.startswith(('auc', 'ndcg@', 'map@'))
                for name in self.__name_inner_eval]

def attr(self, key):
    """Get attribute string from the Booster.

    Parameters
    ----------
    key : string
        The name of the attribute.

    Returns
    -------
    value : string or None
        The attribute value.
        Returns None if attribute do not exist.
    """
    # attributes live in a plain dict; missing keys yield None
    return self.__attr.get(key, None)

def set_attr(self, **kwargs):
    """Set the attribute of the Booster.

    Parameters
    ----------
    **kwargs
        The attributes to set.
        Setting a value to None deletes an attribute.
    """
    for key, value in kwargs.items():
        # None means "delete"; anything else must be a string
        if value is None:
            self.__attr.pop(key, None)
            continue
        if not isinstance(value, string_type):
            raise ValueError("Set attr only accepts strings")
        self.__attr[key] = value