def predict(self, X):
    """
    Produces a list of most likely class labels as determined by the fine-tuned model.

    :param X: A list / array of text, shape [batch]
    :returns: list of class labels.
    """
    # Documents longer than max_length are encoded as overlapping chunks;
    # successive chunks advance by one third of the usable window.
    chunk_size = self.config.max_length - 2
    step_size = chunk_size // 3

    encoded_chunks = list(
        itertools.chain.from_iterable(
            self.input_pipeline._text_to_ids([text]) for text in X
        )
    )

    chunk_labels, chunk_probas = [], []
    for pred in self._inference(X, mode=None):
        chunk_labels.append(
            self.input_pipeline.label_encoder.inverse_transform(pred[PredictMode.NORMAL])
        )
        chunk_probas.append(pred[PredictMode.PROBAS])

    all_subseqs, all_labels, all_probs = [], [], []
    doc_idx = -1

    for chunk_idx, (label_seq, proba_seq) in enumerate(zip(chunk_labels, chunk_probas)):
        position_seq = encoded_chunks[chunk_idx].char_locs
        start_of_doc = encoded_chunks[chunk_idx].token_ids[0][0] == ENCODER.start
        end_of_doc = (
            chunk_idx + 1 >= len(encoded_chunks)
            or encoded_chunks[chunk_idx + 1].token_ids[0][0] == ENCODER.start
        )

        # Overlapping chunks each contribute only their middle third, except
        # that the first chunk of a document keeps its head and the last
        # keeps its tail.  Dividers fall at `step_size` increments:
        # [  1  |  1  |  2  |  3  |  3  ]
        start = 0 if start_of_doc else step_size
        end = None if end_of_doc else step_size * 2

        if start_of_doc:
            # First chunk of a new document: reset the per-document accumulators.
            doc_subseqs, doc_labels, doc_probs = [], [], []
            doc_idx += 1
            start_of_token = 0

        for label, position, proba in zip(
            label_seq[start:end], position_seq[start:end], proba_seq[start:end]
        ):
            if position == -1:
                # Padding / special tokens carry no character location.
                continue
            if not doc_subseqs or label != doc_labels[-1]:
                # No open subsequence, or the label changed: start a new one.
                doc_subseqs.append(X[doc_idx][start_of_token:position])
                doc_labels.append(label)
                doc_probs.append([proba])
            else:
                # Same label as the running subsequence: extend it.
                doc_subseqs[-1] += X[doc_idx][start_of_token:position]
                doc_probs[-1].append(proba)
            start_of_token = position

        if end_of_doc:
            # Last chunk of a document: average the per-token probabilities of
            # each subsequence and expose them as {class: probability} dicts.
            prob_dicts = []
            for prob_seq in doc_probs:
                probs = np.mean(np.vstack(prob_seq), axis=0)
                prob_dicts.append(
                    dict(zip(self.input_pipeline.label_encoder.classes_, probs))
                )
                if self.multi_label:
                    del prob_dicts[-1][self.config.pad_token]
            all_subseqs.append(doc_subseqs)
            all_labels.append(doc_labels)
            all_probs.append(prob_dicts)

    _, doc_annotations = finetune_to_indico_sequence(
        raw_texts=X,
        subseqs=all_subseqs,
        labels=all_labels,
        probs=all_probs,
        subtoken_predictions=self.config.subtoken_predictions,
    )
    return doc_annotations

def featurize(self, X):
    """
    Embeds inputs in learned feature space. Can be called before or after calling :meth:`finetune`.

    :param X: An iterable of lists or array of text, shape [batch, n_inputs, tokens]
    :returns: np.array of features of shape (n_examples, embedding_size).
    """
    # Delegate to the shared base-model featurization routine.
    features = self._featurize(X)
    return features