/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include <ctype.h>

/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not downgrade startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/

static int _PyCodecRegistry_Init(void); /* Forward */

/* Register a new codec search function.

   The registry is initialized lazily on first use.  search_function
   must be a non-NULL callable; it is appended to the interpreter's
   list of search functions.

   Returns 0 on success and -1 (with an exception set) on error. */

int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = PyThreadState_GET()->interp;

    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
        goto onError;
    if (search_function == NULL) {
        PyErr_BadArgument();
        goto onError;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        goto onError;
    }
    return PyList_Append(interp->codec_search_path, search_function);

 onError:
    return -1;
}

/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens.

   Returns a new unicode object, or NULL with an exception set if the
   string is too large, memory runs out, or the unicode object cannot
   be created. */

static
PyObject *normalizestring(const char *string)
{
    size_t i;
    size_t len = strlen(string);
    char *p;
    PyObject *v;

    if (len > PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "string is too large");
        return NULL;
    }

    p = PyMem_Malloc(len + 1);
    if (p == NULL)
        /* PyMem_Malloc() does not set an exception on failure, so
           raise MemoryError instead of returning a bare NULL. */
        return PyErr_NoMemory();
    for (i = 0; i < len; i++) {
        char ch = string[i];
        if (ch == ' ')
            ch = '-';
        else
            ch = tolower(Py_CHARMASK(ch));
        p[i] = ch;
    }
    p[i] = '\0';
    v = PyUnicode_FromString(p);
    /* Release the scratch buffer on both the success and the failure
       path; v is NULL (with an exception set) if the conversion failed. */
    PyMem_Free(p);
    return v;
}

/* Lookup the given encoding and return a tuple providing the codec
   facilities.

   The encoding string is looked up converted to all lower-case
   characters. This makes encodings looked up through this mechanism
   effectively case-insensitive.

If no codec is found, a LookupError is set and NULL returned. As side effect, this tries to load the encodings package, if not yet done. This is part of the lazy load strategy for the encodings package.*/PyObject*_PyCodec_Lookup(constchar*encoding){PyInterpreterState*interp;PyObject*result,*args=NULL,*v;Py_ssize_ti,len;if(encoding==NULL){PyErr_BadArgument();gotoonError;}interp=PyThreadState_GET()->interp;if(interp->codec_search_path==NULL&&_PyCodecRegistry_Init())gotoonError;/* Convert the encoding to a normalized Python string: all characters are converted to lower case, spaces and hyphens are replaced with underscores. */v=normalizestring(encoding);if(v==NULL)gotoonError;PyUnicode_InternInPlace(&v);/* First, try to lookup the name in the registry dictionary */result=PyDict_GetItem(interp->codec_search_cache,v);if(result!=NULL){Py_INCREF(result);Py_DECREF(v);returnresult;}/* Next, scan the search functions in order of registration */args=PyTuple_New(1);if(args==NULL)gotoonError;PyTuple_SET_ITEM(args,0,v);len=PyList_Size(interp->codec_search_path);if(len<0)gotoonError;if(len==0){PyErr_SetString(PyExc_LookupError,"no codec search functions registered: ""can't find encoding");gotoonError;}for(i=0;i<len;i++){PyObject*func;func=PyList_GetItem(interp->codec_search_path,i);if(func==NULL)gotoonError;result=PyEval_CallObject(func,args);if(result==NULL)gotoonError;if(result==Py_None){Py_DECREF(result);continue;}if(!PyTuple_Check(result)||PyTuple_GET_SIZE(result)!=4){PyErr_SetString(PyExc_TypeError,"codec search functions must return 4-tuples");Py_DECREF(result);gotoonError;}break;}if(i==len){/* XXX Perhaps we should cache misses too ? 
*/PyErr_Format(PyExc_LookupError,"unknown encoding: %s",encoding);gotoonError;}/* Cache and return the result */if(PyDict_SetItem(interp->codec_search_cache,v,result)<0){Py_DECREF(result);gotoonError;}Py_DECREF(args);returnresult;onError:Py_XDECREF(args);returnNULL;}staticPyObject*args_tuple(PyObject*object,constchar*errors){PyObject*args;args=PyTuple_New(1+(errors!=NULL));if(args==NULL)returnNULL;Py_INCREF(object);PyTuple_SET_ITEM(args,0,object);if(errors){PyObject*v;v=PyUnicode_FromString(errors);if(v==NULL){Py_DECREF(args);returnNULL;}PyTuple_SET_ITEM(args,1,v);}returnargs;}/* Helper function to get a codec item */staticPyObject*codec_getitem(constchar*encoding,intindex){PyObject*codecs;PyObject*v;codecs=_PyCodec_Lookup(encoding);if(codecs==NULL)returnNULL;v=PyTuple_GET_ITEM(codecs,index);Py_DECREF(codecs);Py_INCREF(v);returnv;}/* Helper function to create an incremental codec. */staticPyObject*codec_getincrementalcodec(constchar*encoding,constchar*errors,constchar*attrname){PyObject*codecs,*ret,*inccodec;codecs=_PyCodec_Lookup(encoding);if(codecs==NULL)returnNULL;inccodec=PyObject_GetAttrString(codecs,attrname);Py_DECREF(codecs);if(inccodec==NULL)returnNULL;if(errors)ret=PyObject_CallFunction(inccodec,"s",errors);elseret=PyObject_CallFunction(inccodec,NULL);Py_DECREF(inccodec);returnret;}/* Helper function to create a stream codec. */staticPyObject*codec_getstreamcodec(constchar*encoding,PyObject*stream,constchar*errors,constintindex){PyObject*codecs,*streamcodec,*codeccls;codecs=_PyCodec_Lookup(encoding);if(codecs==NULL)returnNULL;codeccls=PyTuple_GET_ITEM(codecs,index);if(errors!=NULL)streamcodec=PyObject_CallFunction(codeccls,"Os",stream,errors);elsestreamcodec=PyObject_CallFunction(codeccls,"O",stream);Py_DECREF(codecs);returnstreamcodec;}/* Convenience APIs to query the Codec registry. All APIs return a codec object with incremented refcount. 
*/PyObject*PyCodec_Encoder(constchar*encoding){returncodec_getitem(encoding,0);}PyObject*PyCodec_Decoder(constchar*encoding){returncodec_getitem(encoding,1);}PyObject*PyCodec_IncrementalEncoder(constchar*encoding,constchar*errors){returncodec_getincrementalcodec(encoding,errors,"incrementalencoder");}PyObject*PyCodec_IncrementalDecoder(constchar*encoding,constchar*errors){returncodec_getincrementalcodec(encoding,errors,"incrementaldecoder");}PyObject*PyCodec_StreamReader(constchar*encoding,PyObject*stream,constchar*errors){returncodec_getstreamcodec(encoding,stream,errors,2);}PyObject*PyCodec_StreamWriter(constchar*encoding,PyObject*stream,constchar*errors){returncodec_getstreamcodec(encoding,stream,errors,3);}/* Encode an object (e.g. an Unicode object) using the given encoding and return the resulting encoded object (usually a Python string). errors is passed to the encoder factory as argument if non-NULL. */PyObject*PyCodec_Encode(PyObject*object,constchar*encoding,constchar*errors){PyObject*encoder=NULL;PyObject*args=NULL,*result=NULL;PyObject*v=NULL;encoder=PyCodec_Encoder(encoding);if(encoder==NULL)gotoonError;args=args_tuple(object,errors);if(args==NULL)gotoonError;result=PyEval_CallObject(encoder,args);if(result==NULL)gotoonError;if(!PyTuple_Check(result)||PyTuple_GET_SIZE(result)!=2){PyErr_SetString(PyExc_TypeError,"encoder must return a tuple (object, integer)");gotoonError;}v=PyTuple_GET_ITEM(result,0);if(PyBytes_Check(v)){charmsg[100];PyOS_snprintf(msg,sizeof(msg),"encoder %s returned buffer instead of bytes",encoding);if(PyErr_WarnEx(PyExc_RuntimeWarning,msg,1)<0){v=NULL;gotoonError;}v=PyString_FromStringAndSize(PyBytes_AS_STRING(v),Py_Size(v));}elseif(PyString_Check(v))Py_INCREF(v);else{PyErr_SetString(PyExc_TypeError,"encoding must return a tuple(bytes, integer)");v=NULL;}/* We don't check or use the second (integer) entry. 
*/onError:Py_XDECREF(result);Py_XDECREF(args);Py_XDECREF(encoder);returnv;}/* Decode an object (usually a Python string) using the given encoding and return an equivalent object (e.g. an Unicode object). errors is passed to the decoder factory as argument if non-NULL. */PyObject*PyCodec_Decode(PyObject*object,constchar*encoding,constchar*errors){PyObject*decoder=NULL;PyObject*args=NULL,*result=NULL;PyObject*v;decoder=PyCodec_Decoder(encoding);if(decoder==NULL)gotoonError;args=args_tuple(object,errors);if(args==NULL)gotoonError;result=PyEval_CallObject(decoder,args);if(result==NULL)gotoonError;if(!PyTuple_Check(result)||PyTuple_GET_SIZE(result)!=2){PyErr_SetString(PyExc_TypeError,"decoder must return a tuple (object,integer)");gotoonError;}v=PyTuple_GET_ITEM(result,0);Py_INCREF(v);/* We don't check or use the second (integer) entry. */Py_DECREF(args);Py_DECREF(decoder);Py_DECREF(result);returnv;onError:Py_XDECREF(args);Py_XDECREF(decoder);Py_XDECREF(result);returnNULL;}/* Register the error handling callback function error under the name name. This function will be called by the codec when it encounters an unencodable characters/undecodable bytes and doesn't know the callback name, when name is specified as the error parameter in the call to the encode/decode function. Return 0 on success, -1 on error */intPyCodec_RegisterError(constchar*name,PyObject*error){PyInterpreterState*interp=PyThreadState_GET()->interp;if(interp->codec_search_path==NULL&&_PyCodecRegistry_Init())return-1;if(!PyCallable_Check(error)){PyErr_SetString(PyExc_TypeError,"handler must be callable");return-1;}returnPyDict_SetItemString(interp->codec_error_registry,(char*)name,error);}/* Lookup the error handling callback function registered under the name error. As a special case NULL can be passed, in which case the error handling callback for strict encoding will be returned. 
*/PyObject*PyCodec_LookupError(constchar*name){PyObject*handler=NULL;PyInterpreterState*interp=PyThreadState_GET()->interp;if(interp->codec_search_path==NULL&&_PyCodecRegistry_Init())returnNULL;if(name==NULL)name="strict";handler=PyDict_GetItemString(interp->codec_error_registry,(char*)name);if(!handler)PyErr_Format(PyExc_LookupError,"unknown error handler name '%.400s'",name);elsePy_INCREF(handler);returnhandler;}staticvoidwrong_exception_type(PyObject*exc){PyObject*type=PyObject_GetAttrString(exc,"__class__");if(type!=NULL){PyObject*name=PyObject_GetAttrString(type,"__name__");Py_DECREF(type);if(name!=NULL){PyErr_Format(PyExc_TypeError,"don't know how to handle %S in error callback",name);Py_DECREF(name);}}}PyObject*PyCodec_StrictErrors(PyObject*exc){if(PyExceptionInstance_Check(exc))PyErr_SetObject(PyExceptionInstance_Class(exc),exc);elsePyErr_SetString(PyExc_TypeError,"codec must pass exception instance");returnNULL;}PyObject*PyCodec_IgnoreErrors(PyObject*exc){Py_ssize_tend;if(PyObject_IsInstance(exc,PyExc_UnicodeEncodeError)){if(PyUnicodeEncodeError_GetEnd(exc,&end))returnNULL;}elseif(PyObject_IsInstance(exc,PyExc_UnicodeDecodeError)){if(PyUnicodeDecodeError_GetEnd(exc,&end))returnNULL;}elseif(PyObject_IsInstance(exc,PyExc_UnicodeTranslateError)){if(PyUnicodeTranslateError_GetEnd(exc,&end))returnNULL;}else{wrong_exception_type(exc);returnNULL;}/* ouch: passing NULL, 0, pos gives None instead of u'' 
*/returnPy_BuildValue("(u#n)",&end,0,end);}PyObject*PyCodec_ReplaceErrors(PyObject*exc){PyObject*restuple;Py_ssize_tstart;Py_ssize_tend;Py_ssize_ti;if(PyObject_IsInstance(exc,PyExc_UnicodeEncodeError)){PyObject*res;Py_UNICODE*p;if(PyUnicodeEncodeError_GetStart(exc,&start))returnNULL;if(PyUnicodeEncodeError_GetEnd(exc,&end))returnNULL;res=PyUnicode_FromUnicode(NULL,end-start);if(res==NULL)returnNULL;for(p=PyUnicode_AS_UNICODE(res),i=start;i<end;++p,++i)*p='?';restuple=Py_BuildValue("(On)",res,end);Py_DECREF(res);returnrestuple;}elseif(PyObject_IsInstance(exc,PyExc_UnicodeDecodeError)){Py_UNICODEres=Py_UNICODE_REPLACEMENT_CHARACTER;if(PyUnicodeDecodeError_GetEnd(exc,&end))returnNULL;returnPy_BuildValue("(u#n)",&res,1,end);}elseif(PyObject_IsInstance(exc,PyExc_UnicodeTranslateError)){PyObject*res;Py_UNICODE*p;if(PyUnicodeTranslateError_GetStart(exc,&start))returnNULL;if(PyUnicodeTranslateError_GetEnd(exc,&end))returnNULL;res=PyUnicode_FromUnicode(NULL,end-start);if(res==NULL)returnNULL;for(p=PyUnicode_AS_UNICODE(res),i=start;i<end;++p,++i)*p=Py_UNICODE_REPLACEMENT_CHARACTER;restuple=Py_BuildValue("(On)",res,end);Py_DECREF(res);returnrestuple;}else{wrong_exception_type(exc);returnNULL;}}PyObject*PyCodec_XMLCharRefReplaceErrors(PyObject*exc){if(PyObject_IsInstance(exc,PyExc_UnicodeEncodeError)){PyObject*restuple;PyObject*object;Py_ssize_tstart;Py_ssize_tend;PyObject*res;Py_UNICODE*p;Py_UNICODE*startp;Py_UNICODE*outp;intressize;if(PyUnicodeEncodeError_GetStart(exc,&start))returnNULL;if(PyUnicodeEncodeError_GetEnd(exc,&end))returnNULL;if(!(object=PyUnicodeEncodeError_GetObject(exc)))returnNULL;startp=PyUnicode_AS_UNICODE(object);for(p=startp+start,ressize=0;p<startp+end;++p){if(*p<10)ressize+=2+1+1;elseif(*p<100)ressize+=2+2+1;elseif(*p<1000)ressize+=2+3+1;elseif(*p<10000)ressize+=2+4+1;#ifndef Py_UNICODE_WIDEelseressize+=2+5+1;#elseelseif(*p<100000)ressize+=2+5+1;elseif(*p<1000000)ressize+=2+6+1;elseressize+=2+7+1;#endif}/* allocate replacement 
*/res=PyUnicode_FromUnicode(NULL,ressize);if(res==NULL){Py_DECREF(object);returnNULL;}/* generate replacement */for(p=startp+start,outp=PyUnicode_AS_UNICODE(res);p<startp+end;++p){Py_UNICODEc=*p;intdigits;intbase;*outp++='&';*outp++='#';if(*p<10){digits=1;base=1;}elseif(*p<100){digits=2;base=10;}elseif(*p<1000){digits=3;base=100;}elseif(*p<10000){digits=4;base=1000;}#ifndef Py_UNICODE_WIDEelse{digits=5;base=10000;}#elseelseif(*p<100000){digits=5;base=10000;}elseif(*p<1000000){digits=6;base=100000;}else{digits=7;base=1000000;}#endifwhile(digits-->0){*outp++='0'+c/base;c%=base;base/=10;}*outp++=';';}restuple=Py_BuildValue("(On)",res,end);Py_DECREF(res);Py_DECREF(object);returnrestuple;}else{wrong_exception_type(exc);returnNULL;}}staticPy_UNICODEhexdigits[]={'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};PyObject*PyCodec_BackslashReplaceErrors(PyObject*exc){if(PyObject_IsInstance(exc,PyExc_UnicodeEncodeError)){PyObject*restuple;PyObject*object;Py_ssize_tstart;Py_ssize_tend;PyObject*res;Py_UNICODE*p;Py_UNICODE*startp;Py_UNICODE*outp;intressize;if(PyUnicodeEncodeError_GetStart(exc,&start))returnNULL;if(PyUnicodeEncodeError_GetEnd(exc,&end))returnNULL;if(!(object=PyUnicodeEncodeError_GetObject(exc)))returnNULL;startp=PyUnicode_AS_UNICODE(object);for(p=startp+start,ressize=0;p<startp+end;++p){#ifdef Py_UNICODE_WIDEif(*p>=0x00010000)ressize+=1+1+8;else#endifif(*p>=0x100){ressize+=1+1+4;}elseressize+=1+1+2;}res=PyUnicode_FromUnicode(NULL,ressize);if(res==NULL)returnNULL;for(p=startp+start,outp=PyUnicode_AS_UNICODE(res);p<startp+end;++p){Py_UNICODEc=*p;*outp++='\\';#ifdef 
Py_UNICODE_WIDEif(c>=0x00010000){*outp++='U';*outp++=hexdigits[(c>>28)&0xf];*outp++=hexdigits[(c>>24)&0xf];*outp++=hexdigits[(c>>20)&0xf];*outp++=hexdigits[(c>>16)&0xf];*outp++=hexdigits[(c>>12)&0xf];*outp++=hexdigits[(c>>8)&0xf];}else#endifif(c>=0x100){*outp++='u';*outp++=hexdigits[(c>>12)&0xf];*outp++=hexdigits[(c>>8)&0xf];}else*outp++='x';*outp++=hexdigits[(c>>4)&0xf];*outp++=hexdigits[c&0xf];}restuple=Py_BuildValue("(On)",res,end);Py_DECREF(res);Py_DECREF(object);returnrestuple;}else{wrong_exception_type(exc);returnNULL;}}staticPyObject*strict_errors(PyObject*self,PyObject*exc){returnPyCodec_StrictErrors(exc);}staticPyObject*ignore_errors(PyObject*self,PyObject*exc){returnPyCodec_IgnoreErrors(exc);}staticPyObject*replace_errors(PyObject*self,PyObject*exc){returnPyCodec_ReplaceErrors(exc);}staticPyObject*xmlcharrefreplace_errors(PyObject*self,PyObject*exc){returnPyCodec_XMLCharRefReplaceErrors(exc);}staticPyObject*backslashreplace_errors(PyObject*self,PyObject*exc){returnPyCodec_BackslashReplaceErrors(exc);}staticint_PyCodecRegistry_Init(void){staticstruct{char*name;PyMethodDefdef;}methods[]={{"strict",{"strict_errors",strict_errors,METH_O}},{"ignore",{"ignore_errors",ignore_errors,METH_O}},{"replace",{"replace_errors",replace_errors,METH_O}},{"xmlcharrefreplace",{"xmlcharrefreplace_errors",xmlcharrefreplace_errors,METH_O}},{"backslashreplace",{"backslashreplace_errors",backslashreplace_errors,METH_O}}};PyInterpreterState*interp=PyThreadState_GET()->interp;PyObject*mod;unsignedi;if(interp->codec_search_path!=NULL)return0;interp->codec_search_path=PyList_New(0);interp->codec_search_cache=PyDict_New();interp->codec_error_registry=PyDict_New();if(interp->codec_error_registry){for(i=0;i<sizeof(methods)/sizeof(methods[0]);++i){PyObject*func=PyCFunction_New(&methods[i].def,NULL);intres;if(!func)Py_FatalError("can't initialize codec error registry");res=PyCodec_RegisterError(methods[i].name,func);Py_DECREF(func);if(res)Py_FatalError("can't initialize codec error 
registry");}}if(interp->codec_search_path==NULL||interp->codec_search_cache==NULL||interp->codec_error_registry==NULL)Py_FatalError("can't initialize codec registry");mod=PyImport_ImportModuleLevel("encodings",NULL,NULL,NULL,0);if(mod==NULL){if(PyErr_ExceptionMatches(PyExc_ImportError)){/* Ignore ImportErrors... this is done so that distributions can disable the encodings package. Note that other errors are not masked, e.g. SystemErrors raised to inform the user of an error in the Python configuration are still reported back to the user. */PyErr_Clear();return0;}return-1;}Py_DECREF(mod);return0;}