/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.0 data base.

   Data was extracted from the Unicode 3.0 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"

/* character properties */

typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

/* Look up the property record for the first code unit of v.

   The lookup is a two-level table walk: index1 selects a page by the
   high bits of the code, index2 selects the record within that page.
   Codes outside 0..65535 fall back to record 0. */
static const _PyUnicode_DatabaseRecord *
_getrecord(PyUnicodeObject *v)
{
    int ch = (int) *PyUnicode_AS_UNICODE(v);
    int slot = 0;

    if (ch >= 0 && ch < 65536) {
        slot = index1[ch >> SHIFT];
        slot = index2[(slot << SHIFT) + (ch & ((1 << SHIFT) - 1))];
    }

    return &_PyUnicode_Database_Records[slot];
}

/* --- Module API --------------------------------------------------------- */

/* decimal(unichr[, default])

   Return the decimal value of a one-character unicode string as an int.
   If the character has no decimal value, return a new reference to
   default when given, otherwise raise ValueError.  A unicode argument
   whose length is not 1 raises TypeError. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:decimal",
                          &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(ch));
    if (value < 0) {
        /* negative result means "no decimal value" */
        if (fallback != NULL) {
            Py_INCREF(fallback);
            return fallback;
        }
        PyErr_SetString(PyExc_ValueError, "not a decimal");
        return NULL;
    }

    return PyInt_FromLong(value);
}

/* digit(unichr[, default])

   Return the digit value of a one-character unicode string as an int.
   If the character has no digit value, return a new reference to
   default when given, otherwise raise ValueError.  A unicode argument
   whose length is not 1 raises TypeError. */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:digit",
                          &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(ch));
    if (value < 0) {
        /* negative result means "no digit value" */
        if (fallback != NULL) {
            Py_INCREF(fallback);
            return fallback;
        }
        PyErr_SetString(PyExc_ValueError, "not a digit");
        return NULL;
    }

    return PyInt_FromLong(value);
}

/* numeric(unichr[, default])

   Return the numeric value of a one-character unicode string as a
   float.  If the lookup yields a negative value, return a new reference
   to default when given, otherwise raise ValueError.  A unicode
   argument whose length is not 1 raises TypeError.

   NOTE(review): every negative result is treated as "not numeric";
   confirm that Py_UNICODE_TONUMERIC never yields a legitimate negative
   value for any character in the database. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    double value;

    if (!PyArg_ParseTuple(args, "O!|O:numeric",
                          &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(ch));
    if (value < 0) {
        if (fallback != NULL) {
            Py_INCREF(fallback);
            return fallback;
        }
        PyErr_SetString(PyExc_ValueError, "not a numeric character");
        return NULL;
    }

    return PyFloat_FromDouble(value);
}

/* category(unichr)

   Return the entry of _PyUnicode_CategoryNames selected by the
   character's database record, as a string.  A unicode argument whose
   length is not 1 raises TypeError. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:category", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    record = _getrecord(ch);
    return PyString_FromString(
        _PyUnicode_CategoryNames[(int) record->category]);
}

/* bidirectional(unichr)

   Return the entry of _PyUnicode_BidirectionalNames selected by the
   character's database record, as a string.  A unicode argument whose
   length is not 1 raises TypeError. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:bidirectional", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    record = _getrecord(ch);
    return PyString_FromString(
        _PyUnicode_BidirectionalNames[(int) record->bidirectional]);
}

/* combining(unichr)

   Return the combining class stored in the character's database record
   as an int.  A unicode argument whose length is not 1 raises
   TypeError. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:combining", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    record = _getrecord(ch);
    return PyInt_FromLong((int) record->combining);
}

/* mirrored(unichr)

   Return the mirrored flag stored in the character's database record
   as an int.  A unicode argument whose length is not 1 raises
   TypeError. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:mirrored", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    record = _getrecord(ch);
    return PyInt_FromLong((int) record->mirrored);
}

/* decomposition(unichr)

   Return the character's decomposition mapping as a string: an optional
   prefix taken from decomp_prefix followed by space-separated
   four-digit (or longer) upper-case hex codes.  A unicode argument
   whose length is not 1 raises TypeError. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    char decomp[256];
    const char *prefix;
    int code, slot, remaining, pos;

    if (!PyArg_ParseTuple(args, "O!:decomposition", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(ch);

    /* two-level table walk; codes outside 0..65535 use entry 0 */
    slot = 0;
    if (code >= 0 && code < 65536) {
        slot = decomp_index1[code >> DECOMP_SHIFT];
        slot = decomp_index2[(slot << DECOMP_SHIFT) +
                             (code & ((1 << DECOMP_SHIFT) - 1))];
    }

    /* decomp_data[slot] is a header word: the high byte is the number
       of code entries that follow it (usually one or two), the low
       byte is an index into decomp_prefix */
    remaining = decomp_data[slot] >> 8;
    prefix = decomp_prefix[decomp_data[slot] & 255];

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* copy prefix (without its terminator) */
    pos = strlen(prefix);
    memcpy(decomp, prefix, pos);

    /* append each following entry as hex, space-separated */
    for (; remaining > 0; remaining--) {
        if (pos)
            decomp[pos++] = ' ';
        assert((size_t)pos < sizeof(decomp));
        slot++;
        PyOS_snprintf(decomp + pos, sizeof(decomp) - pos, "%04X",
                      decomp_data[slot]);
        pos += strlen(decomp + pos);
    }

    decomp[pos] = '\0';

    return PyString_FromString(decomp);
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by
Tools/unicode/makeunicodedata.py */#include "unicodename_db.h"/* -------------------------------------------------------------------- *//* database code (cut and pasted from the unidb package) */staticunsignedlong_gethash(constchar*s,intlen,intscale){inti;unsignedlongh=0;unsignedlongix;for(i=0;i<len;i++){h=(h*scale)+(unsignedchar)toupper(s[i]);ix=h&0xff000000;if(ix)h=(h^((ix>>24)&0xff))&0x00ffffff;}returnh;}staticint_getname(Py_UCS4code,char*buffer,intbuflen){intoffset;inti;intword;unsignedchar*w;if(code>=65536)return0;/* get offset into phrasebook */offset=phrasebook_offset1[(code>>phrasebook_shift)];offset=phrasebook_offset2[(offset<<phrasebook_shift)+(code&((1<<phrasebook_shift)-1))];if(!offset)return0;i=0;for(;;){/* get word index */word=phrasebook[offset]-phrasebook_short;if(word>=0){word=(word<<8)+phrasebook[offset+1];offset+=2;}elseword=phrasebook[offset++];if(i){if(i>buflen)return0;/* buffer overflow */buffer[i++]=' ';}/* copy word string from lexicon. the last character in the word has bit 7 set. the last word in a string ends with 0x80 */w=lexicon+lexicon_offset[word];while(*w<128){if(i>=buflen)return0;/* buffer overflow */buffer[i++]=*w++;}if(i>=buflen)return0;/* buffer overflow */buffer[i++]=*w&127;if(*w==128)break;/* end of word */}return1;}staticint_cmpname(intcode,constchar*name,intnamelen){/* check if code corresponds to the given name */inti;charbuffer[NAME_MAXLEN];if(!_getname(code,buffer,sizeof(buffer)))return0;for(i=0;i<namelen;i++){if(toupper(name[i])!=buffer[i])return0;}returnbuffer[namelen]=='\0';}staticint_getcode(constchar*name,intnamelen,Py_UCS4*code){unsignedinth,v;unsignedintmask=code_size-1;unsignedinti,incr;/* the following is the same as python's dictionary lookup, with only minor changes. 
see the makeunicodedata script for more details */h=(unsignedint)_gethash(name,namelen,code_magic);i=(~h)&mask;v=code_hash[i];if(!v)return0;if(_cmpname(v,name,namelen)){*code=v;return1;}incr=(h^(h>>3))&mask;if(!incr)incr=mask;for(;;){i=(i+incr)&mask;v=code_hash[i];if(!v)return0;if(_cmpname(v,name,namelen)){*code=v;return1;}incr=incr<<1;if(incr>mask)incr=incr^code_poly;}}staticconst_PyUnicode_Name_CAPIhashAPI={sizeof(_PyUnicode_Name_CAPI),_getname,_getcode};/* -------------------------------------------------------------------- *//* Python bindings */staticPyObject*unicodedata_name(PyObject*self,PyObject*args){charname[NAME_MAXLEN];PyUnicodeObject*v;PyObject*defobj=NULL;if(!PyArg_ParseTuple(args,"O!|O:name",&PyUnicode_Type,&v,&defobj))returnNULL;if(PyUnicode_GET_SIZE(v)!=1){PyErr_SetString(PyExc_TypeError,"need a single Unicode character as parameter");returnNULL;}if(!_getname((Py_UCS4)*PyUnicode_AS_UNICODE(v),name,sizeof(name))){if(defobj==NULL){PyErr_SetString(PyExc_ValueError,"no such name");returnNULL;}else{Py_INCREF(defobj);returndefobj;}}returnPy_BuildValue("s",name);}staticPyObject*unicodedata_lookup(PyObject*self,PyObject*args){Py_UCS4code;Py_UNICODEstr[1];char*name;intnamelen;if(!PyArg_ParseTuple(args,"s#:lookup",&name,&namelen))returnNULL;if(!_getcode(name,namelen,&code)){PyErr_SetString(PyExc_KeyError,"undefined character name");returnNULL;}str[0]=(Py_UNICODE)code;returnPyUnicode_FromUnicode(str,1);}/* XXX Add doc strings. 
*/

/* module-level functions; every entry takes a positional argument
   tuple */
static PyMethodDef unicodedata_functions[] = {
    {"decimal",       unicodedata_decimal,       METH_VARARGS},
    {"digit",         unicodedata_digit,         METH_VARARGS},
    {"numeric",       unicodedata_numeric,       METH_VARARGS},
    {"category",      unicodedata_category,      METH_VARARGS},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining",     unicodedata_combining,     METH_VARARGS},
    {"mirrored",      unicodedata_mirrored,      METH_VARARGS},
    {"decomposition", unicodedata_decomposition, METH_VARARGS},
    {"name",          unicodedata_name,          METH_VARARGS},
    {"lookup",        unicodedata_lookup,        METH_VARARGS},
    {NULL, NULL}      /* sentinel */
};

static char *unicodedata_docstring = "unicode character database";

/* Module initialisation: create the "unicodedata" module and publish
   the name-lookup function table in its dictionary under the key
   "ucnhash_CAPI".  Returns silently if any step fails. */
DL_EXPORT(void)
initunicodedata(void)
{
    PyObject *module;
    PyObject *namespace;
    PyObject *capi;

    module = Py_InitModule3("unicodedata", unicodedata_functions,
                            unicodedata_docstring);
    if (module == NULL)
        return;

    namespace = PyModule_GetDict(module);
    if (namespace == NULL)
        return;

    /* Export C API */
    capi = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (capi == NULL)
        return;

    PyDict_SetItemString(namespace, "ucnhash_CAPI", capi);
    Py_DECREF(capi);    /* drop our own reference to the wrapper */
}