/* Tokenizer implementation */

#include "pgenheaders.h"

#include <ctype.h>

#include "tokenizer.h"
#include "errcode.h"

/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c) (c)
#else
#define Py_CHARMASK(c) ((c) & 0xff)
#endif

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names, indexed by token number */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};


/* Create and initialize a new tok_state structure.
   Returns NULL if the allocation fails.  All buffer pointers start out
   NULL; the caller is expected to attach an input source. */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->filename = NULL;
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    return tok;
}


/* Set up tokenizer for string.
   The tokenizer reads directly out of str (no copy is made here), so
   str must stay valid for as long as the tokenizer is in use.
   Returns NULL if the state cannot be allocated. */

struct tok_state *
PyTokenizer_FromString(char *str)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->end = tok->inp = str;
    return tok;
}


/* Set up tokenizer for file.
   fp is the stream to read; ps1 and ps2 are stored as the first and the
   follow-up prompt.  When ps1 is non-NULL, tok_nextc() obtains lines
   from PyOS_Readline() instead of calling fgets() on fp.
   Returns NULL if either allocation fails. */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
        PyMem_DEL(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}


/* Free a tok_state structure.
   The line buffer is released only when a file is attached; for a
   string tokenizer buf points into the caller's string and is left
   alone. */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_DEL(tok->buf);
    PyMem_DEL(tok);
}


/* Get next char, updating state; error code goes into tok->done.

   Returns the character as a nonnegative int, or EOF once tok->done is
   no longer E_OK.  When the current line is used up (cur == inp) a new
   line is fetched from one of three sources:
     - fp == NULL:     the in-memory string; inp is advanced past the
                       next '\n' (or to the terminating NUL);
     - prompt != NULL: PyOS_Readline(); the new line replaces the
                       buffer, or is appended to it when a token is in
                       progress (start != NULL);
     - otherwise:      fgets() on fp, growing the buffer by BUFSIZ until
                       a complete line has been read. */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: expose one more line of the string */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input */
            char *new = PyOS_Readline(tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (new == NULL)
                tok->done = E_INTR;
            else if (*new == '\0') {
                PyMem_FREE(new);
                tok->done = E_EOF;
            }
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line to
                   the existing buffer and re-base the pointers,
                   since the resize may move the buffer. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(new);
                char *buf = tok->buf;
                PyMem_RESIZE(buf, char, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_DEL(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(new);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                strcpy(tok->buf + oldlen, new);
                PyMem_FREE(new);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line simply
                   becomes the buffer. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_DEL(tok->buf);
                tok->buf = new;
                tok->cur = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* Plain file input */
            int done = 0;
            int cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = PyMem_NEW(char, BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok->fp) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    /* If the first byte read was a NUL the string
                       is empty and inp == buf; do not look at the
                       byte in front of the buffer. */
                    done = tok->inp > tok->buf &&
                           tok->inp[-1] == '\n';
                }
            }
            else {
                /* Token in progress: keep what is buffered and
                   let the loop below append to it. */
                cur = tok->cur - tok->buf;
                if (feof(tok->fp)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                int curstart = tok->start == NULL ? -1 :
                               tok->start - tok->buf;
                int curvalid = tok->inp - tok->buf;
                int newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                PyMem_RESIZE(newbuf, char, newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                             tok->buf + curstart;
                if (fgets(tok->inp,
                          (int)(tok->end - tok->inp),
                          tok->fp) == NULL) {
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                /* Same guard as above against an empty read */
                done = tok->inp > tok->buf &&
                       tok->inp[-1] == '\n';
            }
            tok->cur = tok->buf + cur;
#ifndef macintosh
            /* replace "\r\n" with "\n" */
            /* For Mac we leave the \r, giving a syntax error */
            /* Check the length first so that no pointer in front
               of the buffer is ever formed (e.g. for a line that
               is just "\n"). */
            if (tok->inp - tok->buf >= 2 && tok->inp[-2] == '\r') {
                pt = tok->inp - 2;
                *pt++ = '\n';
                *pt = '\0';
                tok->inp = pt;
            }
#endif
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}


/* Back-up one character.
   c must be the value most recently returned by tok_nextc(); EOF is
   ignored.  Backing up past the start of the buffer is a fatal error.
   The character is written back only if it differs from what is
   already stored at that position. */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: begin of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}


/* Return the token corresponding to a single character
   (OP if the character is not a known one-character token) */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':   return LPAR;
    case ')':   return RPAR;
    case '[':   return LSQB;
    case ']':   return RSQB;
    case ':':   return COLON;
    case ',':   return COMMA;
    case ';':   return SEMI;
    case '+':   return PLUS;
    case '-':   return MINUS;
    case '*':   return STAR;
    case '/':   return SLASH;
    case '|':   return VBAR;
    case '&':   return AMPER;
    case '<':   return LESS;
    case '>':   return GREATER;
    case '=':   return EQUAL;
    case '.':   return DOT;
    case '%':   return PERCENT;
    case '`':   return BACKQUOTE;
    case '{':   return LBRACE;
    case '}':   return RBRACE;
    case '^':   return CIRCUMFLEX;
    case '~':   return TILDE;
    default:    return OP;
    }
}


/* Return the token corresponding to a two-character sequence
   (OP if c1 c2 is not a known two-character token) */

int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':   return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':   return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':   return NOTEQUAL;
        case '=':   return LESSEQUAL;
        case '<':   return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':   return GREATEREQUAL;
        case '>':   return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':   return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':   return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*':   return DOUBLESTAR;
        case '=':   return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '=':   return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':   return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':   return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':   return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':   return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}


/* Return the token corresponding to a three-character sequence
   (OP if c1 c2 c3 is not a known three-character token).
   The known ones are "<<=", ">>=" and "**=". */

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        if (c2 == '<' && c3 == '=')
            return LEFTSHIFTEQUAL;
        break;
    case '>':
        if (c2 == '>' && c3 == '=')
            return RIGHTSHIFTEQUAL;
        break;
    case '*':
        if (c2 == '*' && c3 == '=')
            return DOUBLESTAREQUAL;
        break;
    }
    return OP;
}


/* Handle an inconsistency between the two indentation measurements.
   With tok->alterror set this is an error: tok->done becomes
   E_TABSPACE, the rest of the line is dropped and 1 is returned.
   Otherwise 0 is returned, after printing a warning once if
   tok->altwarning is set (the flag is cleared afterwards). */

static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        /* tok_new() leaves filename NULL and nothing in this file
           sets it, so never hand a NULL pointer to %s. */
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n",
                          tok->filename != NULL ? tok->filename
                                                : "<string>");
        tok->altwarning = 0;
    }
    return 0;
}


/* Get next token, after space stripping etc.

   Returns the token type.  For tokens that carry text (NAME, NUMBER,
   STRING, NEWLINE, operators and punctuation) *p_start and *p_end are
   set to delimit that text inside the tokenizer's buffer; for INDENT,
   DEDENT, ENDMARKER and ERRORTOKEN they are left NULL.  On ERRORTOKEN
   the reason is in tok->done. */

int
PyTokenizer_Get(register struct tok_state *tok, char **p_start,
                char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col is measured with the current tab size, altcol with the
           alternate tab size; a disagreement between the two is
           reported through indenterror(). */
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",       /* Emacs */
            ":tabstop=",        /* vim, full form */
            ":ts=",             /* vim, abbreviated form */
            "set tabsize=",     /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        /* Copy at most sizeof(cbuf)-1 characters of the comment so
           that it can be searched with strstr() */
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 tp - cbuf + 1 < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        /* Discard whatever is left of the comment */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        switch (c) {
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        return NEWLINE;
    }

#ifdef macintosh
    if (c == '\r') {
        PySys_WriteStderr(
          "File contains \\r characters (incorrect line endings?)\n");
        tok->done = E_TOKEN;
        tok->cur = tok->inp;
        return ERRORTOKEN;
    }
#endif
    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex or octal */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else {
                /* XXX This is broken!  E.g.,
                   09.9 should be accepted as float! */
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers.
                   XXX This accepts incomplete things like
                   XXX 12e or 1e+; worry run-time */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    while (isdigit(c)) {
                        c = tok_nextc(tok);
                    }
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                }
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the token length right after a quote character
           that immediately follows the opening quote; only there is
           a possible third opening quote looked for. */
        int quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
            }
            else if (c == EOF) {
                tok->done = E_TOKEN;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* The character after a backslash is consumed
                   without being examined, except for EOF */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_TOKEN;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_TOKEN;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* Maybe it even extends to a three-character token */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}


#ifdef Py_DEBUG

/* Print a token's name to stdout, followed by its text in parentheses
   for NAME, NUMBER, STRING and OP tokens.  type is used directly as an
   index into _PyParser_TokenNames. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif