{-# LANGUAGE BangPatterns #-}{-# OPTIONS_GHC -O #-}-- We always optimise this, otherwise performance of a non-optimised-- compiler is severely affected-- --------------------------------------------------------------------------------- (c) The University of Glasgow, 1997-2006---- Character encodings---- -----------------------------------------------------------------------------moduleEncoding(-- * UTF-8utf8DecodeChar#,utf8PrevChar,utf8CharStart,utf8DecodeChar,utf8DecodeString,utf8EncodeChar,utf8EncodeString,utf8EncodedLength,countUTF8Chars,-- * Z-encodingzEncodeString,zDecodeString)where#include "HsVersions.h"importForeignimportData.CharimportNumericimportGHC.Ptr(Ptr(..))importGHC.Base-- ------------------------------------------------------------------------------- UTF-8-- We can't write the decoder as efficiently as we'd like without-- resorting to unboxed extensions, unfortunately. I tried to write-- an IO version of this function, but GHC can't eliminate boxed-- results from an IO-returning function.---- We assume we can ignore overflow when parsing a multibyte character here.-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences-- before decoding them (see StringBuffer.hs).{-# INLINE utf8DecodeChar# #-}utf8DecodeChar#::Addr#->(#Char#,Addr##)utf8DecodeChar#a#=let!ch0=word2Int#(indexWord8OffAddr#a#0#)incase()of_|ch0<=#0x7F#->(#chr#ch0,a#`plusAddr#`1##)|ch0>=#0xC0#&&ch0<=#0xDF#->let!ch1=word2Int#(indexWord8OffAddr#a#1#)inifch1<#0x80#||ch1>=#0xC0#thenfail1#else(#chr#(((ch0-#0xC0#)`uncheckedIShiftL#`6#)+#(ch1-#0x80#)),a#`plusAddr#`2##)|ch0>=#0xE0#&&ch0<=#0xEF#->let!ch1=word2Int#(indexWord8OffAddr#a#1#)inifch1<#0x80#||ch1>=#0xC0#thenfail1#elselet!ch2=word2Int#(indexWord8OffAddr#a#2#)inifch2<#0x80#||ch2>=#0xC0#thenfail2#else(#chr#(((ch0-#0xE0#)`uncheckedIShiftL#`12#)+#((ch1-#0x80#)`uncheckedIShiftL#`6#)+#(ch2-#0x80#)),a#`plusAddr#`3##)|ch0>=#0xF0#&&ch0<=#0xF8#->let!ch1=word2Int#(indexWord8OffAddr#a#1#)inifch1<#0x80#||ch1>=#0xC0#thenfail1#elselet!ch2=word2Int#(indexWord8OffAddr#a#2#)inifch2<#0x80#||ch2>=#0xC0#thenfail2#elselet!ch3=word2Int#(indexWord8OffAddr#a#3#)inifch3<#0x80#||ch3>=#0xC0#thenfail3#else(#chr#(((ch0-#0xF0#)`uncheckedIShiftL#`18#)+#((ch1-#0x80#)`uncheckedIShiftL#`12#)+#((ch2-#0x80#)`uncheckedIShiftL#`6#)+#(ch3-#0x80#)),a#`plusAddr#`4##)|otherwise->fail1#where-- all invalid sequences end up here:failn=(#'\0'#,a#`plusAddr#`n#)-- '\xFFFD' would be the usual replacement character, but-- that's a valid symbol in Haskell, so will result in a-- confusing parse error later on. Instead we use '\0' which-- will signal a lexer error immediately.utf8DecodeChar::PtrWord8->(Char,PtrWord8)utf8DecodeChar(Ptra#)=caseutf8DecodeChar#a#of(#c#,b##)->(C#c#,Ptrb#)-- UTF-8 is cleverly designed so that we can always figure out where-- the start of the current character is, given any position in a-- stream. This function finds the start of the previous character,-- assuming there *is* a previous character.utf8PrevChar::PtrWord8->IO(PtrWord8)utf8PrevCharp=utf8CharStart(p`plusPtr`(-1))utf8CharStart::PtrWord8->IO(PtrWord8)utf8CharStartp=gopwheregop=dow<-peekpifw>=0x80&&w<0xC0thengo(p`plusPtr`(-1))elsereturnputf8DecodeString::PtrWord8->Int->IO[Char]STRICT2(utf8DecodeString)utf8DecodeString(Ptra#)(I#len#)=unpacka#where!end#=addr2Int#(a#`plusAddr#`len#)unpackp#|addr2Int#p#>=#end#=return[]|otherwise=caseutf8DecodeChar#p#of(#c#,q##)->dochs<-unpackq#return(C#c#:chs)countUTF8Chars::PtrWord8->Int->IOIntcountUTF8Charsptrbytes=goptr0whereend=ptr`plusPtr`bytesSTRICT2(go)goptrn|ptr>=end=returnn|otherwise=docaseutf8DecodeChar#(unPtrptr)of(#_,a#)->go(Ptra)(n+1)unPtr::Ptra->Addr#unPtr(Ptra)=autf8EncodeChar::Char->PtrWord8->IO(PtrWord8)utf8EncodeCharcptr=letx=ordcincase()of_|x>0&&x<=0x007f->dopokeptr(fromIntegralx)return(ptr`plusPtr`1)-- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we-- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).|x<=0x07ff->dopokeptr(fromIntegral(0xC0.|.((x`shiftR`6).&.0x1F)))pokeElemOffptr1(fromIntegral(0x80.|.(x.&.0x3F)))return(ptr`plusPtr`2)|x<=0xffff->dopokeptr(fromIntegral(0xE0.|.(x`shiftR`12).&.0x0F))pokeElemOffptr1(fromIntegral(0x80.|.(x`shiftR`6).&.0x3F))pokeElemOffptr2(fromIntegral(0x80.|.(x.&.0x3F)))return(ptr`plusPtr`3)|otherwise->dopokeptr(fromIntegral(0xF0.|.(x`shiftR`18)))pokeElemOffptr1(fromIntegral(0x80.|.((x`shiftR`12).&.0x3F)))pokeElemOffptr2(fromIntegral(0x80.|.((x`shiftR`6).&.0x3F)))pokeElemOffptr3(fromIntegral(0x80.|.(x.&.0x3F)))return(ptr`plusPtr`4)utf8EncodeString::PtrWord8->String->IO()utf8EncodeStringptrstr=goptrstrwhereSTRICT2(go)go_[]=return()goptr(c:cs)=doptr'<-utf8EncodeCharcptrgoptr'csutf8EncodedLength::String->Intutf8EncodedLengthstr=go0strwhereSTRICT2(go)gon[]=ngon(c:cs)|ordc>0&&ordc<=0x007f=go(n+1)cs|ordc<=0x07ff=go(n+2)cs|ordc<=0xffff=go(n+3)cs|otherwise=go(n+4)cs-- ------------------------------------------------------------------------------- The Z-encoding{-
This is the main name-encoding and decoding function. It encodes any
string into a string that is acceptable as a C name. This is done
right before we emit a symbol name into the compiled C or asm code.
Z-encoding of strings is cached in the FastString interface, so we
never encode the same string more than once.
The basic encoding scheme is this.
* Tuples (,,,) are coded as Z3T
* Alphabetic characters (upper and lower) and digits
all translate to themselves;
except 'Z', which translates to 'ZZ'
and 'z', which translates to 'zz'
We need both so that we can preserve the variable/tycon distinction
* Most other printable characters translate to 'zx' or 'Zx' for some
alphabetic character x
* The others translate as 'znnnU' where 'nnn' is the decimal number
of the character
Before After
--------------------------
Trak Trak
foo_wib foozuwib
> zg
>1 zg1
foo# foozh
foo## foozhzh
foo##1 foozhzh1
fooZ fooZZ
:+ ZCzp
() Z0T 0-tuple
(,,,,) Z5T 5-tuple
(# #) Z1H unboxed 1-tuple (note the space)
(#,,,,#) Z5H unboxed 5-tuple
(NB: There is no Z1T nor Z0H.)
-}typeUserString=String-- As the user typed ittypeEncodedString=String-- Encoded formzEncodeString::UserString->EncodedStringzEncodeStringcs=casemaybe_tuplecsofJustn->n-- Tuples go to Z2T etcNothing->gocswherego[]=[]go(c:cs)=encode_digit_chc++go'csgo'[]=[]go'(c:cs)=encode_chc++go'csunencodedChar::Char->Bool-- True for chars that don't need encodingunencodedChar'Z'=FalseunencodedChar'z'=FalseunencodedCharc=c>='a'&&c<='z'||c>='A'&&c<='Z'||c>='0'&&c<='9'-- If a digit is at the start of a symbol then we need to encode it.-- Otherwise package names like 9pH-0.1 give linker errors.encode_digit_ch::Char->EncodedStringencode_digit_chc|c>='0'&&c<='9'=encode_as_unicode_charcencode_digit_chc|otherwise=encode_chcencode_ch::Char->EncodedStringencode_chc|unencodedCharc=[c]-- Common case first-- Constructorsencode_ch'('="ZL"-- Needed for things like (,), and (->)encode_ch')'="ZR"-- For symmetry with (encode_ch'['="ZM"encode_ch']'="ZN"encode_ch':'="ZC"encode_ch'Z'="ZZ"-- Variablesencode_ch'z'="zz"encode_ch'&'="za"encode_ch'|'="zb"encode_ch'^'="zc"encode_ch'$'="zd"encode_ch'='="ze"encode_ch'>'="zg"encode_ch'#'="zh"encode_ch'.'="zi"encode_ch'<'="zl"encode_ch'-'="zm"encode_ch'!'="zn"encode_ch'+'="zp"encode_ch'\''="zq"encode_ch'\\'="zr"encode_ch'/'="zs"encode_ch'*'="zt"encode_ch'_'="zu"encode_ch'%'="zv"encode_chc=encode_as_unicode_charcencode_as_unicode_char::Char->EncodedStringencode_as_unicode_charc='z':ifisDigit(headhex_str)thenhex_strelse'0':hex_strwherehex_str=showHex(ordc)"U"-- ToDo: we could improve the encoding here in various ways.-- eg. strings of unicode characters come out as 'z1234Uz5678U', we-- could remove the 'U' in the middle (the 'z' works as a separator).zDecodeString::EncodedString->UserStringzDecodeString[]=[]zDecodeString('Z':d:rest)|isDigitd=decode_tupledrest|otherwise=decode_upperd:zDecodeStringrestzDecodeString('z':d:rest)|isDigitd=decode_num_escdrest|otherwise=decode_lowerd:zDecodeStringrestzDecodeString(c:rest)=c:zDecodeStringrestdecode_upper,decode_lower::Char->Chardecode_upper'L'='('decode_upper'R'=')'decode_upper'M'='['decode_upper'N'=']'decode_upper'C'=':'decode_upper'Z'='Z'decode_upperch={-pprTrace "decode_upper" (char ch)-}chdecode_lower'z'='z'decode_lower'a'='&'decode_lower'b'='|'decode_lower'c'='^'decode_lower'd'='$'decode_lower'e'='='decode_lower'g'='>'decode_lower'h'='#'decode_lower'i'='.'decode_lower'l'='<'decode_lower'm'='-'decode_lower'n'='!'decode_lower'p'='+'decode_lower'q'='\''decode_lower'r'='\\'decode_lower's'='/'decode_lower't'='*'decode_lower'u'='_'decode_lower'v'='%'decode_lowerch={-pprTrace "decode_lower" (char ch)-}ch-- Characters not having a specific code are coded as z224U (in hex)decode_num_esc::Char->EncodedString->UserStringdecode_num_escdrest=go(digitToIntd)restwheregon(c:rest)|isHexDigitc=go(16*n+digitToIntc)restgon('U':rest)=chrn:zDecodeStringrestgonother=error("decode_num_esc: "++shown++' ':other)decode_tuple::Char->EncodedString->UserStringdecode_tupledrest=go(digitToIntd)restwhere-- NB. recurse back to zDecodeString after decoding the tuple, because-- the tuple might be embedded in a longer name.gon(c:rest)|isDigitc=go(10*n+digitToIntc)restgo0('T':rest)="()"++zDecodeStringrestgon('T':rest)='(':replicate(n-1)','++")"++zDecodeStringrestgo1('H':rest)="(# #)"++zDecodeStringrestgon('H':rest)='(':'#':replicate(n-1)','++"#)"++zDecodeStringrestgonother=error("decode_tuple: "++shown++' ':other){-
Tuples are encoded as
Z3T or Z3H
for 3-tuples or unboxed 3-tuples respectively. No other encoding starts
Z<digit>
* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple)
There are no unboxed 0-tuples.
* "()" is the tycon for a boxed 0-tuple.
There are no boxed 1-tuples.
-}maybe_tuple::UserString->MaybeEncodedStringmaybe_tuple"(# #)"=Just("Z1H")maybe_tuple('(':'#':cs)=casecount_commas(0::Int)csof(n,'#':')':_)->Just('Z':shows(n+1)"H")_->Nothingmaybe_tuple"()"=Just("Z0T")maybe_tuple('(':cs)=casecount_commas(0::Int)csof(n,')':_)->Just('Z':shows(n+1)"T")_->Nothingmaybe_tuple_=Nothingcount_commas::Int->String->(Int,String)count_commasn(',':cs)=count_commas(n+1)cscount_commasncs=(n,cs)