unicode/UTF-8
Denys Duchier
duchier at ps.uni-sb.de
Mon Jul 25 23:00:04 CEST 2005
As promised, this is (hopefully) the start of a thread to discuss a library
providing support for unicode/UTF-8 encoding/decoding. I am attaching the code
for a functor (which I modified recently, but haven't tested, so I hope it still
works) that provides this functionality based on the "on-demand batch stream"
idea that I presented in an earlier message.
My hope is that, because this design is (hopefully) both efficient and elegant
(in the Oz sense), that we will want to use it (or something like it) to provide
general support for various encodings (hopefully, in a more or less transparent
way).
-------------- next part --------------
functor
export
EncodeBy Encode
DecodeBy Decode
prepare
%% Short local aliases for the base-environment operations used by the
%% encoder and decoder below.
%% MakeError is applied to the utf8(...) records before they are handed to
%% MakeFailed; NOTE(review): presumably wraps them as error exceptions.
MakeError = Exception.error
%% MakeFailed turns an exception into the value that the output stream tail
%% is bound to when bad input is met.
MakeFailed = Value.failed
%% RaiseError is called synchronously when the batch size argument is invalid.
RaiseError = Exception.raiseError
%% EncodeBy: lazily encode a list/stream of Unicode code points as UTF-8.
%%
%%   By  - batch size; must be an integer > 0.  Each time the current tail
%%         of OUT becomes needed, at most By input code points are encoded
%%         before the worker waits again.
%%   IN  - list (possibly a stream) of integer code points.
%%   OUT - bound incrementally to the list of UTF-8 bytes (integers 0..255).
%%
%% Errors:
%%   * an invalid By raises utf8(encode:by(By)) in the calling thread;
%%   * an element that is not a Unicode scalar value (not an integer,
%%     negative, a surrogate U+D800..U+DFFF, or above U+10FFFF), or an IN
%%     that is not a proper list, binds the tail of OUT to a failed value
%%     carrying utf8(encode:input(Rest)), where Rest is the remaining input
%%     starting at the offending position.
proc {EncodeBy By IN OUT}
   %% Failed value reporting the part of the input that cannot be encoded.
   fun {Invalid Rest}
      {MakeFailed {MakeError utf8(encode:input(Rest))}}
   end
   %% OK is the number of code points that may still be encoded in the
   %% current batch; when it reaches 0 the loop waits for more demand.
   proc {Loop IN OUT OK}
      if OK > 0 then
         case IN
         of nil then OUT=nil
         [] C|T then
            if {Not {IsInt C}} orelse C < 0 then
               %% Non-integers are rejected here so that the worker thread
               %% does not die with a type error and leave OUT unbound.
               OUT = {Invalid IN}
            elseif C < 128 then OUT2 in
               %% 1 byte: 0xxxxxxx
               OUT = C|OUT2
               {Loop T OUT2 OK-1}
            elseif C < 2048 then
               %% 2 bytes: 110xxxxx 10xxxxxx
               OUT2
               Lo = C mod 64
               Hi = C div 64
            in
               OUT = 192+Hi|128+Lo|OUT2
               {Loop T OUT2 OK-1}
            elseif C >= 55296 andthen C < 57344 then
               %% U+D800..U+DFFF are UTF-16 surrogates, not scalar values;
               %% RFC 3629 forbids encoding them in UTF-8.
               OUT = {Invalid IN}
            elseif C < 65536 then
               %% 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
               OUT2
               Lo = C mod 64
               R1 = C div 64
               Md = R1 mod 64
               Hi = R1 div 64
            in
               OUT = 224+Hi|128+Md|128+Lo|OUT2
               {Loop T OUT2 OK-1}
            elseif C < 1114112 then
               %% 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
               OUT2
               Lo = C mod 64
               R1 = C div 64
               Ml = R1 mod 64
               R2 = R1 div 64
               Mh = R2 mod 64
               Hi = R2 div 64
            in
               OUT = 240+Hi|128+Mh|128+Ml|128+Lo|OUT2
               {Loop T OUT2 OK-1}
            else
               %% Above U+10FFFF: outside the Unicode code space.
               OUT = {Invalid IN}
            end
         else
            %% IN is neither nil nor a cons cell: report it through OUT
            %% instead of raising inside the worker thread.
            OUT = {Invalid IN}
         end
      else
         %% Batch exhausted: wait until the consumer needs the next tail,
         %% then allow another By code points.
         {WaitNeeded OUT}
         {Loop IN OUT By}
      end
   end
in
   if {IsInt By} andthen By > 0 then
      %% Starting with OK=0 makes the worker wait for demand before
      %% encoding anything.
      thread {Loop IN OUT 0} end
   else
      {RaiseError utf8(encode:by(By))}
   end
end
%% Batch size used by Encode: number of code points handed to EncodeBy
%% as its By argument.
BY_ENCODE = 1000

%% Encode: EncodeBy with the default batch size BY_ENCODE.
%%   IN  - input passed through to EncodeBy unchanged.
%%   OUT - output passed through to EncodeBy unchanged.
proc {Encode IN OUT}
   {EncodeBy BY_ENCODE IN OUT}
end
%% DecodeBy: lazily decode a list/stream of UTF-8 bytes into code points.
%%
%%   By  - batch size; must be an integer > 0.  Each time the current tail
%%         of OUT becomes needed, at most By code points are decoded before
%%         the worker waits again.
%%   IN  - list (possibly a stream) of integer byte values.
%%   OUT - bound incrementally to the list of decoded code points.
%%
%% Errors:
%%   * an invalid By raises utf8(decode:by(By)) in the calling thread;
%%   * malformed input binds the tail of OUT to a failed value carrying
%%     utf8(decode(Rest)), where Rest is the remaining input starting at the
%%     offending byte.  Malformed means: a bad lead byte, a missing or bad
%%     continuation byte, a truncated sequence, an overlong (non-shortest)
%%     form, an encoded surrogate U+D800..U+DFFF, or a value above U+10FFFF
%%     (all rejected as required by RFC 3629).
%%
%% NOTE(review): the error record shape utf8(decode(...)) differs from the
%% encoder's utf8(encode:input(...)); it is kept as is so that existing
%% handlers keep matching.
proc {DecodeBy By IN OUT}
   %% Failed value reporting the part of the input that cannot be decoded.
   fun {Invalid Rest}
      {MakeFailed {MakeError utf8(decode(Rest))}}
   end
   %% Emit code point C decoded from a multi-byte sequence, provided it is
   %% legal: at least Min (the smallest value that needs a sequence of this
   %% length, which rules out overlong forms), not a surrogate, and inside
   %% the Unicode code space.  Otherwise fail, reporting the input Bad.
   proc {Emit C Min Bad T OUT OK}
      if C >= Min
         andthen C < 1114112
         andthen (C < 55296 orelse C > 57343)
      then OUT2 in
         OUT = C|OUT2
         {Loop T OUT2 OK-1}
      else
         OUT = {Invalid Bad}
      end
   end
   %% OK is the number of code points that may still be decoded in the
   %% current batch; when it reaches 0 the loop waits for more demand.
   proc {Loop IN OUT OK}
      if OK > 0 then
         case IN
         of nil then OUT=nil
         [] H1|T andthen H1 >= 0 andthen H1 < 128 then OUT2 in
            %% 1 byte: 0xxxxxxx.  An explicit range test is used because
            %% `div` rounds toward zero, so (H1 div 128)==0 would also
            %% accept the negative values -127..-1.
            OUT = H1|OUT2
            {Loop T OUT2 OK-1}
         [] H1|H2|T
            andthen (H1 div 32)==6
            andthen (H2 div 64)==2
         then
            %% 2 bytes: 110xxxxx 10xxxxxx
            C = (H1 mod 32)*64 + (H2 mod 64)
         in
            {Emit C 128 IN T OUT OK}
         [] H1|H2|H3|T
            andthen (H1 div 16)==14
            andthen (H2 div 64)==2
            andthen (H3 div 64)==2
         then
            %% 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            C = ((H1 mod 16)*64 + (H2 mod 64))*64 + (H3 mod 64)
         in
            {Emit C 2048 IN T OUT OK}
         [] H1|H2|H3|H4|T
            andthen (H1 div 8)==30
            andthen (H2 div 64)==2
            andthen (H3 div 64)==2
            andthen (H4 div 64)==2
         then
            %% 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            C = (((H1 mod 8)*64 + (H2 mod 64))*64 + (H3 mod 64))*64 + (H4 mod 64)
         in
            {Emit C 65536 IN T OUT OK}
         else
            OUT = {Invalid IN}
         end
      else
         %% Batch exhausted: wait until the consumer needs the next tail,
         %% then allow another By code points.
         {WaitNeeded OUT}
         {Loop IN OUT By}
      end
   end
in
   if {IsInt By} andthen By > 0 then
      %% Starting with OK=0 makes the worker wait for demand before
      %% decoding anything.
      thread {Loop IN OUT 0} end
   else
      {RaiseError utf8(decode:by(By))}
   end
end
%% Batch size used by Decode; defined to be the same as the encoder's
%% BY_ENCODE.
BY_DECODE = BY_ENCODE

%% Decode: DecodeBy with the default batch size BY_DECODE.
%%   IN  - input passed through to DecodeBy unchanged.
%%   OUT - output passed through to DecodeBy unchanged.
proc {Decode IN OUT}
   {DecodeBy BY_DECODE IN OUT}
end
end
-------------- next part --------------
--
Dr. Denys Duchier - IRI & LIFL - CNRS, Lille, France
+33 (0)6 25 78 25 74 http://www.lifl.fr/~duchier/
More information about the mozart-hackers
mailing list