unicode/UTF-8

Denys Duchier duchier at ps.uni-sb.de
Mon Jul 25 23:00:04 CEST 2005


As promised, this is (hopefully) the start of a thread to discuss a library
providing support for unicode/UTF-8 encoding/decoding.  I am attaching the code
for a functor (which I modified recently, but haven't tested, so I hope it still
works) that provides this functionality based on the "on-demand batch stream"
idea that I presented in an earlier message.

My hope is that, because this design is (hopefully) both efficient and elegant
(in the Oz sense), we will want to use it (or something like it) to provide
general support for various encodings (ideally in a more or less transparent
way).

-------------- next part --------------
functor
export
   EncodeBy Encode
   DecodeBy Decode
prepare
   %% Short aliases for the procedures used below for error reporting:
   %%   MakeError  is applied to a utf8(...) term; its result is handed to
   %%              MakeFailed.
   %%   MakeFailed builds the value that an output stream tail is bound to
   %%              when the input cannot be encoded/decoded.
   %%   RaiseError is called with utf8(...:by(By)) when the batch size
   %%              argument is rejected.
   MakeError = Exception.error
   MakeFailed = Value.failed
   RaiseError = Exception.raiseError
   
   %% {EncodeBy By IN OUT}
   %%
   %% Encodes IN, a list of integer code points, into OUT, a list of UTF-8
   %% byte values.  The encoding runs in its own thread and proceeds in
   %% batches: the thread waits until the current unbound tail of OUT is
   %% needed, encodes up to By code points, then waits again.
   %%
   %%   By  -- batch size (code points per batch).  Must be an integer > 0;
   %%          otherwise utf8(encode:by(By)) is raised via RaiseError in the
   %%          calling thread and no encoder thread is started.
   %%   IN  -- list of code points.
   %%   OUT -- bound incrementally to the resulting byte list.
   %%
   %% A code point that cannot be encoded (negative, a UTF-16 surrogate
   %% U+D800..U+DFFF, or >= 1114112 i.e. above U+10FFFF) terminates the
   %% output: the current tail of OUT is bound to a failed value built from
   %% utf8(encode:input(Rest)), where Rest is the remaining input starting
   %% at the offending code point.
   proc {EncodeBy By IN OUT}
      %% OK counts how many more code points may be encoded in the current
      %% batch before waiting for further demand on the output.
      proc {Loop IN OUT OK}
         if OK > 0 then
            case IN
            of nil then OUT=nil
            [] C|T then
               if C < 0 then
                  OUT = {MakeFailed {MakeError utf8(encode:input(IN))}}
               elseif C < 128 then OUT2 in
                  %% 1 byte: 0xxxxxxx
                  OUT = C|OUT2
                  {Loop T OUT2 OK-1}
               elseif C < 2048 then
                  OUT2
                  Lo = C mod 64
                  Hi = C div 64
               in
                  %% 2 bytes: 110xxxxx 10xxxxxx
                  OUT = 192+Hi|128+Lo|OUT2
                  {Loop T OUT2 OK-1}
               elseif C >= 55296 andthen C < 57344 then
                  %% U+D800..U+DFFF are UTF-16 surrogates; they are not
                  %% Unicode scalar values and must not appear in UTF-8.
                  OUT = {MakeFailed {MakeError utf8(encode:input(IN))}}
               elseif C < 65536 then
                  OUT2
                  Lo = C mod 64
                  R1 = C div 64
                  Md = R1 mod 64
                  Hi = R1 div 64
               in
                  %% 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                  OUT = 224+Hi|128+Md|128+Lo|OUT2
                  {Loop T OUT2 OK-1}
               elseif C < 1114112 then
                  OUT2
                  Lo = C mod 64
                  R1 = C div 64
                  Ml = R1 mod 64
                  R2 = R1 div 64
                  Mh = R2 mod 64
                  Hi = R2 div 64
               in
                  %% 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                  OUT = 240+Hi|128+Mh|128+Ml|128+Lo|OUT2
                  {Loop T OUT2 OK-1}
               else
                  %% above U+10FFFF
                  OUT = {MakeFailed {MakeError utf8(encode:input(IN))}}
               end
            end
         else
            %% Batch exhausted: wait until more output is requested, then
            %% start a fresh batch of By code points.
            {WaitNeeded OUT}
            {Loop IN OUT By}
         end
      end
   in
      if {IsInt By} andthen By > 0 then
         %% Start with an empty batch so that nothing is encoded before
         %% OUT is first needed.
         thread {Loop IN OUT 0} end
      else
         {RaiseError utf8(encode:by(By))}
      end
   end

   %% Default batch size: number of code points encoded per demand-driven
   %% batch when the caller does not choose one.
   BY_ENCODE = 1000

   %% {Encode IN OUT}
   %% Same as {EncodeBy BY_ENCODE IN OUT}: encode the code point list IN
   %% into the UTF-8 byte list OUT using the default batch size.
   proc {Encode IN OUT} {EncodeBy BY_ENCODE IN OUT} end

   %% {DecodeBy By IN OUT}
   %%
   %% Decodes IN, a list of UTF-8 byte values, into OUT, a list of integer
   %% code points.  The decoding runs in its own thread and proceeds in
   %% batches: the thread waits until the current unbound tail of OUT is
   %% needed, decodes up to By code points, then waits again.
   %%
   %%   By  -- batch size (code points per batch).  Must be an integer > 0;
   %%          otherwise utf8(decode:by(By)) is raised via RaiseError in the
   %%          calling thread and no decoder thread is started.
   %%   IN  -- list of byte values.
   %%   OUT -- bound incrementally to the resulting code point list.
   %%
   %% Ill-formed input terminates the output: the current tail of OUT is
   %% bound to a failed value built from utf8(decode(Rest)), where Rest is
   %% the remaining input starting at the first byte of the offending
   %% sequence.  Ill-formed means: a byte outside the valid lead/continuation
   %% ranges (including negative values), a truncated sequence, an overlong
   %% encoding, an encoded surrogate (U+D800..U+DFFF), or a value above
   %% U+10FFFF -- the same upper bound the encoder enforces.
   proc {DecodeBy By IN OUT}
      %% Bind the output tail Tail to a failed value carrying the
      %% undecoded remainder Rest.  The term shape utf8(decode(Rest)) is
      %% kept as is because callers may match on it.
      proc {Reject Rest Tail}
         Tail = {MakeFailed {MakeError utf8(decode(Rest))}}
      end
      %% OK counts how many more code points may be decoded in the current
      %% batch before waiting for further demand on the output.
      proc {Loop IN OUT OK}
         if OK > 0 then
            case IN
            of nil then OUT=nil
            [] H1|T andthen H1 >= 0 andthen H1 < 128 then OUT2 in
               %% 1 byte: 0xxxxxxx.  An explicit range test is used because
               %% div truncates toward zero, so (H1 div 128)==0 would also
               %% hold for -127..-1.
               OUT = H1|OUT2
               {Loop T OUT2 OK-1}
            [] H1|H2|T
               andthen (H1 div 32)==6
               andthen (H2 div 64)==2
            then
               %% 2 bytes: 110xxxxx 10xxxxxx
               C = (H1 mod 32)*64 + (H2 mod 64)
            in
               if C < 128 then
                  %% overlong: fits in 1 byte
                  {Reject IN OUT}
               else OUT2 in
                  OUT = C|OUT2
                  {Loop T OUT2 OK-1}
               end
            [] H1|H2|H3|T
               andthen (H1 div 16)==14
               andthen (H2 div 64)==2
               andthen (H3 div 64)==2
            then
               %% 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
               C = ((H1 mod 16)*64 + (H2 mod 64))*64 + (H3 mod 64)
            in
               if C < 2048 then
                  %% overlong: fits in 2 bytes
                  {Reject IN OUT}
               elseif C >= 55296 andthen C < 57344 then
                  %% encoded UTF-16 surrogate U+D800..U+DFFF
                  {Reject IN OUT}
               else OUT2 in
                  OUT = C|OUT2
                  {Loop T OUT2 OK-1}
               end
            [] H1|H2|H3|H4|T
               andthen (H1 div 8)==30
               andthen (H2 div 64)==2
               andthen (H3 div 64)==2
               andthen (H4 div 64)==2
            then
               %% 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
               C = (((H1 mod 8)*64 + (H2 mod 64))*64 + (H3 mod 64))*64 + (H4 mod 64)
            in
               if C < 65536 then
                  %% overlong: fits in 3 bytes
                  {Reject IN OUT}
               elseif C >= 1114112 then
                  %% above U+10FFFF
                  {Reject IN OUT}
               else OUT2 in
                  OUT = C|OUT2
                  {Loop T OUT2 OK-1}
               end
            else
               %% invalid lead byte, bad continuation byte, or truncated
               %% sequence
               {Reject IN OUT}
            end
         else
            %% Batch exhausted: wait until more output is requested, then
            %% start a fresh batch of By code points.
            {WaitNeeded OUT}
            {Loop IN OUT By}
         end
      end
   in
      if {IsInt By} andthen By > 0 then
         %% Start with an empty batch so that nothing is decoded before
         %% OUT is first needed.
         thread {Loop IN OUT 0} end
      else
         {RaiseError utf8(decode:by(By))}
      end
   end

   %% Default batch size for decoding; shares the encoder's default.
   BY_DECODE = BY_ENCODE

   %% {Decode IN OUT}
   %% Same as {DecodeBy BY_DECODE IN OUT}: decode the UTF-8 byte list IN
   %% into the code point list OUT using the default batch size.
   proc {Decode IN OUT} {DecodeBy BY_DECODE IN OUT} end
end
-------------- next part --------------

-- 
Dr. Denys Duchier - IRI & LIFL - CNRS, Lille, France
+33 (0)6 25 78 25 74    http://www.lifl.fr/~duchier/


More information about the mozart-hackers mailing list