首页  编辑  

UTF8解码

Tags: /超级猛料/Format.格式,单位/Encode.编码、格式/   Date Created:

UTF8解码

Always use UTF8 or unicode strings if possible

Java and .NET coders don't have to do anything special, but "native language" coders must take a few extra steps to overcome Unicode problems.

Here is a widechar := URLDecodeUTF8(ansichar) conversion method and an example UI application. I took inspiration from the Indy URLDecode function and converted it into a widechar-enabled function. Indy's URLDecode supports only plain strings, i.e. strings encoded with a legacy charset.

Here are the full sources and an example UI exe to try it out. I created a DLL library because I need to call it from a C++ program. You can copy and paste the code to suit your needs. The test application uses TntUnicode components; see the reference image.

orcoredll.zip sources and exe

dlltest.png reference image

Convert UTF8 encoded ansi string to widestring:

unit orcoredll_unit1;

interface

// Public functions of this unit. Both use the stdcall calling convention

// so that applications written in other languages (e.g. C++) can call them.

// Converts the null-terminated wide string wc to an ANSI string; the text is
// handed back through buf and the function result is its length.
function WideStringToAnsiString(const wc: PWideChar; var buf: PAnsiChar): integer; stdcall;

// Decodes the URL-encoded, UTF-8 based ANSI string s to a wide string; the
// text is handed back through buf and the function result is its length.
function URLDecodeUTF8(const s: PAnsiChar;  var buf: PWideChar): integer; stdcall;

implementation

uses SysUtils;

var
  // Owns the text most recently returned by WideStringToAnsiString.
  // The pointer handed out through buf refers to this string, so it stays
  // valid after the function returns, until the next call replaces it.
  // NOTE(review): one shared buffer, so concurrent calls from several
  // threads would overwrite each other's result - confirm callers are
  // single-threaded or copy the text immediately.
  LastAnsiResult: AnsiString;

{ Convert widechar str to ansichar str.

  wc     : null-terminated wide string to convert; nil is treated as ''.
  buf    : receives a pointer to the converted, null-terminated ANSI text.
           The memory is owned by this unit; the caller must not free it and
           must copy the text before calling this function again.
  Result : length of the converted text in bytes (terminator not counted).

  The previous implementation pointed buf at a local AnsiString, which is
  released when the function returns, leaving the caller with a dangling
  pointer. }
function WideStringToAnsiString(const wc: PWideChar; var buf: PAnsiChar): integer; stdcall;
begin
  if wc = nil then
    LastAnsiResult := ''
  else
    LastAnsiResult := WideCharToString(wc);

  // PAnsiChar() of an empty string still yields a valid pointer to a #0.
  buf := PAnsiChar(LastAnsiResult);
  Result := Length(LastAnsiResult);
end;

var
  // Owns the text most recently returned by URLDecodeUTF8.
  // The pointer handed out through buf refers to this string, so it stays
  // valid after the function returns, until the next call replaces it.
  // NOTE(review): one shared buffer, so concurrent calls from several
  // threads would overwrite each other's result - confirm callers are
  // single-threaded or copy the text immediately.
  LastDecodedWide: WideString;

{ Decode a URL-encoded string, whose bytes were encoded with the UTF-8
  charset, to a widechar str.

      +      = space
      %2A    = *
      %C3%84 = A with diaeresis (two UTF-8 bytes, one wide char)

  s      : null-terminated URL-encoded ANSI string; nil is treated as ''.
  buf    : receives a pointer to the decoded, null-terminated wide text.
           The memory is owned by this unit; the caller must not free it and
           must copy the text before calling this function again.
  Result : length of the decoded text in wide chars (terminator not counted).

  Escapes that are not valid hex, and the escape %00, produce no output
  byte; the '%' and the (up to) two characters after it are skipped.

  The previous implementation pointed buf at a local WideString, which is
  released when the function returns, leaving the caller with a dangling
  pointer. }
function URLDecodeUTF8(const s: PAnsiChar; var buf: PWideChar): integer; stdcall;
var
  sAnsi: AnsiString;   // the URL-encoded input as a Pascal string
  sUtf8: UTF8String;   // raw UTF-8 bytes after unescaping
  i, utf8Pos: integer;
  CharCode: integer;
begin
  sAnsi := s; // null-terminated str to pascal str

  // Unescaping never produces more bytes than the input has characters,
  // so allocate once and trim to the real size afterwards.
  SetLength(sUtf8, Length(sAnsi));

  i := 1;
  utf8Pos := 1;
  while i <= Length(sAnsi) do begin
    if sAnsi[i] <> '%' then begin
      if sAnsi[i] = '+' then
        sUtf8[utf8Pos] := ' '
      else
        sUtf8[utf8Pos] := sAnsi[i];
      Inc(utf8Pos);
      Inc(i);
    end else begin
      // StrToIntDef returns -1 for a malformed or truncated escape instead
      // of raising an exception that would have to be swallowed.
      CharCode := StrToIntDef('$' + Copy(sAnsi, i + 1, 2), -1);
      if (CharCode > 0) and (CharCode < 256) then begin
        sUtf8[utf8Pos] := AnsiChar(CharCode);
        Inc(utf8Pos);
      end;
      Inc(i, 3); // skip '%' and the two escape characters
    end;
  end;
  SetLength(sUtf8, utf8Pos - 1);

  LastDecodedWide := UTF8Decode(sUtf8); // utf8 bytes to unicode
  buf := PWideChar(LastDecodedWide);
  Result := Length(LastDecodedWide);
end;

end.

---------------------------------------

以下代码在 Delphi 7 上调试通过,主要使用了 API 函数 MultiByteToWideChar 和 WideCharToMultiByte

{ Convert the multi-byte string Str, encoded in code page CodePage, to a
  WideString using MultiByteToWideChar.

  Str      : source bytes; conversion stops at the first #0.
  CodePage : Windows code page identifier of Str (e.g. 936, 950).
  Result   : the converted text, or '' when Str is empty or the API reports
             failure (for example an invalid code page).

  The required size is queried first, and the API's failure return (0) is
  checked instead of being passed on to SetLength as a negative length. }
function UnicodeEncode(Str:string;CodePage:integer):WideString;
var
  Needed, Written: integer;
begin
  Result := '';
  if Str = '' then Exit;

  // First pass: a nil/0 output buffer makes the API return the number of
  // wide chars required, including the terminating #0 (source length -1).
  Needed := MultiByteToWideChar(CodePage, 0, PChar(Str), -1, nil, 0);
  if Needed <= 1 then Exit; // 0 = failure, 1 = only the terminator

  SetLength(Result, Needed);
  Written := MultiByteToWideChar(CodePage, 0, PChar(Str), -1, PWideChar(Result), Needed);
  if Written > 0 then
    SetLength(Result, Written - 1) // drop the terminating #0
  else
    Result := '';
end;

{ Convert the WideString Str to a multi-byte string in code page CodePage
  using WideCharToMultiByte.

  Str      : source text; conversion stops at the first #0.
  CodePage : Windows code page identifier for the result (e.g. 936, 950).
  Result   : the converted bytes, or '' when Str is empty or the API reports
             failure (for example an invalid code page).

  The required size is queried first. The previous fixed guess of
  Length(Str)*2+1 bytes is too small for encodings that need more than two
  bytes per wide char (UTF-8 needs up to three), in which case the API
  failed and the function returned an empty string. }
function UnicodeDecode(Str:WideString;CodePage:integer):string;
var
  Needed, Written: integer;
begin
  Result := '';
  if Str = '' then Exit;

  // First pass: a nil/0 output buffer makes the API return the number of
  // bytes required, including the terminating #0 (source length -1).
  Needed := WideCharToMultiByte(CodePage, 0, PWideChar(Str), -1, nil, 0, nil, nil);
  if Needed <= 1 then Exit; // 0 = failure, 1 = only the terminator

  SetLength(Result, Needed);
  Written := WideCharToMultiByte(CodePage, 0, PWideChar(Str), -1, PChar(Result), Needed, nil, nil);
  if Written > 0 then
    SetLength(Result, Written - 1) // drop the terminating #0
  else
    Result := '';
end;

{ Convert GB (code page 936, simplified Chinese) text to Big5 (code page
  950, traditional Chinese).

  Step 1 maps simplified characters to their traditional forms with
  LCMapString while the text is still in code page 936; step 2 re-encodes
  the bytes from code page 936 to code page 950 via Unicode.

  Str    : text encoded in code page 936.
  Result : the text encoded in code page 950; '' for empty input. }
function Gb2Big5(Str:string):string;
const
  // Chinese (Simplified, PRC). The ANSI LCMapString interprets the bytes
  // using the code page of the locale it is given, and Str is always code
  // page 936 here, so the locale is fixed rather than taken from the
  // current user (GetUserDefaultLCID), which is only right on zh-CN systems.
  LCID_CHINESE_PRC = $0804;
var
  Mapped: integer;
begin
  Result := '';
  if Str = '' then Exit;

  SetLength(Result, Length(Str));
  Mapped := LCMapString(LCID_CHINESE_PRC, LCMAP_TRADITIONAL_CHINESE,
    PChar(Str), Length(Str),
    PChar(Result), Length(Result));
  if Mapped = 0 then
    Result := Str               // mapping failed: keep the characters as they are
  else
    SetLength(Result, Mapped);  // never convert bytes the API did not write

  Result := UnicodeDecode(UnicodeEncode(Result, 936), 950);
end;

{ Convert Big5 (code page 950, traditional Chinese) text to GB (code page
  936, simplified Chinese).

  Step 1 re-encodes the bytes from code page 950 to code page 936 via
  Unicode; step 2 maps traditional characters to their simplified forms
  with LCMapString.

  Str    : text encoded in code page 950.
  Result : the text encoded in code page 936; '' for empty input. }
function Big52Gb(Str:string):string;
const
  // Chinese (Simplified, PRC). The ANSI LCMapString interprets the bytes
  // using the code page of the locale it is given. At the point of the call
  // the text has already been re-encoded to code page 936, so the locale is
  // fixed rather than taken from the current user (GetUserDefaultLCID): on
  // a zh-TW system that would select code page 950 and misread the bytes.
  LCID_CHINESE_PRC = $0804;
var
  Mapped: integer;
begin
  Result := '';
  if Str = '' then Exit;

  Str := UnicodeDecode(UnicodeEncode(Str, 950), 936);
  if Str = '' then Exit; // re-encoding failed, nothing to map

  SetLength(Result, Length(Str));
  Mapped := LCMapString(LCID_CHINESE_PRC, LCMAP_SIMPLIFIED_CHINESE,
    PChar(Str), Length(Str),
    PChar(Result), Length(Result));
  if Mapped = 0 then
    Result := Str               // mapping failed: keep the characters as they are
  else
    SetLength(Result, Mapped);  // never return bytes the API did not write
end;

关键使用了UnicodeToUtf8这个函数

{ Encode the WideString WS as UTF-8 using UnicodeToUtf8.

  WS     : text to encode.
  Result : the UTF-8 bytes; '' when WS is empty or UnicodeToUtf8 returns 0.

  A stray '-' that followed the closing "end;" and broke compilation has
  been removed. }
function Utf8Encode(const WS: WideString): UTF8String;
var
  L: Integer;
  Temp: UTF8String;
begin
  Result := '';
  if WS = '' then Exit;

  // Reserve three bytes per wide char as the upper bound for the output.
  // A string's allocation also holds its terminating #0, which is why
  // Length(Temp)+1 can be passed as the destination size below.
  SetLength(Temp, Length(WS) * 3);

  L := UnicodeToUtf8(PAnsiChar(Temp), Length(Temp) + 1, PWideChar(WS), Length(WS));
  if L > 0 then
    SetLength(Temp, L - 1) // the returned count is reduced by one for the terminator
  else
    Temp := '';
  Result := Temp;
end;

=======================

方法二:

后来想到了用Unicode编码,但该死的Delphi控件竟然不支持Unicode。后来在网上找到了Tnt控件,它可以支持Unicode,不过项目已经差不多快做好了,要大规模更换控件是不可能的,于是就想读一下源代码,看看Tnt控件是如何做到的。读完之后一阵绝望:Tnt控件几乎全部使用以W结尾的API,连创建窗体都是用CreateWindowExW,那还有什么好说的呢,自己重做还不如全部换成Tnt控件。

----------------------

Delphi: 将unicode码转换为汉字

Night @ 2004-12-01 21:12

{ Convert a string of hexadecimal UTF-16 code units to ANSI text (for
  example Chinese characters).

  SubUnicode : groups of four hex digits, one group per UTF-16 code unit,
               most significant byte first (e.g. '4E2D6587').
  Result     : the text converted to the system ANSI code page; '' when the
               length of SubUnicode is not a multiple of four.

  The code units are collected in a WideString and converted in one step.
  The previous version copied two bytes into a char array with StrPCopy,
  which writes only a one-byte terminator and stops at the first zero byte,
  so WideCharToString read uninitialised bytes while looking for the
  two-byte terminator and code units containing a zero byte (all of ASCII)
  were cut short. Result was also left unassigned for invalid lengths. }
function UnicodeToAnsi(SubUnicode: string):string;  // convert unicode codes to Chinese characters
var
  wide: WideString;
  i, unitCount: integer;
begin
  Result := '';
  if Length(SubUnicode) mod 4 <> 0 then Exit;

  unitCount := Length(SubUnicode) div 4;
  SetLength(wide, unitCount);
  for i := 1 to unitCount do
    // four hex digits form one 16-bit code unit
    wide[i] := WideChar(HexToInt(Copy(SubUnicode, i * 4 - 3, 4)));

  Result := wide; // implicit WideString -> ANSI string conversion
end;

{ Convert a string of hexadecimal digits to its numeric value.

  hex    : hex digits, most significant first; upper and lower case are
           both accepted. Any character that is not a hex digit counts as 0.
  Result : the value; 0 for an empty string.

  Lower-case digits used to be treated as 0 because the lookup was
  case-sensitive. The value is now accumulated in Cardinal arithmetic, so
  8-digit values such as 'FFFFFFFF' no longer go through a signed Integer
  product. }
function HexToInt(hex:string):cardinal;
const
  cHex = '0123456789ABCDEF';
var
  loop, digit: integer;
begin
  Result := 0;
  for loop := 1 to Length(hex) do
  begin
    // Pos is 1-based and returns 0 when the character is not in cHex.
    digit := Pos(UpCase(hex[loop]), cHex) - 1;
    if digit < 0 then digit := 0;
    Result := Result * 16 + Cardinal(digit);
  end;
end;