; _______________________________________________________________________________________
;| |
;| ..::FreshLib::.. Free, open source. Licensed under "Fresh artistic license." |
;|_______________________________________________________________________________________|
;
; Description: OS independent text encodings translation library.
;
; Target OS: Any
;
; Dependencies: StrLib.asm
;
; Notes:
;
;_________________________________________________________________________________________
; AnsiToUtf8 converts a string from a single-byte ANSI encoding to UTF-8.
;
; arguments:
;   .SrcEncoding - pointer to the ansi-to-unicode conversion table. It is read
;                  as an array of words indexed by (char - $80).
;   .hString     - handle of, or pointer to, the source string.
; returns:
;   eax - handle of the newly created string with the UTF-8 encoded text.
;
; All other registers are preserved (pushad/popad).
proc AnsiToUtf8, .SrcEncoding, .hString
.hNew   dd ?                            ; handle of the string being built
.pDest  dd ?                            ; pointer to the start of its buffer
begin
        pushad

; Ask for 4 output bytes per source byte plus 16 spare bytes.
        stdcall StrLen, [.hString]
        lea     ecx, [4*eax+16]

        stdcall StrNew
        mov     [.hNew], eax
        jecxz   .done                   ; only taken if the size above wrapped to 0

        stdcall StrSetCapacity, [.hNew], ecx
        mov     [.pDest], eax
        mov     edi, eax                ; edi = write position

        stdcall StrPtr, [.hString]
        mov     esi, eax                ; esi = read position
        mov     ebx, [.SrcEncoding]

.next_char:
        movzx   eax, byte [esi]
        inc     esi
        test    eax, eax
        jz      .terminate              ; zero byte ends the source

        mov     edx, 1                  ; 7-bit characters are stored as one byte
        cmp     eax, $80
        jb      .emit

; Upper half: look up the unicode value, then encode it.
; EncodeUtf8 returns the bytes in eax and their count in edx.
        movzx   eax, word [ebx+2*eax-$100]
        stdcall EncodeUtf8, eax

.emit:
        mov     [edi], eax              ; always writes 4 bytes ...
        add     edi, edx                ; ... but advances only by the real length
        jmp     .next_char

.terminate:
        mov     dword [edi], 0          ; dword zero terminator
        mov     eax, [.pDest]
        sub     edi, eax                ; edi = number of bytes written
        mov     [eax+string.len], edi

.done:
        popad
        mov     eax, [.hNew]
        return
endp
; Utf8ToAnsi converts a string from UTF-8 to a single-byte ANSI encoding.
;
; arguments:
;   .hString     - handle of, or pointer to, the UTF-8 source string.
;   .DstEncoding - pointer to the conversion table of the target encoding.
;                  As it is used here, the table consists of:
;                    +0   : 128 words - unicode values of the ANSI codes $80..$ff;
;                    +256 : 128 bytes - the ANSI codes $80..$ff, which must be
;                           ordered so that their unicode values ascend
;                           (index for the binary search).
; returns:
;   eax - handle of the newly created string with the ANSI encoded text.
;
; notes:
;   - Code points below $80 are copied unchanged.
;   - Code points above $ffff, or not present in the table, become '?'.
;   - Conversion stops at the terminating zero or at the first sequence that
;     DecodeUtf8 can not decode at all (CF=1, edx=0). Overlong sequences
;     (CF=1, edx<>0) are accepted with the value DecodeUtf8 reports.
;   - All other registers are preserved (pushad/popad).
proc Utf8ToAnsi, .hString, .DstEncoding
.result dd ?
.char   dd ?                            ; code point being searched in the table
begin
        pushad

        mov     ebx, [.DstEncoding]

; Every decoded character consumes at least one source byte and produces
; exactly one output byte, so the source length is used as the capacity.
; NOTE(review): the 4-byte terminator store at .end_of_string assumes that
; StrSetCapacity leaves room past the requested capacity - confirm in StrLib.
        stdcall StrLen, [.hString]
        mov     ecx, eax
        stdcall StrNew
        mov     [.result], eax
        stdcall StrSetCapacity, eax, ecx
        mov     edi, eax                ; edi = write position

        stdcall StrPtr, [.hString]
        mov     esi, eax                ; esi = read position

.loop:
        stdcall DecodeUtf8, [esi]       ; passes the next 4 source bytes by value
        lea     esi, [esi+edx]          ; lea leaves CF from DecodeUtf8 intact
        jnc     .ok

        test    edx, edx
        jz      .end_of_string          ; invalid unicode char

.ok:
        test    eax, eax
        jz      .end_of_string          ; zero terminated

        cmp     eax, $ffff
        ja      .not_found              ; the table holds 16-bit values only

        cmp     eax, $80
        jb      .store                  ; 7-bit characters map to themselves

; Binary search of the sorted index: cl = low bound, ch = high bound,
; edx = middle. The bounds are 0..$7f, so cl+ch never overflows a byte.
        mov     [.char], eax
        xor     edx, edx
        xor     cl, cl                  ; start
        mov     ch, $7f                 ; end

.search_loop:
        cmp     cl, ch
        ja      .not_found

        mov     dl, cl
        add     dl, ch
        shr     edx, 1

        movzx   eax, byte [ebx+256+edx]         ; ANSI code at the middle
        movzx   eax, word [ebx+2*eax-$100]      ; unicode character
        cmp     [.char], eax
        je      .found
        ja      .bigger

; The searched value is smaller than the middle entry. When the middle is
; index 0 there is nothing below it; without this exit "dec ch" would wrap
; ch to $ff and the unsigned cl/ch test would restart the search forever.
        test    edx, edx
        jz      .not_found

        mov     ch, dl
        dec     ch
        jmp     .search_loop

.bigger:
        mov     cl, dl
        inc     cl                      ; at most $80, which ends the loop above
        jmp     .search_loop

.found:
        mov     al, [ebx+256+edx]

.store:
        stosb
        jmp     .loop

.not_found:
        mov     al, '?'
        stosb
        jmp     .loop

.end_of_string:
        mov     dword [edi], 0          ; dword zero terminator
        stdcall StrPtr, [.result]
        sub     edi, eax                ; edi = number of bytes written
        mov     [eax+string.len], edi

        popad
        mov     eax, [.result]
        return
endp
; EncodeUtf8 converts the unicode character [.unichar] to UTF-8 (at most 4 bytes).
;
; returns:
;   CF=0 - success
;      eax = the UTF-8 bytes; the first byte of the sequence is in al, so
;            storing eax to memory gives the bytes in the proper order.
;      edx = length of the character in bytes.
;   CF=1 - the value does not fit in 4 bytes
;      eax = 0
;      edx = 0
;
; ebx, ecx and edi are preserved.
proc EncodeUtf8, .unichar
begin
        push    ebx ecx edi

        mov     ebx, [.unichar]         ; ebx = bits not yet encoded
        mov     edi, 1                  ; edi = byte count so far
        cmp     ebx, $80
        jae     .multi_byte

        mov     eax, ebx                ; 7-bit value: the character is its own encoding
        jmp     .success

.multi_byte:
        xor     eax, eax
        mov     edx, $ffffffc0          ; mask of the bits that do not fit the lead byte

.next_tail:
        inc     edi
        cmp     edi, 4
        ja      .too_big

        sar     edx, 1                  ; every extra byte takes one payload bit from the lead

        mov     ecx, ebx
        and     ecx, $3f
        or      ecx, $80                ; ecx = 10xxxxxx tail byte from the low 6 bits

        shr     ebx, 6
        shl     eax, 8
        or      eax, ecx                ; tail bytes pile up above al in reverse order

        test    ebx, edx
        jnz     .next_tail              ; the rest still does not fit in the lead byte

; Lead byte: the mask supplies the leading ones, and the single bit that
; differs between edx and 2*edx is the zero which ends that prefix.
        lea     ecx, [2*edx]
        xor     ecx, edx
        or      ebx, edx
        xor     ebx, ecx

        shl     eax, 8
        mov     al, bl

.success:
        mov     edx, edi
        pop     edi ecx ebx
        clc
        return

.too_big:
        xor     eax, eax
        xor     edx, edx
        pop     edi ecx ebx
        stc
        return
endp
; DecodeUtf8 decodes the (up to) 4 bytes passed by value in [.chars] to one
; unicode code point. The first byte of the sequence is the lowest byte.
;
; returns:
;   CF=0 - no error
;      eax = unicode value.
;      edx = byte count of the character [1..4].
;   CF=1 - invalid UTF-8 character
;      eax = edx = 0 : the character can not be decoded at all.
;      edx <> 0      : overlong encoding; eax = the decoded value and
;                      edx = the byte count of the sequence.
;
; Note: an overlong sequence is reported with usable eax/edx values, but it
;       is still an invalid character according to the standard.
;
; ebx and ecx are preserved. The argument copy on the stack is modified.
proc DecodeUtf8, .chars
begin
        push    ebx ecx

; Count the leading 1 bits of the first byte by shifting them out one by one.
; "inc" leaves CF alone, and it runs before the shift anyway.
        mov     ecx, -1
.count_ones:
        inc     ecx
        shl     byte [.chars], 1
        jc      .count_ones

        test    ecx, ecx
        jz      .single                 ; 0xxxxxxx

        cmp     ecx, 1
        je      .error                  ; 10xxxxxx - a tail byte can not start a character
        cmp     ecx, 4
        ja      .error                  ; more than 4 bytes

; The first byte was shifted left ecx+1 times; shifting it back the same
; amount leaves only its payload bits.
        movzx   eax, byte [.chars]
        shr     eax, cl
        shr     eax, 1

        mov     edx, 1                  ; edx = index of the next byte to read

.next_tail:
        movzx   ebx, byte [.chars+edx]
        xor     bl, $80                 ; 10xxxxxx -> 00xxxxxx
        test    bl, $c0
        jnz     .error                  ; not a tail byte

        shl     eax, 6
        or      eax, ebx
        inc     edx
        cmp     edx, ecx
        jne     .next_tail

        and     eax, $1fffff
        cmp     eax, $10ffff
        ja      .error                  ; above the last unicode code point

        cmp     eax, [._minimal+4*edx-8]
        jb      .overlong               ; overlong coding.

        clc
        pop     ecx ebx
        return

.single:
        movzx   eax, byte [.chars]
        shr     eax, 1                  ; undo the single shift made while counting
        mov     edx, 1
        pop     ecx ebx
        clc
        return

.error:
        xor     eax, eax
        xor     edx, edx

.overlong:
        stc
        pop     ecx ebx
        return

; Smallest value that needs 2, 3 and 4 bytes respectively.
._minimal dd $80, $800, $10000
endp
; Encoding tables
;
; EncodingTable defines the label <name> in front of the binary contents of
; <filename>, included verbatim with the "file" directive. The whole body is
; guarded by "if used", so a table is assembled into the output only when its
; label is referenced somewhere in the program.
;
; arguments:
;   name     - label that marks the start of the table (declared as word data).
;   filename - path of the binary table file to include.
macro EncodingTable name, filename {
if used name
; keep the table on a dword boundary
align 4
label name word
file filename
end if
}
; Available encodings; each one is included only when its label is used.
EncodingTable WIN1251, '_encodings/cp1251.tbl'
EncodingTable CP866, '_encodings/cp866.tbl'
EncodingTable KOI8R, '_encodings/koi8-r.tbl'
EncodingTable KOI8U, '_encodings/koi8-u.tbl'