Fresh IDE . Artifact [e658e96a80]
Not logged in

This repository is a mirror!

The original is located on: https://fresh.flatassembler.net/fossil/repo/fresh
If you want to follow the project, please update your remote-url

Artifact e658e96a80a6be3e0c83cd04d8ca11daa9e15b04:


; _______________________________________________________________________________________
;|                                                                                       |
;| ..::FreshLib::..  Free, open source. Licensed under "Fresh artistic license."         |
;|_______________________________________________________________________________________|
;
;  Description: OS independent text encodings translation library.
;
;  Target OS: Any
;
;  Dependencies: StrLib.asm
;
;  Notes:
;
;_________________________________________________________________________________________



; AnsiToUtf8 builds a new UTF-8 string from a single-byte (ANSI) encoded string.
;
; arguments:
;   .SrcEncoding - pointer to the ansi2unicode table of the source code page.
;                  Only the first 128 word entries are read here: entry N holds
;                  the Unicode value of the byte $80+N.
;   .hString - handle or pointer to the source string.
; returns:
;   eax - handle of the newly created string with the UTF-8 encoded text.
;   All other registers are preserved.
;
; Bytes below $80 are copied unchanged; the source is read up to its first
; zero byte.


proc AnsiToUtf8, .SrcEncoding, .hString
.hNew    dd ?                           ; handle of the string being built
.pOutput dd ?                           ; pointer to the start of its text buffer
begin
        pushad

; Worst case is 4 output bytes per input byte; the extra 16 bytes leave room
; for the dword-sized stores made below.
        stdcall StrLen, [.hString]
        lea     ecx, [4*eax+16]

        stdcall StrNew
        mov     [.hNew], eax

        test    ecx, ecx
        jz      .exit

        stdcall StrSetCapacity, [.hNew], ecx
        mov     [.pOutput], eax
        mov     edi, eax                ; edi = write cursor

        stdcall StrPtr, [.hString]
        mov     esi, eax                ; esi = read cursor

        mov     ebx, [.SrcEncoding]

.next_char:
        movzx   eax, byte [esi]
        inc     esi
        test    eax, eax
        jz      .terminate

        cmp     eax, $80
        jae     .translate

        mov     edx, 1                  ; ASCII: one byte, stored as is

.emit:
; eax = encoded bytes (first byte in AL), edx = how many of them are valid.
; A whole dword is written; the surplus bytes get overwritten by the next store.
        mov     [edi], eax
        add     edi, edx
        jmp     .next_char

.translate:
        movzx   eax, word [ebx+2*eax-$100]      ; table entry for byte eax ($80..$FF)
        stdcall EncodeUtf8, eax                 ; eax = utf-8 bytes, edx = length
        jmp     .emit

.terminate:
        mov     dword [edi], 0

        mov     eax, [.pOutput]
        sub     edi, eax                        ; edi = number of bytes written
        mov     [eax+string.len], edi

.exit:
        popad
        mov     eax, [.hNew]
        return
endp


; Utf8ToAnsi converts string from UTF-8 to some ANSI (single-byte) encoding.
;
; arguments:
;   .hString - handle or pointer to the UTF-8 source string.
;   .DstEncoding - pointer to the conversion table of the target code page.
;                  As read by this procedure the table consists of:
;                    offset 0   : 128 words - Unicode value of the bytes $80..$FF;
;                    offset 256 : 128 bytes - ANSI codes ($80..$FF) ordered so that
;                                 their Unicode values ascend (binary search index).
; returns:
;   eax - handle of the newly created string with the ANSI encoded text.
;   All other registers are preserved.
;
; Notes:
;   - Code points below $80 are copied unchanged.
;   - Code points above $FFFF, or not present in the table, are written as '?'.
;   - Overlong encoded characters are accepted and converted as their value.
;   - Conversion stops at the first zero character or at the first sequence
;     DecodeUtf8 can not decode at all (the rest of the source is dropped).


proc Utf8ToAnsi, .hString, .DstEncoding
.result dd ?
.char   dd ?                            ; code point currently searched in the table
begin
        pushad
        mov     ebx, [.DstEncoding]

; The output never has more bytes than the input (every stored byte consumes
; at least one source byte). The terminator below is stored as a whole dword,
; so some slack is requested as well - the same margin AnsiToUtf8 uses.
        stdcall StrLen, [.hString]
        lea     ecx, [eax+16]

        stdcall StrNew
        mov     [.result], eax

        stdcall StrSetCapacity, eax, ecx
        mov     edi, eax                ; edi = write cursor

        stdcall StrPtr, [.hString]
        mov     esi, eax                ; esi = read cursor

.loop:
        stdcall DecodeUtf8, [esi]       ; eax = code point, edx = byte length, CF = error
        lea     esi, [esi+edx]          ; lea leaves CF from DecodeUtf8 intact
        jnc     .ok

        test    edx, edx
        jz      .end_of_string          ; invalid unicode char

; CF=1 with edx<>0 is an overlong encoding: eax/edx are still usable.

.ok:
        test    eax, eax
        jz      .end_of_string          ; zero terminated

        cmp     eax, $ffff
        ja      .not_found              ; table entries are 16 bit wide

        cmp     eax, $80
        jb      .store                  ; ASCII maps to itself

; binary search in the sorted index; cl = lowest, ch = highest candidate.

        mov     [.char], eax
        xor     edx, edx
        xor     cl, cl       ; start
        mov     ch, $7f      ; end

.search_loop:
        cmp     cl, ch
        ja      .not_found

        mov     dl, cl
        add     dl, ch                  ; at most $80+$7f - fits in 8 bits
        shr     edx, 1                  ; edx = middle index

        movzx   eax, byte [ebx+256+edx]         ; ANSI code at the middle of the index
        movzx   eax, word [ebx+2*eax-$100]      ; unicode character

        cmp     [.char], eax
        je      .found
        ja      .bigger

; Continue in the lower half. When the middle index is 0 there is nothing
; below it: without this check ch would wrap to $FF and the search would
; restart on the whole range forever.
        mov     ch, dl
        sub     ch, 1
        jnc     .search_loop
        jmp     .not_found

.bigger:
        mov     cl, dl
        inc     cl
        jmp     .search_loop

.found:
        mov     al, [ebx+256+edx]

.store:
        stosb
        jmp     .loop

.not_found:
        mov     al, '?'
        stosb
        jmp     .loop

.end_of_string:
        mov     dword [edi], 0
        stdcall StrPtr, [.result]
        sub     edi, eax                ; edi = number of bytes written
        mov     [eax+string.len], edi

        popad
        mov     eax, [.result]
        return
endp





; Converts the unicode character [.unichar] to its UTF-8 encoding (at most 4 bytes).
;
; returns: CF=0
;   EAX = the UTF-8 bytes, packed so that the first byte of the sequence is in AL
;         (storing EAX to memory as a dword gives the bytes in the right order).
;   EDX = length of the character in bytes [1..4].
; CF=1 - the value needs more than 4 bytes (it is >= $200000).
;   eax = 0
;   edx = 0
;
; EBX, ECX and EDI are preserved.

proc EncodeUtf8, .unichar
begin
        push    ebx ecx edi

        mov     ebx, [.unichar]

; Classify by magnitude: edi = total length, edx = marker bits of the lead byte.
        mov     eax, ebx
        mov     edi, 1
        cmp     ebx, $80
        jb      .done                   ; 7 bits: the value is its own encoding

        mov     edi, 2
        mov     edx, $c0
        cmp     ebx, $800
        jb      .pack                   ; up to 11 bits

        mov     edi, 3
        mov     edx, $e0
        cmp     ebx, $10000
        jb      .pack                   ; up to 16 bits

        mov     edi, 4
        mov     edx, $f0
        cmp     ebx, $200000
        jae     .too_big                ; more than 21 bits

.pack:
        xor     eax, eax
        lea     ecx, [edi-1]            ; number of continuation bytes

; Take 6 bits at a time from the low end; the last byte of the sequence is
; produced first and therefore ends in the highest used byte of eax.
.tail:
        shl     eax, 8
        mov     al, bl
        and     al, $3f
        or      al, $80                 ; 10xxxxxx
        shr     ebx, 6
        dec     ecx
        jnz     .tail

        shl     eax, 8
        mov     al, bl                  ; what is left fits the lead byte payload
        or      al, dl

.done:
        mov     edx, edi
        pop     edi ecx ebx
        clc
        return

.too_big:
        xor     eax, eax
        xor     edx, edx
        pop     edi ecx ebx
        stc
        return
endp




; Decodes the UTF-8 sequence held in the 4 bytes of [.chars] (first byte of the
; sequence in the lowest byte) to its UNICODE dword value.
; returns:
;   CF=0 - no error
;     eax - unicode value.
;     edx - byte count of the char. [1..4]
;   CF=1 - invalid utf-8 char;
;     eax = edx = 0  the character can not be decoded: the first byte is a
;                    continuation byte or announces more than 4 bytes, one of
;                    the following bytes is not a continuation byte, or the
;                    value is above $10FFFF.
;     edx <> 0 -> eax = the overlong encoded character. edx = byte count of the char.
;
;  Note: When CF=1 and [.chars] is an overlong encoded char, eax contains the
;        proper value and edx the proper length, but it is still an invalid
;        character according to the standards.
;
;  EBX and ECX are preserved.
proc DecodeUtf8, .chars
begin
        push    ebx ecx

        mov     ebx, [.chars]           ; bl = lead byte, the next bytes are above it
        movzx   eax, bl
        mov     edx, 1
        cmp     bl, $80
        jb      .valid                  ; 0xxxxxxx - ASCII, eax is already the value

        cmp     bl, $c0
        jb      .invalid                ; 10xxxxxx - continuation byte can not lead
        cmp     bl, $f8
        jae     .invalid                ; 11111xxx - more than 4 bytes

; Derive the length from the lead byte and keep only its payload bits in eax.
        inc     edx                     ; 110xxxxx - 2 bytes
        and     al, $1f
        cmp     bl, $e0
        jb      .tail

        inc     edx                     ; 1110xxxx - 3 bytes
        and     al, $0f
        cmp     bl, $f0
        jb      .tail

        inc     edx                     ; 11110xxx - 4 bytes
        and     al, $07

.tail:
        lea     ecx, [edx-1]            ; continuation bytes still to read

.next_byte:
        shr     ebx, 8                  ; bl = next byte of the sequence
        xor     bl, $80                 ; 10xxxxxx becomes 00xxxxxx ...
        cmp     bl, $40
        jae     .invalid                ; ... anything else is not a continuation
        shl     eax, 6
        or      al, bl
        dec     ecx
        jnz     .next_byte

        cmp     eax, $10ffff
        ja      .invalid

        cmp     eax, [._minimal+4*edx-8]        ; smallest value that needs edx bytes
        jb      .overlong                       ; overlong coding.

.valid:
        clc
        pop     ecx ebx
        return

.invalid:
        xor     eax, eax
        xor     edx, edx
.overlong:
        stc
        pop     ecx ebx
        return

; minimal values for 2, 3 and 4 byte sequences.
._minimal dd $80, $800, $10000

endp




; Encoding tables


; EncodingTable name, filename
;
; Defines the label `name` (word sized, aligned on 4 bytes) and places the
; binary content of the file `filename` at it.
; Nothing is emitted when `name` is not referenced anywhere in the program.
macro EncodingTable name, filename {
  if used name
    align 4
    label name word
    file  filename
  end if
}


; Available code page tables. Each one is included in the output only when
; its label is used by the program (see the EncodingTable macro).
EncodingTable WIN1251, '_encodings/cp1251.tbl'
EncodingTable CP866,   '_encodings/cp866.tbl'
EncodingTable KOI8R,   '_encodings/koi8-r.tbl'
EncodingTable KOI8U,   '_encodings/koi8-u.tbl'