Artifact [0014d041e5]
Not logged in

Artifact 0014d041e52ddfb52c921643e5ccc8c8993e4958:


module "markdown parser"

; txt - structure macro defining a zero terminated text constant.
;
; Usage:   Label txt 'text', byte, byte, ...
;
; Defines:
;   Label         - address of the first byte of the text;
;   Label.length  - number of bytes of the text, the terminating zero byte
;                   is NOT counted (the zero is emitted after the length
;                   is computed).
struc txt [string] {
common
  .   db string
.length = $ - .
      db 0
}

; HTML document prologue and epilogue texts. They are not referenced in this
; part of the file - presumably the caller writes them around the parser output.
HTMLHeader txt '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>', $0d, $0a
HTMLFooter txt $0d, $0a, '</body></html>', $0d, $0a

; Granularity of the output buffer. The passes flush the buffer at the start
; of a line once 4*SizeWriteBuffer bytes are collected (see HTML.middle below).
SizeWriteBuffer = 8192

uglobal
;  pSource         dd  ?
  SizeSource      dd  ?         ; size in bytes of the input text (set by the ReadTheInput* procedures)

  tHTML           dd  ?         ; write pointer inside HTML: stored at the end of FirstPass, loaded by SecondPass

; Open addressing hash table of the link definitions. The index is the FNV hash
; of the label folded to 16 bits; every slot holds a pointer to the '[' of a
; definition line in the source text, or 0 when the slot is free.
; NOTE(review): the code relies on the table being zero filled at start - confirm
; that the uglobal data is zero initialized.
  HashTable       rd  65536

; Output buffer. The passes check the write pointer only at the start of every
; source line and flush when it is at or above .middle, so the SizeWriteBuffer
; bytes after .middle are the only reserve for the output of a single line.
  HTML            rb  4*SizeWriteBuffer
  .middle         rb  SizeWriteBuffer

endg


; Size of the first allocation; also the minimal free room that must remain
; in the buffer in order to continue reading without growing it.
START_SIZE = 1024*1024

; ReadTheInputSlow
;
; Reads [STDIN] chunk by chunk into the memory starting at [_MemoryFreeSpace],
; growing the allocation by a factor of 1.5 (rounded up to 4K) every time less
; than START_SIZE bytes of free room remain. The reading stops on a read error
; or when FileRead returns 0 bytes.
;
; On return:
;   [SizeSource] = number of bytes read;
;   the text is terminated by a zero dword;
;   edi = start of the text, esi = its length, eax = 0.
proc ReadTheInputSlow
begin
        xor     esi, esi                        ; esi = bytes read so far
        mov     edi, [_MemoryFreeSpace]         ; edi = start of the text buffer
        mov     [SizeSource], START_SIZE        ; here: the currently requested buffer size

.grow_buffer:
        stdcall  SpaceAllocate, [SizeSource]

.read_chunk:
        lea     eax, [edi+esi]                  ; where the next chunk goes
        mov     ecx, [SizeSource]
        sub     ecx, esi                        ; the free room left in the buffer
        stdcall FileRead, [STDIN], eax, ecx
        jc      .input_done                     ; a read error ends the input as well
        test    eax, eax
        jz      .input_done                     ; nothing more to read

        add     esi, eax

if options.BenchmarkRead = 1
        push     eax
        stdcall  PrintMsg, cRead, eax, cCRLF
        pop      eax
end if

; keep reading into the same buffer while at least START_SIZE bytes are free.
        mov     eax, [SizeSource]
        sub     eax, esi

        cmp     eax, START_SIZE
        jae     .read_chunk

; new size = 1.5 * old size, rounded up to a multiple of 4096
        mov     eax, [SizeSource]
        lea     eax, [eax+2*eax]
        shr     eax, 1

        add     eax, $fff
        and     eax, not $fff
        mov     [SizeSource], eax

if options.BenchmarkMemory = 1
        pushad
        mov      eax, [SizeSource]
        shr      eax, 10
        stdcall  PrintMsg, cReallocated, eax, cKBytes
        stdcall  Print, cCRLF
        popad
end if
        jmp     .grow_buffer

.input_done:
        mov     [SizeSource], esi               ; from now on: the real size of the text
        xor     eax, eax
        mov     [edi+esi], eax                  ; terminate the text with a zero dword

        return
endp


; ReadTheInput
;
; Reads the whole [STDIN] with a single call, using FileSize to determine the
; needed memory. The text is placed at [_MemoryFreeSpace] and is terminated
; by a zero dword.
;
; Returns:
;   CF = 0 : success; [SizeSource] = ebx = count of the bytes actually read,
;            edi = start of the text, eax = 0.
;   CF = 1 : FileSize or FileRead failed; [SizeSource] is not changed.
;
; Notes:
;   The count returned by FileRead is used as the text size, so a short read
;   does not leave [SizeSource] bigger than the data that is really in memory.
;   The start of the text is not stored separately: both passes read it from
;   [_MemoryFreeSpace], and the pSource variable is commented out in the
;   uglobal block of this module.
proc ReadTheInput
begin
        stdcall FileSize, [STDIN]
        jc      .finish

        mov     ebx, eax
        add     eax, 256+$fff    ; some reserve, rounded up to 4K
        and     eax, $fffff000

        stdcall SpaceAllocate, eax
        stdcall FileRead, [STDIN], [_MemoryFreeSpace], ebx
        jc      .finish          ; report the read error to the caller

        mov     ebx, eax         ; the bytes actually read
        mov     [SizeSource], ebx

        mov     edi, [_MemoryFreeSpace]
        xor     eax, eax
        mov     [edi+ebx], eax   ; zero terminator; fits in the 256 bytes reserve

        clc
.finish:
        return
endp



; FirstPass
;
; Scans the zero terminated source text at [_MemoryFreeSpace] line by line:
;
;   * For every line starting with '#' appends to the HTML buffer a table of
;     contents item:
;         <a id="_xxxx" href="#xxxx">header text</a><br>
;     where xxxx is the sequential number of the header, written as 4 letters
;     'a'..'z' (base 26, the least significant digit first).
;     A header line without text (only '#', spaces and tabs) gets no item, but
;     still consumes a number, because SecondPass gives an id to every line
;     that starts with '#'. This way the ids of the two passes stay in sync.
;
;   * For every line starting with '[' (a link definition "[label] address")
;     stores the pointer to the '[' in HashTable. The slot index is the FNV
;     hash of the label, '[' and ']' included, spaces and tabs ignored, folded
;     to 16 bits. On a collision the new pointer takes the slot and the old one
;     is moved to the next slot (linear probing); if the old one has the same
;     label it is dropped, so the later definition wins.
;
; At the end of the text writes '<hr>', CR, LF and stores the write pointer
; in [tHTML] for the SecondPass. The buffer is flushed to [STDOUT] at the start
; of a line when the write pointer is at or above HTML.middle.
;
; Uses eax, ebx, ecx, edx, esi, edi without saving them here.
proc FirstPass
  .header_id dd ?       ; sequential number of the next header
  .idString  dd ?       ; .header_id as 4 letters
begin
        mov     edi, HTML                       ; edi = output write pointer
        mov     esi, [_MemoryFreeSpace]         ; esi = source read pointer
        mov     [.header_id], 0

.line_start:
        cmp     edi, HTML.middle
        jb      @f

        sub     edi, HTML
        stdcall FileWrite, [STDOUT], HTML, edi
        mov     edi, HTML

@@:
        cmp     byte [esi], '#'
        je      .header

        cmp     byte [esi], '['
        je      .link

.skip_to_eol:
        mov     al, [esi]
        lea     esi, [esi+1]
        cmp     al, $0d
        je      .line_start
        cmp     al, $0a
        je      .line_start
        test    al, al
        jnz     .skip_to_eol

.end_of_file:
        mov     dword [edi], '<hr>'
        mov     word [edi+4], $0a0d
        lea     edi, [edi+6]

        mov     [tHTML], edi
        return

; create table of contents item.
.header:
; skip the leading '#', spaces and tabs
        mov     al, [esi]
        lea     esi, [esi+1]
        cmp     al, '#'
        je      .header
        cmp     al, ' '
        je      .header
        cmp     al, $09
        je      .header
        cmp     al, $0d
        je      .empty_header
        cmp     al, $0a
        je      .empty_header

; create the id
        mov     eax, [.header_id]
        mov     ecx, 26

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString], dl

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString+1], dl

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString+2], dl

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString+3], dl

        inc     [.header_id]

        mov     dword [edi], '<a i'
        lea     edi, [edi+4]
        mov     dword [edi], 'd="_'
        lea     edi, [edi+4]

        pushd   [.idString]
        popd    [edi]
        lea     edi, [edi+4]

        mov     dword [edi], '" hr'
        mov     dword [edi+4], 'ef="'
        mov     byte  [edi+8], '#'
        push    [.idString]
        pop     dword [edi+9]
        mov     word [edi+13], '">'
        lea     edi, [edi+15]

; copy the header text
       dec      esi             ; back to the first character of the text
.copy_loop:
        mov     al, [esi]
        lea     esi, [esi+1]
        cmp     al, $0d
        je      .end_header
        cmp     al, $0a
        je      .end_header
        test    al, al
        jz      .end_header

        mov     [edi], al
        lea     edi, [edi+1]
        jmp     .copy_loop

.end_header:
        dec     esi             ; leave the line terminator to be processed at .line_start
        mov     dword [edi],   '</a>'
        mov     dword [edi+4], '<br>'
        lea     edi, [edi+8]
        jmp     .line_start

; header line without text: no item, but the id is consumed (see the header comment).
.empty_header:
        inc     [.header_id]
        jmp     .line_start

;link processing
.link:
        mov     edx, $811C9DC5   ; 2166136261 ; FNV offset basis
        mov     ecx, esi         ; ecx = pointer to the '[' - the value stored in the table

.label_loop:
        mov     al, [esi]
        lea     esi, [esi+1]

        cmp     al, $20
        je      .label_loop
        cmp     al, $09
        je      .label_loop

; the line ended before the closing ']' - it is not a definition.
        cmp     al, $0d
        je      .line_start
        cmp     al, $0a
        je      .line_start

        test    al, al
        jz      .end_of_file

        xor     dl, al
        imul    edx, $01000193                  ;   16777619              ; FNV prime
        cmp     al, ']'
        jne     .label_loop

; fold the hash to 16 bit value...
        mov     ebx, edx
        shr     ebx, 16
        xor     ebx, edx
        and     ebx, $ffff
        mov     edx, ebx         ; edx = the first probed slot, used to detect a full table

if options.Benchmark
        inc     [LinkCount]
end if

; search free slot
.search_slot:
        xchg    ecx, [HashTable+4*ebx]
        test    ecx, ecx
        jz      .skip_to_eol     ; the slot was free - done

; conflict...
if options.Benchmark
        inc     [HashConflicts]
end if

; compare the labels on ecx (the old content of the slot) with [HashTable+4*ebx] (the new pointer)
        push    edi ecx

        mov     edi, [HashTable+4*ebx]

.first:
        mov     al, [ecx]
        lea     ecx, [ecx+1]

        cmp     al, $20
        je      .first
        cmp     al, $09
        je      .first

.second:
        mov     ah, [edi]
        lea     edi, [edi+1]
        cmp     ah, $20
        je      .second
        cmp     ah, $09
        je      .second

        cmp     al, ah
        jne     .add_not_equal

        cmp     ah, ']'
        jne     .first

; the same label - the new definition stays in the slot, the old one is dropped.
        pop     ecx edi
        jmp     .skip_to_eol

; save it to the next slot
.add_not_equal:
        pop     ecx edi

        inc     ebx
        and     ebx, $ffff
        cmp     ebx, edx
        jne     .search_slot

        int3    ; full hash table -> to be implemented error processing

endp


; Bit flags kept in the .state variable of SecondPass.
fstateHeader    = $01   ; the current line is a header; its closing tag is emitted at the end of the line
fstatePara      = $02   ; set when '<p>' is emitted, cleared on the blank line that ends the paragraph
fstateWhite     = $04   ; the previous character was a space or tab, or a line just ended
fstateBold      = $08   ; <b> is open ('*')
fstateLink      = $10   ; <a href=...> is open, the next ']' closes it
fstateUnderline = $20   ; <u> is open ('_')
fstateStrikeout = $40   ; <s> is open ('-')
fstateItalic    = $80   ; <i> is open ('/')


; SecondPass
;
; Translates the zero terminated source text at [_MemoryFreeSpace] to HTML.
; The output continues in the HTML buffer from [tHTML] (after the table of
; contents created by FirstPass); the buffer is flushed to [STDOUT] at the
; start of a line when the write pointer is at or above HTML.middle, and once
; more at the end of the text.
;
; Processing:
;   * Lines starting with '[' (link definitions) are skipped.
;   * Lines starting with '#' become <hN id="xxxx"> ... <a href="#_xxxx">arrow</a></hN>
;     where N = count of the leading '#' (max 6) and xxxx is the same base 26
;     id that FirstPass generates.
;   * A line following a blank line opens ' <p>'; the next blank line replaces
;     the CR, LF of the previous line with '</p>'.
;   * '*', '_', '-', '/' toggle <b>, <u>, <s>, <i>. The opening tag is accepted
;     only after a white space and when the next character is not a space, tab,
;     CR or LF. The closing tag is emitted on the next occurrence of the
;     character while the tag is open.
;   * '[' after a white space starts a link when the label up to ']' is found
;     in HashTable: <a href="address"> is emitted and the following ']' emits
;     </a>. Otherwise '[' is copied as a normal character.
;
; Uses eax, ebx, ecx, edx, esi, edi without saving them here.
proc SecondPass
  .non_space   dd ?     ; count of the non white space characters in the current line
  .state       dd ?     ; fstateXXXX flags
  .last_header dd ?     ; level of the current header, shifted left by 16

  .header_id   dd ?     ; sequential number of the next header
  .idString    dd ?     ; id of the current header as 4 letters
  .para_pos    dd ?     ; where ' <p>' was written on the current line; 0 if it was not
begin
        mov     esi, [_MemoryFreeSpace]

        xor     eax, eax
        mov     [.state], eax
        mov     [.non_space], eax
        mov     [.header_id], eax

        mov     edi, [tHTML]

.line_start:
        cmp     edi, HTML.middle
        jb      @f

        sub     edi, HTML
        stdcall FileWrite, [STDOUT], HTML, edi
        mov     edi, HTML

@@:
        xor     ecx, ecx
        mov     [.para_pos], ecx
        xchg    [.non_space], ecx       ; ecx = the counter of the previous line; the counter is reset.

        cmp     byte [esi], '['
        je      .skip

        cmp     byte [esi], '#'
        je      .header

        test    ecx, ecx
        jnz     .scan_line      ; the previous line is not WS

        mov     [.para_pos], edi        ; remembered in order to undo the tag if this line is blank as well
        mov     dword [edi], ' <p>'
        lea     edi, [edi+4]

        or      [.state], fstatePara

.scan_line:
        mov     al, [esi]
        lea     esi, [esi+1]

        cmp     al, ' '
        je      .white_space
        cmp     al, $09         ; tab
        je      .white_space

        cmp     al, $0d
        je      .end_of_line
        cmp     al, $0a
        je      .end_of_line

        test    al, al
        jz      .end_of_line

; it is not white space character...
        inc     [.non_space]

        mov     ecx, fstateBold
        mov     ebx, '<b>'
        cmp     al, '*'
        je      .ProcessTag

        mov     ecx, fstateUnderline
        mov     ebx, '<u>'
        cmp     al, '_'
        je      .ProcessTag

        mov     ecx, fstateStrikeout
        mov     ebx, '<s>'
        cmp     al, '-'
        je      .ProcessTag

        mov     ecx, fstateItalic
        mov     ebx, '<i>'
        cmp     al, '/'
        je      .ProcessTag

        cmp     al, '['
        je      .is_link_begin

        cmp     al, ']'
        je      .is_link_end

.normal_char:
        and     [.state], not fstateWhite

.markup_ok:
        mov     [edi], al
        lea     edi, [edi+1]
        jmp     .scan_line


; ecx = state mask
; ebx = html tag
.ProcessTag:
        test    [.state], ecx
        jz      .start_tag

; close tag
        shl     ebx, 8          ; '<x>' -> '</x>'
        mov     bx, '</'

        mov     [edi], ebx
        lea     edi, [edi+4]

        not     ecx
        and     [.state], ecx
        jmp     .scan_line

.start_tag:
        test    [.state], fstateWhite
        jz      .normal_char
        cmp     byte [esi], $20
        je      .normal_char
        cmp     byte [esi], $09
        je      .normal_char
        cmp     byte [esi], $0d
        je      .normal_char
        cmp     byte [esi], $0a
        je      .normal_char

; open tag
        mov     [edi], ebx
        lea     edi, [edi+3]    ; the tag is 3 bytes long; the 4-th stored byte is 0 and must not be part of the output.
        or      [.state], ecx
        jmp     .scan_line


.white_space:
        or      [.state], fstateWhite
        mov     [edi], al
        lea     edi, [edi+1]
        jmp     .scan_line

; end of line and end of file processing
; al = 0 on the end of the text; it must stay this way up to the test at .crlf
.end_of_line:
        test    al, al
        jz      @f

; skip the second character of the pairs CR, LF and LF, CR
        xor     al, $0d xor $0a
        cmp     [esi], al
        jne     @f
        inc     esi
@@:
        or      [.state], fstateWhite

        cmp     [.non_space], 0
        jne     .para_ok

        test    [.state], fstatePara
        jz      .para_ok

; blank line inside an open paragraph.
        mov     ecx, [.para_pos]
        test    ecx, ecx
        jz      .end_para

; the paragraph was opened on this very line: remove the '<p>' and everything
; after it, leaving only the space that precedes the tag.
        lea     edi, [ecx+1]
        jmp     .clear_para

; the paragraph was opened on some previous line: close it, replacing the
; CR, LF of the previous line. The replacement is possible only when these two
; bytes are still in the buffer - not after a flush and not after white spaces.
.end_para:
        cmp     edi, HTML+2
        jb      .close_para
        cmp     word [edi-2], $0a0d
        jne     .close_para
        sub     edi, 2

.close_para:
        mov     dword [edi], '</p>'
        lea     edi, [edi+4]

.clear_para:
        and     [.state], not fstatePara

.para_ok:
        test    [.state], fstateHeader
        jz      .crlf

; insert back link
        mov     dword [edi], ' <a '
        lea     edi, [edi+4]

        mov     dword [edi], 'href'
        lea     edi, [edi+4]

        mov     dword [edi], '="#_'
        lea     edi, [edi+4]

        push    [.idString]
        pop     dword [edi]
        lea     edi, [edi+4]

        mov     dword [edi], $86e23e22 ; ">arrow
        lea     edi, [edi+4]

        mov     byte [edi], $a9        ; the end of the arrow char (UTF-8: $e2, $86, $a9)
        lea     edi, [edi+1]

        mov     dword [edi], '</a>'
        lea     edi, [edi+4]

        mov     byte [edi], '<'
        lea     edi, [edi+1]

        mov     ecx, [.last_header]
        add     ecx, '/h0>'            ; the level is added to the '0' character
        mov     [edi], ecx
        lea     edi, [edi+4]

        and     [.state], not fstateHeader

.crlf:
        mov     word [edi], $0a0d
        lea     edi, [edi+2]

        test    al, al
        jnz     .line_start

.finish:
        sub     edi, HTML
        stdcall FileWrite, [STDOUT], HTML, edi
        mov     edi, HTML
        return




.is_link_begin:
        test    [.state], fstateWhite
        jz      .normal_char

        dec     esi             ; back to the '['

;.SearchLink:
        push    edi
        push    esi

; first compute the hash for the link label
        mov     edx, $811C9DC5                  ; 2166136261              ; FNV offset basis

.hashloop:
        mov     al, [esi]
        lea     esi, [esi+1]

        cmp     al, $20
        je      .hashloop
        cmp     al, $09
        je      .hashloop

        cmp     al, $0d
        je      .not_a_link
        cmp     al, $0a
        je      .not_a_link
        test    al, al
        je      .not_a_link

        xor     dl, al
        imul    edx, $01000193                  ;   16777619              ; FNV prime
        cmp     al, ']'
        jne     .hashloop

; fold the hash to 16 bit value...
        mov     ebx, edx
        shr     ebx, 16
        xor     ebx, edx
;        and     ebx, $ffff
        dec     ebx             ; compensates the increment at the start of the loop below

; probe the slots one after another until the label or an empty slot is found.
.hash_loop:
        lea     ebx, [ebx+1]
        and     ebx, $ffff

        mov     edi, [HashTable+4*ebx]
        mov     esi, [esp]      ; the pointer to '[' in the text

        test    edi, edi
        jnz     .cmp_labels

.not_a_link:
        pop     esi edi
        mov     al, [esi]
        lea     esi, [esi+1]
        jmp     .normal_char    ; the link address was not found, so it is not link at all.

.cmp_labels:
        mov     al, [esi]
        lea     esi, [esi+1]

        cmp     al, ' '
        je      .cmp_labels
        cmp     al, $09
        je      .cmp_labels

.target:
        mov     ah, [edi]
        lea     edi, [edi+1]

        cmp     ah, ' '
        je      .target
        cmp     ah, $09
        je      .target

        cmp     al, ah
        jne     .hash_loop      ; it is not the same label - try again.

        cmp     ah, ']'
        jne     .cmp_labels

; the labels are equal, so search the address.
.firstnw:
        mov     al, [edi]
        lea     edi, [edi+1]
        cmp     al, ' '
        je      .firstnw
        cmp     al, $09
        je      .firstnw

        dec     edi
        mov     edx, edi        ; edx = the address text in the definition line
        pop     esi edi

; yes the link was found.
        or      [.state], fstateLink
        mov     dword [edi], '<a h'
        lea     edi, [edi+4]

        mov     dword [edi], 'ref='
        lea     edi, [edi+4]

        mov     byte [edi], '"'
        lea     edi, [edi+1]

        push    esi
        mov     esi, edx

; copy the address up to the end of the definition line
.copy_link:
        mov     al, [esi]
        lea     esi, [esi+1]

        cmp     al, $0d
        je      .end_link
        cmp     al, $0a
        je      .end_link
        test    al, al
        jz      .end_link

        mov     [edi], al
        lea     edi, [edi+1]
        jmp     .copy_link

.end_link:
        pop     esi

        mov     word [edi], '">'
        lea     edi, [edi+2]

        and     [.state], not fstateWhite
        inc     esi             ; skip the '['; the label is then copied as the link text
        jmp     .scan_line

;....................................................................

.is_link_end:
        test    [.state], fstateLink
        jz      .normal_char

        mov     dword [edi], '</a>'
        lea     edi, [edi+4]

        and     [.state], not fstateLink
        jmp     .scan_line

;....................................................................

; processing header items and building of the table of contents.

.header:
        xor     ecx, ecx
        inc     esi

; ecx = count of the leading '#' characters
.scan_level:
        inc     ecx

        mov     al, [esi]
        lea     esi, [esi+1]

        cmp     al, '#'
        je      .scan_level
        dec     esi

        cmp     ecx, 6
        jbe     @f
        mov     ecx, 6
@@:
        shl     ecx, 16         ; to the position of the '0' character in '<h0 ' and '/h0>'
        mov     [.last_header], ecx

        add     ecx, '<h0 '
        mov     [edi], ecx
        mov     dword [edi+4], 'id="'
        lea     edi, [edi+8]

; create ID
        mov     eax, [.header_id]
        mov     ecx, 26

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString], dl

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString+1], dl

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString+2], dl

        xor     edx, edx
        div     ecx
        add     dl, 'a'
        mov     byte [.idString+3], dl

        inc     [.header_id]

; insert the id here.
        pushd   [.idString]
        popd    [edi]
        mov     word [edi+4], '">'
        lea     edi, [edi+6]

.end_header:
        or      [.state], fstateHeader
        jmp     .scan_line

; skip the whole line (link definition)
.skip:
        mov     al, [esi]
        lea     esi, [esi+1]

        test    al, al
        jz      .finish
        cmp     al, $0d
        je      .skip_eol
        cmp     al, $0a
        jne     .skip

.skip_eol:
        xor     al, $0d xor $0a
        cmp     [esi], al
        jne     .line_start
        inc     esi
        jmp     .line_start

endp


; GetAddressSpace
;
; Reserves address space (MEM_RESERVE, nothing is committed here) with the
; protection PAGE_READWRITE. The system chooses the address (the first
; argument of VirtualAlloc is 0).
;
; Arguments:
;   eax - size of the region in bytes.
; Returns:
;   whatever VirtualAlloc returns; the result is not checked here.
proc GetAddressSpace
begin
        invoke  VirtualAlloc, 0, eax, MEM_RESERVE, PAGE_READWRITE
        return
endp

; CommitMemory
;
; NOTE(review): this procedure is unfinished - the argument list of the
; VirtualAlloc call is empty after the comma and there is no `return` before
; `endp`. It must be completed (or removed) before it is used; the intended
; arguments can not be determined from this file.
proc CommitMemory
begin
        invoke  VirtualAlloc,
endp


endmodule