Login
parser.cr at tip
Login

File src/remilib/rsconf/parser.cr from the latest check-in


     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
   100
   101
   102
   103
   104
   105
   106
   107
   108
   109
   110
   111
   112
   113
   114
   115
   116
   117
   118
   119
   120
   121
   122
   123
   124
   125
   126
   127
   128
   129
   130
   131
   132
   133
   134
   135
   136
   137
   138
   139
   140
   141
   142
   143
   144
   145
   146
   147
   148
   149
   150
   151
   152
   153
   154
   155
   156
   157
   158
   159
   160
   161
   162
   163
   164
   165
   166
   167
   168
   169
   170
   171
   172
   173
   174
   175
   176
   177
   178
   179
   180
   181
   182
   183
   184
   185
   186
   187
   188
   189
   190
   191
   192
   193
   194
   195
   196
   197
   198
   199
   200
   201
   202
   203
   204
   205
   206
   207
   208
   209
   210
   211
   212
   213
   214
   215
   216
   217
   218
   219
   220
   221
   222
   223
   224
   225
   226
   227
   228
   229
   230
   231
   232
   233
   234
   235
   236
   237
   238
   239
   240
   241
   242
   243
   244
   245
   246
   247
   248
   249
   250
   251
   252
   253
   254
   255
   256
   257
   258
   259
   260
   261
   262
   263
   264
   265
   266
   267
   268
   269
   270
   271
   272
   273
   274
   275
   276
   277
   278
   279
   280
   281
   282
   283
   284
   285
   286
   287
   288
   289
   290
   291
   292
   293
   294
   295
   296
   297
   298
   299
   300
   301
   302
   303
   304
   305
   306
   307
   308
   309
   310
   311
   312
   313
   314
   315
   316
   317
   318
   319
   320
   321
   322
   323
   324
   325
   326
   327
   328
   329
   330
   331
   332
   333
   334
   335
   336
   337
   338
   339
   340
   341
   342
   343
   344
   345
   346
   347
   348
   349
   350
   351
   352
   353
   354
   355
   356
   357
   358
   359
   360
   361
   362
   363
   364
   365
   366
   367
   368
   369
   370
   371
   372
   373
   374
   375
   376
   377
   378
   379
   380
   381
   382
   383
   384
   385
   386
   387
   388
   389
   390
   391
   392
   393
   394
   395
   396
   397
   398
   399
   400
   401
   402
   403
   404
   405
   406
   407
   408
   409
   410
   411
   412
   413
   414
   415
   416
   417
   418
   419
   420
   421
   422
   423
   424
   425
   426
   427
   428
   429
   430
   431
   432
   433
   434
   435
   436
   437
   438
   439
   440
   441
   442
   443
   444
   445
   446
   447
   448
   449
   450
   451
   452
   453
   454
   455
   456
   457
   458
   459
   460
   461
   462
   463
   464
   465
   466
   467
   468
   469
   470
   471
   472
   473
   474
   475
   476
   477
   478
   479
   480
   481
   482
   483
   484
   485
   486
   487
   488
   489
   490
   491
   492
   493
   494
   495
   496
   497
   498
   499
   500
   501
   502
   503
   504
   505
   506
   507
   508
   509
   510
   511
   512
   513
   514
   515
   516
   517
   518
   519
   520
   521
   522
   523
   524
   525
   526
   527
   528
   529
   530
   531
   532
   533
   534
   535
   536
   537
   538
   539
   540
   541
   542
   543
   544
   545
   546
   547
   548
   549
   550
   551
   552
   553
   554
   555
   556
   557
   558
   559
   560
   561
   562
   563
   564
   565
   566
   567
   568
   569
   570
   571
   572
   573
   574
   575
   576
   577
   578
   579
   580
   581
   582
   583
   584
   585
   586
   587
   588
   589
   590
   591
   592
   593
   594
   595
   596
   597
   598
   599
   600
   601
   602
   603
   604
   605
   606
   607
   608
   609
   610
   611
   612
   613
   614
   615
   616
   617
   618
   619
   620
   621
   622
   623
   624
   625
   626
   627
   628
   629
   630
   631
   632
   633
   634
   635
   636
   637
   638
   639
   640
   641
   642
   643
   644
   645
   646
   647
   648
   649
   650
   651
   652
   653
   654
   655
   656
   657
   658
   659
   660
   661
   662
   663
   664
   665
   666
   667
   668
   669
   670
   671
   672
   673
   674
   675
   676
   677
   678
   679
   680
   681
   682
   683
   684
   685
   686
   687
   688
   689
   690
   691
   692
   693
   694
   695
   696
   697
   698
   699
   700
   701
   702
   703
   704
   705
   706
   707
   708
   709
   710
   711
   712
   713
   714
   715
   716
   717
   718
   719
   720
   721
   722
   723
   724
#### libremiliacr
#### Copyright(C) 2020-2024 Remilia Scarlet <remilia@posteo.jp>
####
#### This program is free software: you can redistribute it and/or modify it
#### under the terms of the GNU General Public License as published by the Free
#### Software Foundation, either version 3 of the License, or (at your option)
#### any later version.
####
#### This program is distributed in the hope that it will be useful, but WITHOUT
#### ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#### FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
#### more details.
####
#### You should have received a copy of the GNU General Public License along
#### with this program.  If not, see <http://www.gnu.org/licenses/>.
require "./common"

module RemiLib::RSConf
  # The `Parser` class is used to parse RSConf data into `RSValue`s.
  #
  # ```crystal
  # require "libremiliacr"
  #
  # rsconfStr = %|
  # foo = 69
  # bar = "hello, world"|
  #
  # parser = RemiLib::RSConf::Parser.new(rsconfStr)
  # pp parser.parse
  # ```
  class Parser
    @stream : IO
    @line : UInt64 = 1
    @col : UInt64 = 0

    # :nodoc:
    #
    # Raises an `RSConfParseError` constructed from the parser's current
    # `@line` and `@col`.  The error message is *msg* prefixed with
    # "Near (line, col): ".  Because this is a macro, it expands in place and
    # reads the instance variables of whatever method invokes it.
    macro raiseParseError(msg)
      raise RSConfParseError.new(@line, @col, "Near (#{@line}, #{@col}): #{ {{msg}} }")
    end

    # :nodoc:
    #
    # Raises a parse error reporting that *char* is an illegal character.  The
    # message contains the character's code point as two upper-case hex digits,
    # followed by its escape sequence and name.  A `'\f'` is reported as a
    # "Page break"; any other character passed in is reported as a "Return".
    macro raiseReturnOrPageFound(char)
      %code = sprintf("%02X", {{char}}.ord)
      %shortName = {{char}} == '\f' ? "\\f" : "\\r"
      %name = {{char}} == '\f' ? "Page break" : "Return"
      raiseParseError("Illegal character: 0x#{%code} (#{%shortName}, #{%name})")
    end

    # :nodoc:
    #
    # Raises a parse error reporting that *char* is a whitespace character that
    # is not permitted.  The message contains the character's code point as
    # upper-case hexadecimal, zero-padded to at least two digits.
    macro raiseIllegalWhitespace(char)
      %hex = "%02X" % {{char}}.ord
      raiseParseError("Illegal whitespace character: 0x#{%hex}")
    end

    # Creates a new `Parser` that will read from *stream*.
    #
    # Raises an `RSConfEncodingError` if the stream reports any encoding other
    # than exactly `"UTF-8"`.
    def initialize(@stream : IO)
      encoding = @stream.encoding
      return if encoding == "UTF-8"

      raise RSConfEncodingError.new("Unsupported stream encoding: #{encoding}")
    end

    # Creates a new `Parser` that will read from *string*.
    #
    # The string is wrapped in an `IO::Memory`, and parsing then proceeds
    # exactly as it does for any other stream.
    def initialize(string : String)
      @stream = IO::Memory.new(string)
    end

    # Parses the RSConf data in this parser's stream and returns the resulting
    # toplevel value.  All of the work is delegated to `#readDocument`.
    def parse : RSTopLevel
      readDocument
    end

    # Parses the RSConf data stored in the file at *filename*.  The file is
    # opened with mode `"rb"` and is closed again once parsing has finished.
    def self.parse(filename : Path) : RSTopLevel
      File.open(filename, "rb") { |file| Parser.parse(file) }
    end

    # Parses the RSConf data contained in *str*.  The string is wrapped in an
    # `IO::Memory` and handed to the `IO` overload of this method.
    def self.parse(str : String) : RSTopLevel
      Parser.parse(IO::Memory.new(str))
    end

    # Parses RSConf data from an `IO`.
    #
    # This is a convenience wrapper that creates a `Parser` for *io* and
    # immediately calls `#parse` on it.
    def self.parse(io : IO) : RSTopLevel
      Parser.new(io).parse
    end

    ############################################################################

    # Advances one line: increments the line counter and resets the column
    # counter.
    #
    # NOTE(review): `@col` is initialized to 0 but is reset to 1 here, so the
    # first character after a newline is reported at column 2, whereas the
    # very first character of the stream is reported at column 1.  Confirm
    # whether this offset is intentional before changing it, since reported
    # error positions depend on it.
    @[AlwaysInline]
    protected def advLine : Nil
      @line += 1
      @col = 1
    end

    # Advances one column by incrementing the column counter.  The line counter
    # is not touched.
    @[AlwaysInline]
    protected def adv : Nil
      @col += 1
    end

    # Returns `true` if *char* is a space (U+0020) or a horizontal tab
    # (U+0009), or `false` for anything else, including a newline.
    @[AlwaysInline]
    protected def whitespaceCharButNotNewline(char : Char) : Bool
      char == ' ' || char == '\t'
    end

    # Returns `true` if *char* is a space (U+0020), a horizontal tab (U+0009),
    # or a newline (U+000A), or `false` for anything else.
    @[AlwaysInline]
    protected def whitespaceChar(char : Char) : Bool
      case char
      when ' ', '\t', '\n'
        true
      else
        false
      end
    end

    # Returns `true` if the code point of *char* is listed in
    # `ILLEGAL_WHITESPACE`.  That constant is defined outside of this file
    # (presumably in "./common" - TODO confirm).
    @[AlwaysInline]
    protected def illegalWhitespaceChar(char : Char) : Bool
      ILLEGAL_WHITESPACE.includes?(char.ord)
    end

    # Reads the next character from the stream and updates the line/column
    # counters to match.
    #
    # Returns `nil` at the end of the stream, in which case the position
    # counters are left alone.  Raises a parse error, without updating the
    # counters, if the character is a #\Return or #\Page.  Use `#readChar!`
    # when those two characters must be accepted.
    @[AlwaysInline]
    protected def readChar : Char?
      ret = @stream.read_char
      return nil if ret.nil?

      if ret == '\r' || ret == '\f'
        raiseReturnOrPageFound(ret)
      end

      if ret == '\n'
        advLine
      else
        adv
      end

      ret
    end

    # Returns the next character in the stream, or `nil` at the end of the
    # stream, without updating the line/column counters.
    #
    # The read happens inside of `withExcursion`, which is not defined in this
    # file.  It presumably restores the stream position afterwards so that the
    # character is not consumed - TODO confirm against its definition.
    @[AlwaysInline]
    protected def peekChar : Char?
      ret : Char? = nil
      @stream.withExcursion do
        ret = @stream.read_char
      end
      ret
    end

    # Reads the next character from the stream and updates the line/column
    # counters, just like `#readChar`, except that #\Return and #\Page
    # characters are returned rather than rejected.  Returns `nil` at the end
    # of the stream.
    @[AlwaysInline]
    protected def readChar! : Char?
      ch = @stream.read_char
      return ch if ch.nil?

      ch == '\n' ? advLine : adv
      ch
    end

    # Consumes spaces, tabs, and newlines until something else, or the end of
    # the stream, is reached.  The first character that is not whitespace is
    # left unread.
    #
    # Raises a parse error if a #\Return or #\Page is encountered, or if a
    # character listed in `ILLEGAL_WHITESPACE` is encountered.
    @[AlwaysInline]
    protected def skipWhitespace : Nil
      loop do
        c = peekChar
        if c.nil?
          break
        elsif whitespaceChar(c)
          readChar
        elsif c == '\r' || c == '\f'
          # This has to be its own branch.  #whitespaceChar is never true for
          # these two characters, so a check nested under it can never run.
          raiseReturnOrPageFound(c)
        elsif illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        else
          break
        end
      end
    end

    # Consumes spaces and tabs (but not newlines) until something else, or the
    # end of the stream, is reached.  The first character that is not a space
    # or tab is left unread.
    #
    # Raises a parse error if a #\Return or #\Page is encountered, or if a
    # character listed in `ILLEGAL_WHITESPACE` is encountered.
    @[AlwaysInline]
    protected def skipSpaces : Nil
      while nextChar = peekChar
        if whitespaceCharButNotNewline(nextChar)
          readChar
        elsif nextChar == '\r' || nextChar == '\f'
          raiseReturnOrPageFound(nextChar)
        elsif illegalWhitespaceChar(nextChar)
          raiseIllegalWhitespace(nextChar)
        else
          break
        end
      end
    end

    # Consumes the rest of the current line, including its terminating
    # newline, or everything up to the end of the stream if there is no
    # newline.  Callers use this once they have peeked at a `;`.
    #
    # No separate check for #\Return or #\Page is needed here since `#readChar`
    # already raises a parse error for both of them.
    @[AlwaysInline]
    protected def readComment : Nil
      loop do
        c = readChar
        break if c.nil? || c == '\n'
      end
    end

    # Consumes any mix of whitespace (spaces, tabs, and newlines) and `;`
    # comments until something else, or the end of the stream, is reached.  The
    # first character that is neither is left unread.
    #
    # Newlines must be skipped here as well.  Otherwise a blank line that
    # follows a comment stops the loop early, and whatever comes after it is
    # then treated as junk by the caller.
    #
    # Raises a parse error if a #\Return or #\Page is encountered, or if a
    # character listed in `ILLEGAL_WHITESPACE` is encountered.
    @[AlwaysInline]
    protected def skipWhitespaceAndComments : Nil
      loop do
        c = peekChar
        if c.nil?
          break
        elsif whitespaceChar(c)
          readChar
        elsif c == '\r' || c == '\f'
          raiseReturnOrPageFound(c)
        elsif illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        elsif c == ';'
          readComment
        else
          break
        end
      end
    end

    # Reads a numeric literal and returns it wrapped in an `RSScalar`.
    #
    # The literal either starts with a radix prefix (`#x` for hexadecimal, `#o`
    # for octal, `#b` for binary, in either letter case), or with a decimal
    # digit or a sign.  Reading stops at whitespace (which is consumed), at the
    # end of the stream, or at a `,`, `]`, or `}` (which is left unread).
    #
    # A `.` or an exponent marker (`e` or `d`, in either letter case) turns the
    # literal into a float, which is only permitted in decimal.  Floats are
    # returned as `Float64`, everything else as `Int64`.
    #
    # Raises a parse error on any character that cannot be part of the number,
    # or if the collected text cannot be converted to a number.
    protected def readNumber : RSScalar
      radix : UInt8 = 10 # Assume decimal until determined otherwise
      float? : Bool = false
      expChar : Char = 'e'
      haveExpChar? : Bool = false
      first? : Bool = true
      ret : Int64|Float64 = 0

      # We'll build the number into a string, then convert it later.
      #
      # TODO this could probably be done more efficiently without the string?
      retStr = String.build do |str|
        c = peekChar || raise "Character was not expected to be Nil"
        RemiLib.assert(c != '\r')
        RemiLib.assert(c != '\f')

        # We now need to see if we're starting off with a special radix (e.g. #x
        # or #b), or a digit.
        if c == '#'
          readChar
          c = peekChar || raiseParseError("Unexpected end of stream")

          # We've handled Nil, so now figure out the radix.  The code below
          # calls #readChar which will handle #\Return and #\Page for us.
          case c.downcase
          when 'x'
            readChar
            radix = 16
          when 'o'
            readChar
            radix = 8
          when 'b'
            readChar
            radix = 2
          else
            raiseParseError("Invalid radix character: '#{c}'")
          end
          first? = false

        elsif c.to_i?(radix) || c == '+' || c == '-'
          readChar
          str << c
          first? = false

        else
          # Anything else is an error
          raiseParseError("Expected a digit or the start of a radix")
        end

        # Now read characters.  The #readChar will automatically check for
        # #\Return and #\Page as we go.
        loop do
          c = peekChar

          if c.nil? || whitespaceChar(c)
            readChar
            break

          elsif illegalWhitespaceChar(c)
            raiseIllegalWhitespace(c)

          elsif c == ',' || c == ']' || c == '}'
            break

          elsif c.to_i?(radix)
            # This comes before the exponent check on purpose: in hexadecimal,
            # 'e' and 'd' are ordinary digits.
            readChar
            str << c

          elsif c == '.'
            readChar

            # Switch to float mode, or error if we're already reading floats or
            # are using the wrong radix.
            if float?
              raiseParseError("Unexpected extra period in float")
            elsif radix != 10
              raiseParseError("Floats must be in decimal")
            else
              float? = true
            end
            str << c

          elsif c.downcase == 'e' || c.downcase == 'd'
            # An exponent makes this a float, and floats must be in decimal.
            # Without this check, something like "#o7e2" would be silently
            # read as the decimal float 7e2.
            if radix != 10
              raiseParseError("Floats must be in decimal")
            end

            expChar = c.downcase

            if float?
              if haveExpChar?
                raiseParseError("Unexpected character while reading float: '#{c}'")
              else
                haveExpChar? = true
              end
            else
              if haveExpChar?
                raiseParseError("Unexpected character while reading integer: '#{c}'")
              else
                haveExpChar? = true
                float? = true
              end
            end

            # We downcase it here in case it's an #\e.  This gets around having
            # to do a String#downcase later on when we parse for a float.
            readChar
            str << c.downcase

          elsif c == '-' || c == '+'
            if float?
              if !haveExpChar?
                raiseParseError("Unexpected sign character in float number: '#{c}'")
              end
            elsif !first?
              raiseParseError("Unexpected sign character in integer: '#{c}'")
            end

            readChar
            str << c

          else
            # Everything else is an error
            raiseParseError("Bad numeric character: '#{c}'")
          end # if c.nil? || whitespaceChar(c)

          first? = false
        end # loop do
      end

      # Now try to parse the number.  If we hit an ArgumentError, raise our own
      # parsing error instead, though this should never happen.
      begin
        ret = if float?
                RemiLib.assert(expChar == 'e' || expChar == 'd')
                # Replace 'd' with 'e' because we aren't in Common Lisp.
                retStr.gsub('d', 'e').to_f64(whitespace: false)
              else
                retStr.to_i64(radix)
              end
      rescue ArgumentError
        raiseParseError("Could not parse numeric value")
      end

      RSScalar.new(ret)
    end

    # Reads the hexadecimal digits of a `\u{...}` escape, up to and including
    # the closing `}`, and returns the character with that code point.  The
    # caller must have already consumed the `\u{` part.
    #
    # *startLine* and *startCol* are the position where the enclosing string
    # began, and are only used in error messages.
    #
    # Raises a parse error if the stream ends before the `}`, if a character
    # that is not a hexadecimal digit is found, or if the digits cannot be
    # turned into a character.
    protected def readEscapedUTF8Char(startLine : UInt64, startCol : UInt64) : Char
      hexDigits = String.build do |codeStr|
        loop do
          digit = readChar
          if digit.nil?
            raiseParseError("Unterminated UTF-8 character in the string starting at " \
                            "line #{startLine} column #{startCol}")
          elsif digit == '}'
            break
          elsif digit.to_i?(16) # Always in hex according to the spec
            codeStr << digit
          else
            raiseParseError("Invalid UTF-8 code in the string starting at line #{startLine} column #{startCol}")
          end
        end
      end

      begin
        codePoint = hexDigits.to_i32(16)
        codePoint.chr
      rescue err : ArgumentError
        raiseParseError("Could not parse escaped UTF-8 character in string starting at " \
                        "line #{startLine} column #{startCol}: #{err}")
      end
    end

    # Reads a double-quoted string and returns its contents wrapped in an
    # `RSScalar`.  The next character in the stream must be the opening `"`.
    #
    # Escape handling:
    #
    # * `\"` produces a double quote.
    # * `\\` produces a single backslash.
    # * `\u{HEX}` produces the character with that hexadecimal code point.
    # * `\u` that is not followed by `{` is kept literally as `\u`.
    # * For any other character after a backslash, the character is kept and
    #   the backslash is dropped.
    #
    # Characters are read with `#readChar!`, so #\Return and #\Page are allowed
    # inside of strings.
    #
    # Raises a parse error if the stream ends before the closing `"`.
    protected def readString : RSScalar
      startLine : UInt64 = @line
      startCol : UInt64 = @col
      backslash? : Bool = false
      RSScalar.new(
        String.build do |str|
          # This must be true since the caller should only have peeked at the next
          # character.
          raise "Unexpectedly tried to read a string" unless readChar == '"'

          loop do
            c = readChar!
            case
            when c.nil?
              raiseParseError("Unterminated string starting at line #{startLine} column #{startCol} "\
                              "(end of stream reached)")
            when c == '"'
              if backslash?
                str << c
                backslash? = false
              else
                break
              end
            when c == '\\'
              if backslash?
                # Write one backslash
                str << c
                backslash? = false
              else
                backslash? = true
              end
            when c == 'u'
              if backslash?
                # Only peek here.  If the next character isn't a #\{ then it
                # belongs to the rest of the string (it may even be the closing
                # quote), so it must not be consumed and thrown away.
                if peekChar == '{'
                  readChar # Consume the #\{
                  str << readEscapedUTF8Char(startLine, startCol)
                else
                  # Write both the backslash and the #\u
                  str << '\\'
                  str << c
                end
                backslash? = false
              else
                str << c
              end
            else
              str << c
              backslash? = false
            end
          end
        end)
    end

    # Reads a key name, including the `:` that terminates it, and returns the
    # name.  Leading whitespace is skipped first.
    #
    # A key is either a quoted string followed by optional spaces and a colon,
    # or an unquoted run of characters up to the colon.  Unquoted names are
    # stripped of surrounding whitespace, and may not contain `"`, `{`, `}`,
    # `[`, `]`, or a newline.  #\Return and #\Page characters in unquoted
    # names are rejected by `#readChar`.
    #
    # Raises a parse error if the name is malformed, if the stream ends
    # prematurely, or if the resulting name is blank.
    protected def readKeyName : String
      skipWhitespace

      name = if peekChar == '"'
               # A quoted key name.
               quoted = readString.val.as(String)
               skipSpaces
               raiseParseError("Expected a colon after the key name") unless readChar == ':'
               quoted
             else
               # Not a quoted name, so collect characters up to the colon.
               raw = String.build do |str|
                 loop do
                   c = readChar
                   if c.nil?
                     raiseParseError("Unexpected end of stream")
                   elsif c == ':'
                     break
                   elsif {'"', '{', '}', '[', ']'}.includes?(c)
                     raiseParseError("Invalid character in unquoted key name: '#{c}'")
                   else
                     str << c
                   end
                 end
               end

               if raw.includes?('\n')
                 raiseParseError("Unquoted key names cannot contain newlines")
               end
               raw.strip
             end

      # An empty string also counts as blank.
      raiseParseError("Empty key name") if name.blank?

      name
    end

    # Reads the word `true` or `false`, in any letter case, and returns the
    # matching boolean wrapped in an `RSScalar`.
    #
    # The first character decides which word is expected, and then exactly as
    # many further characters as that word needs are consumed.  The first
    # character is compared case-insensitively, just like the remaining ones,
    # so that `True` and `FALSE` are accepted the same way `tRUE` is.
    #
    # Raises a parse error if the stream ends early, or if the characters do
    # not spell out a boolean.
    protected def readBoolean : RSScalar
      firstChar = readChar || raise "Can't read bool, but expected to"
      case firstChar.downcase
      when 't'
        r = readChar || raiseParseError("Unexpected end of stream")
        u = readChar || raiseParseError("Unexpected end of stream")
        e = readChar || raiseParseError("Unexpected end of stream")
        if r.downcase == 'r' && u.downcase == 'u' && e.downcase == 'e'
          RSScalar.new(true)
        else
          raiseParseError("Bad boolean value")
        end
      when 'f'
        a = readChar || raiseParseError("Unexpected end of stream")
        l = readChar || raiseParseError("Unexpected end of stream")
        s = readChar || raiseParseError("Unexpected end of stream")
        e = readChar || raiseParseError("Unexpected end of stream")
        if a.downcase == 'a' && l.downcase == 'l' && s.downcase == 's' && e.downcase == 'e'
          RSScalar.new(false)
        else
          raiseParseError("Bad boolean value")
        end
      else
        raiseParseError("Unexpected character where a boolean was expected: '#{firstChar}'")
      end
    end

    # Reads the word `nil`, in any letter case, and returns an `RSScalar` that
    # wraps `nil`.  Exactly three characters are consumed.
    #
    # Raises a parse error if the stream ends after the first character, or if
    # the three characters do not spell `nil`.
    protected def readNull : RSScalar
      first = readChar || raise "Can't read nil, but expected to"
      second = readChar || raiseParseError("Unexpected end of stream")
      third = readChar || raiseParseError("Unexpected end of stream")

      if first.downcase != 'n' || second.downcase != 'i' || third.downcase != 'l'
        raiseParseError("Bad null value")
      end

      RSScalar.new(nil)
    end

    # Skips any spaces and tabs, consumes a single `,` if one comes next, and
    # then skips any whitespace (newlines included) that follows.  It is not an
    # error for the comma to be missing.
    @[AlwaysInline]
    protected def maybeReadComma : Nil
      skipSpaces
      if peekChar == ','
        readChar
      end
      skipWhitespace
    end

    # Reads a single value and returns it.
    #
    # Leading whitespace and `;` comments are skipped.  The first character of
    # the value then decides which reader is used: a digit, `#`, `+`, or `-`
    # starts a number, `{` an object, `[` an array, `"` a string, `t`/`f` (in
    # either letter case) a boolean, and `n` (in either letter case) a null.
    # After the value, an optional comma and any trailing whitespace are
    # consumed.
    #
    # Raises a parse error if the stream ends before a value is found, or if
    # the first character cannot start a value.
    protected def readValue : RSValue?
      result : RSValue? = nil

      skipWhitespace
      loop do
        c = peekChar
        raiseParseError("Expected a value") if c.nil?

        if c == ';'
          # Skip the comment, then look again for the start of a value.
          readComment
          skipWhitespace
          next
        end

        lowered = c.downcase
        result = if c == '"'
                   readString
                 elsif c == '{'
                   readObject
                 elsif c == '['
                   readArray
                 elsif c == '#' || c == '+' || c == '-' || c.to_i?(10)
                   readNumber
                 elsif lowered == 't' || lowered == 'f'
                   readBoolean
                 elsif lowered == 'n'
                   readNull
                 else
                   raiseParseError("Unexpected character where a value was expected: '#{c}'")
                 end
        break
      end

      maybeReadComma
      skipWhitespace
      result
    end

    # Reads an object delimited by `{` and `}` and returns it.  The next
    # character in the stream must be the opening `{`.
    #
    # Each entry is a key (see `#readKeyName`) followed by a value (see
    # `#readValue`).  Whitespace and `;` comments between entries are skipped.
    # Reading ends at the closing `}`, after which an optional comma is
    # consumed.  Reaching the end of the stream also ends the object without
    # raising an error.
    #
    # Raises a parse error on illegal whitespace, or if something that cannot
    # start a key is found where a key is expected.
    private def readObject : RSObject
      ret : RSObject = RSObject.new
      raise "Expected start of hash" unless readChar == '{'
      skipWhitespace

      loop do
        c = peekChar
        if c.nil? || c == '}'
          readChar
          maybeReadComma
          break
        elsif whitespaceChar(c)
          readChar
        elsif illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        elsif c == ';'
          readComment
          skipWhitespace
        elsif !{'{', '}', '[', ']'}.includes?(c)
          # Anything that isn't a bracket or brace starts a key.  This includes
          # a double quote, which starts a quoted key.
          key : String = readKeyName
          val : RSValue|Nil = readValue
          raiseParseError("Unexpected end of stream") if val.nil?
          ret[key] = val.as(RSValue)
        else
          raiseParseError("Expected a key or the end of an object")
        end
      end

      ret
    end

    # Reads an array delimited by `[` and `]` and returns it.  The next
    # character in the stream must be the opening `[`.
    #
    # Each element is read with `#readValue`.  Whitespace and `;` comments
    # between elements are skipped.  Reading ends at the closing `]`, after
    # which an optional comma is consumed.  Reaching the end of the stream also
    # ends the array without raising an error.
    #
    # The letters that start a boolean or a null are checked
    # case-insensitively here so that arrays accept the same values that
    # `#readValue` does.
    #
    # Raises a parse error on illegal whitespace, or if something that cannot
    # start a value is found where a value is expected.
    private def readArray : RSArray
      ret : RSArray = RSArray.new
      raise "Expected start of array" unless readChar == '['
      skipWhitespace

      loop do
        c = peekChar
        case
        when c.nil? || c == ']'
          readChar
          maybeReadComma
          break
        when whitespaceChar(c)
          readChar
        when illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        when c == ';'
          readComment
          skipWhitespace
        when c == '"' || c == '{' || c == '}' || c == '[' || c == ']' ||
             c.to_i?(10) || c == '+' || c == '-' || c == '#' ||
             c.downcase == 't' || c.downcase == 'f' || c.downcase == 'n'
          val : RSValue|Nil = readValue
          if val.nil?
            raiseParseError("Unexpected end of stream")
          else
            ret << val.as(RSValue)
          end
          skipWhitespace
        else
          raiseParseError("Expected a value or the end of an array")
        end
      end

      ret
    end

    # Peeks at the first character of the stream and raises an
    # `RSConfBOMError` if it looks like part of a byte-order-mark.  Nothing is
    # consumed.  An empty stream passes the check.
    #
    # The set of code points was determined by writing raw bytes into a byte
    # stream, converting that to a string, then reading from the string.  Since
    # some of these characters wouldn't be valid outside of a string anyway,
    # this doubles as a check for a bad first character.  It must happen before
    # any initial whitespace is skipped.
    #
    # https://en.wikipedia.org/wiki/Byte_order_mark
    protected def testForBOM : Nil
      first = peekChar
      return if first.nil?

      case first.ord
      when 0xFEFF, # Zero width no-break space
           0xFFFD, # Replacement character
           0x2B,   # +
           0x0E    # So
        raise RSConfBOMError.new("Possible byte-order-mark detected, or junk in toplevel")
      end
    end

    # Reads an entire document from the stream and returns its toplevel value.
    #
    # The toplevel is either an explicit object (`{`), an explicit array
    # (`[`), or a series of key/value pairs without surrounding braces, which
    # are collected into an implicit object.  A stream that contains no
    # toplevel value at all produces an empty `RSObject`.
    #
    # Raises `RSConfBOMError` if `#testForBOM` rejects the first character, and
    # a parse error for anything unexpected at the toplevel, including content
    # after the end of an explicit object or array.
    private def readDocument : RSTopLevel
      ret : RSObject|RSArray|Nil = nil
      # Set once the first key of an implicit (brace-less) toplevel object has
      # been seen.
      startedObject : Bool = false

      # The BOM check has to look at the very first character, so it comes
      # before any whitespace is skipped.
      testForBOM
      skipWhitespace
      loop do
        c = peekChar
        #puts "At #{@stream.pos} (#{@line}, #{@col}): #{c}"
        case
        when c.nil?
          break
        when whitespaceChar(c)
          skipWhitespace
        when illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        when c == ';'
          readComment
        when c == '{'
          # An explicit object is the whole document, so it can't follow keys
          # of an implicit object, and nothing is read after it in this loop.
          raiseParseError("Unexpected start of document object") if startedObject
          ret = readObject
          break
        when c == '['
          # Same as above, but for an explicit toplevel array.
          raiseParseError("Unexpected start of document array") if startedObject
          ret = readArray
          break
        when c == '"' || c.alphanumeric?
          # A key of an implicit toplevel object.  The object itself is created
          # when the first key is encountered.
          unless startedObject
            startedObject = true
            RemiLib.assert(ret.nil?)
            ret = RSObject.new
          end

          RemiLib.assert(ret.is_a?(RSObject))
          key : String = readKeyName
          val : RSValue|Nil = readValue
          if val.nil?
            raiseParseError("Unexpected end of stream")
          else
            ret[key] = val.as(RSValue)
          end
        else
          raiseParseError("Unexpected character at toplevel: '#{c}'")
        end
      end

      # Whatever is left must be skippable, otherwise it's junk.
      skipWhitespaceAndComments
      raiseParseError("Unexpected junk at toplevel") unless readChar.nil?
      if ret.nil?
        RSObject.new
      else
        ret.as(RSObject|RSArray)
      end
    end
  end
end