Login
Artifact [95810859f8]
Login

Artifact 95810859f82fb8337504f5ad630aedce2241fd1070636b3fcaa61cc2f8b5b8e3:


#### libremiliacr
#### Copyright(C) 2020-2024 Remilia Scarlet <remilia@posteo.jp>
####
#### This program is free software: you can redistribute it and/or modify it
#### under the terms of the GNU General Public License as published by the Free
#### Software Foundation, either version 3 of the License, or (at your option)
#### any later version.
####
#### This program is distributed in the hope that it will be useful, but WITHOUT
#### ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#### FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
#### more details.
####
#### You should have received a copy of the GNU General Public License along
#### with this program.  If not, see <http://www.gnu.org/licenses/>.
require "./common"

module RemiLib::RSConf
  # The `Parser` class is used to parse RSConf data into `RSValue`s.
  #
  # ```crystal
  # require "libremiliacr"
  #
  # rsconfStr = %|
  # foo = 69
  # bar = "hello, world"|
  #
  # parser = RemiLib::RSConf::Parser.new(rsconfStr)
  # pp parser.parse
  # ```
  class Parser
    @stream : IO
    @line : UInt64 = 1
    @col : UInt64 = 0

    # :nodoc:
    #
    # Raises an `RSConfParseError` constructed from the parser's current
    # `@line` and `@col`.  The error message is *msg* prefixed with
    # "Near (line, col): ".
    macro raiseParseError(msg)
      raise RSConfParseError.new(@line, @col, "Near (#{@line}, #{@col}): #{ {{msg}} }")
    end

    # :nodoc:
    #
    # Raises a parse error reporting that *char* is an illegal character.  The
    # message contains the character's code in hexadecimal, its escape
    # sequence, and its name.  A *char* equal to `'\f'` is reported as a page
    # break; any other value is reported as a Return.
    macro raiseReturnOrPageFound(char)
      %code = sprintf("%02X", {{char}}.ord)
      %shortName = {{char}} == '\f' ? "\\f" : "\\r"
      %name = {{char}} == '\f' ? "Page break" : "Return"
      raiseParseError("Illegal character: 0x#{%code} (#{%shortName}, #{%name})")
    end

    # :nodoc:
    #
    # Raises a parse error reporting that *char* is an illegal whitespace
    # character.  The message contains the character's code in hexadecimal.
    macro raiseIllegalWhitespace(char)
      %code = sprintf("%02X", {{char}}.ord)
      raiseParseError("Illegal whitespace character: 0x#{%code}")
    end

    # Builds a `Parser` that consumes RSConf data from *stream*.
    #
    # Raises an `RSConfEncodingError` when the stream's encoding is anything
    # other than "UTF-8".
    def initialize(@stream : IO)
      encoding = @stream.encoding
      if encoding != "UTF-8"
        raise RSConfEncodingError.new("Unsupported stream encoding: #{encoding}")
      end
    end

    # Builds a `Parser` that consumes RSConf data from *string*.  The string is
    # wrapped in an `IO::Memory`, which becomes the parser's stream.
    def initialize(string : String)
      memory = IO::Memory.new(string)
      @stream = memory
    end

    # Parses the RSConf data in this parser's stream and returns the resulting
    # toplevel value.
    def parse : RSTopLevel
      document = readDocument
      document
    end

    # Opens the file at *filename* (mode "rb") and parses its contents as
    # RSConf data, returning the resulting toplevel value.
    def self.parse(filename : Path) : RSTopLevel
      File.open(filename, "rb") { |file| Parser.parse(file) }
    end

    # Parses the RSConf data held in *str*, returning the resulting toplevel
    # value.  The string is wrapped in an `IO::Memory` and handed to the `IO`
    # overload.
    def self.parse(str : String) : RSTopLevel
      Parser.parse(IO::Memory.new(str))
    end

    # Parses the RSConf data readable from *io*, returning the resulting
    # toplevel value.
    def self.parse(io : IO) : RSTopLevel
      parser = Parser.new(io)
      parser.parse
    end

    ############################################################################

    # Moves the position tracker to the next line: `@line` is incremented and
    # `@col` is set to 1.
    #
    # NOTE(review): `@col` starts at 0 for the first line but restarts at 1
    # here, so column numbers on later lines are offset by one relative to the
    # first line -- confirm which is intended.
    @[AlwaysInline]
    protected def advLine : Nil
      @col = 1
      @line += 1
    end

    # Moves the position tracker forward by one column on the current line.
    @[AlwaysInline]
    protected def adv : Nil
      @col = @col + 1
    end

    # Returns `true` when *char* is a space or a tab.
    @[AlwaysInline]
    protected def whitespaceCharButNotNewline(char : Char) : Bool
      char == ' ' || char == '\t'
    end

    # Returns `true` when *char* is a space, a tab, or a newline.
    @[AlwaysInline]
    protected def whitespaceChar(char : Char) : Bool
      char == ' ' || char == '\t' || char == '\n'
    end

    # Returns `true` when the codepoint of *char* is listed in
    # `ILLEGAL_WHITESPACE`.
    @[AlwaysInline]
    protected def illegalWhitespaceChar(char : Char) : Bool
      codepoint = char.ord
      ILLEGAL_WHITESPACE.includes?(codepoint)
    end

    # Reads the next character from the stream and updates the line/column
    # tracking: a newline advances the line, anything else advances the
    # column.
    #
    # Returns the character read, or `nil` at the end of the stream.  Raises a
    # parse error if the character is a Return (`'\r'`) or a Page break
    # (`'\f'`); the position is not advanced in that case.
    @[AlwaysInline]
    protected def readChar : Char?
      ret = @stream.read_char
      if ret
        if ret == '\r' || ret == '\f'
          raiseReturnOrPageFound(ret)
        elsif ret == '\n'
          advLine
        else
          adv
        end
      end

      ret
    end

    # Returns the next character in the stream, or `nil` at the end of the
    # stream, without touching the line/column tracking.
    #
    # The read happens inside `@stream.withExcursion`, which is defined outside
    # this class; presumably it restores the stream position once the block
    # finishes -- verify against its definition.
    @[AlwaysInline]
    protected def peekChar : Char?
      peeked : Char? = nil
      @stream.withExcursion { peeked = @stream.read_char }
      peeked
    end

    # Reads the next character from the stream and updates the line/column
    # tracking, exactly like `#readChar` except that Return and Page break
    # characters are returned instead of raising.
    #
    # Returns the character read, or `nil` at the end of the stream.
    @[AlwaysInline]
    protected def readChar! : Char?
      char = @stream.read_char
      unless char.nil?
        char == '\n' ? advLine : adv
      end
      char
    end

    # Consumes spaces, tabs and newlines until a non-whitespace character or
    # the end of the stream is reached.
    #
    # Raises a parse error if a Return (`'\r'`) or Page break (`'\f'`) is
    # encountered, or if a character listed in `ILLEGAL_WHITESPACE` is
    # encountered.
    @[AlwaysInline]
    protected def skipWhitespace : Nil
      loop do
        c = peekChar
        if c.nil?
          break
        elsif whitespaceChar(c)
          readChar
        elsif c == '\r' || c == '\f'
          # This has to be its own branch: #whitespaceChar never matches these
          # two characters, so a check nested under it could never run.
          raiseReturnOrPageFound(c)
        elsif illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        else
          break
        end
      end
    end

    # Consumes spaces and tabs (but not newlines) until some other character or
    # the end of the stream is reached.
    #
    # Raises a parse error on a Return, a Page break, or a character listed in
    # `ILLEGAL_WHITESPACE`.
    @[AlwaysInline]
    protected def skipSpaces : Nil
      while nextChar = peekChar
        if whitespaceCharButNotNewline(nextChar)
          readChar
        elsif nextChar == '\r' || nextChar == '\f'
          raiseReturnOrPageFound(nextChar)
        elsif illegalWhitespaceChar(nextChar)
          raiseIllegalWhitespace(nextChar)
        else
          break
        end
      end
    end

    # Consumes characters up to and including the next newline, or up to the
    # end of the stream, whichever comes first.
    #
    # Raises a parse error if a Return or Page break is encountered.
    @[AlwaysInline]
    protected def readComment : Nil
      while char = readChar
        if char == '\n'
          break
        elsif char == '\r' || char == '\f'
          raiseReturnOrPageFound(char)
        end
      end
    end

    # Consumes whitespace (spaces, tabs and newlines) and `;` comments until
    # some other character or the end of the stream is reached.
    #
    # Raises a parse error on a Return, a Page break, or a character listed in
    # `ILLEGAL_WHITESPACE`.
    @[AlwaysInline]
    protected def skipWhitespaceAndComments : Nil
      loop do
        c = peekChar
        if c.nil?
          break
        elsif whitespaceChar(c)
          # Newlines must be skipped here as well.  Otherwise a blank line that
          # follows a comment is left in the stream for the caller.
          readChar
        elsif c == '\r' || c == '\f'
          raiseReturnOrPageFound(c)
        elsif illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        elsif c == ';'
          readComment
        else
          break
        end
      end
    end

    # Reads a numeric literal from the stream and returns it as an `RSScalar`
    # wrapping either an `Int64` or a `Float64`.
    #
    # A literal may start with a digit, a sign, or a radix prefix (`#x`, `#o`
    # or `#b`, case-insensitive).  Reading stops at whitespace (which is
    # consumed), at the end of the stream, or at a `,`, `]` or `}` (which is
    # left in the stream).  A `.` or an exponent marker (`e` or `d`,
    # case-insensitive) switches the literal to a float; floats are only
    # accepted in decimal.
    #
    # Raises a parse error on any malformed literal.
    protected def readNumber : RSScalar
      radix : UInt8 = 10 # Assume decimal until determined otherwise
      float? : Bool = false
      expChar : Char = 'e'
      haveExpChar? : Bool = false
      first? : Bool = true
      ret : Int64|Float64 = 0

      # We'll build the number into a string, then convert it later.
      #
      # TODO this could probably be done more efficiently without the string?
      retStr = String.build do |str|
        c = peekChar || raise "Character was not expected to be Nil"
        RemiLib.assert(c != '\r')
        RemiLib.assert(c != '\f')

        # We now need to see if we're starting off with a special radix (e.g. #x
        # or #b), or a digit.
        if c == '#'
          readChar
          c = peekChar || raiseParseError("Unexpected end of stream")

          # We've handled Nil, so now figure out the radix.  The code below
          # calls #readChar which will handle #\Return and #\Page for us.
          case c.downcase
          when 'x'
            readChar
            radix = 16
          when 'o'
            readChar
            radix = 8
          when 'b'
            readChar
            radix = 2
          else
            raiseParseError("Invalid radix character: '#{c}'")
          end
          first? = false

        elsif c.to_i?(radix) || c == '+' || c == '-'
          readChar
          str << c
          first? = false

        else
          # Anything else is an error
          raiseParseError("Expected a digit or the start of a radix")
        end

        # Now read characters.  The #readChar will automatically check for
        # #\Return and #\Page as we go.
        loop do
          c = peekChar

          if c.nil? || whitespaceChar(c)
            readChar
            break

          elsif illegalWhitespaceChar(c)
            raiseIllegalWhitespace(c)

          elsif c == ',' || c == ']' || c == '}'
            break

          elsif c.to_i?(radix)
            readChar
            str << c

          elsif c == '.'
            readChar

            # Switch to float mode, or error if we're already reading floats or
            # are using the wrong radix.
            if float?
              raiseParseError("Unexpected extra period in float")
            elsif radix != 10
              raiseParseError("Floats must be in decimal")
            else
              float? = true
            end
            str << c

          elsif c.downcase == 'e' || c.downcase == 'd'
            # In hexadecimal, 'e' and 'd' are digits and were consumed by the
            # digit branch above.  Getting here with any other non-decimal
            # radix means an exponent marker on a binary or octal literal,
            # which would otherwise silently be parsed as a decimal float.
            if radix != 10
              raiseParseError("Floats must be in decimal")
            end

            expChar = c.downcase

            if float?
              if haveExpChar?
                raiseParseError("Unexpected character while reading float: '#{c}'")
              else
                haveExpChar? = true
              end
            else
              if haveExpChar?
                raiseParseError("Unexpected character while reading integer: '#{c}'")
              else
                haveExpChar? = true
                float? = true
              end
            end

            # We downcase it here in case it's an #\e.  This gets around having
            # to do a String#downcase later on when we parse for a float.
            readChar
            str << c.downcase

          elsif c == '-' || c == '+'
            if float?
              if !haveExpChar?
                raiseParseError("Unexpected sign character in float number: '#{c}'")
              end
            elsif !first?
              raiseParseError("Unexpected sign character in integer: '#{c}'")
            end

            readChar
            str << c

          else
            # Everything else is an error
            raiseParseError("Bad numeric character: '#{c}'")
          end # if c.nil? || whitespaceChar(c)

          first? = false
        end # loop do
      end

      # Now try to parse the number.  If we hit an ArgumentError, raise our own
      # parsing error instead, though this should never happen.
      begin
        ret = if float?
                RemiLib.assert(expChar == 'e' || expChar == 'd')
                # Replace 'd' with 'e' because we aren't in Common Lisp.
                retStr.gsub('d', 'e').to_f64(whitespace: false)
              else
                retStr.to_i64(radix)
              end
      rescue ArgumentError
        raiseParseError("Could not parse numeric value")
      end

      RSScalar.new(ret)
    end

    # Reads the hexadecimal digits of a `\u{...}` escape up to and including
    # the closing `}`, then returns the character with that codepoint.  The
    # opening `\u{` must already have been consumed by the caller.
    #
    # *startLine* and *startCol* are the position where the enclosing string
    # started; they are only used in error messages.
    #
    # Raises a parse error if the stream ends before the `}`, if a non-hex
    # character is found, or if the digits cannot be converted into a
    # character.
    protected def readEscapedUTF8Char(startLine : UInt64, startCol : UInt64) : Char
      hexDigits = String.build do |codeStr|
        loop do
          num = readChar
          if num.nil?
            raiseParseError("Unterminated UTF-8 character in the string starting at " \
                            "line #{startLine} column #{startCol}")
          elsif num == '}'
            break
          elsif num.to_i?(16) # Always in hex according to the spec
            codeStr << num
          else
            raiseParseError("Invalid UTF-8 code in the string starting at line #{startLine} column #{startCol}")
          end
        end
      end

      begin
        codepoint = hexDigits.to_i32(16)
        codepoint.chr
      rescue err : ArgumentError
        raiseParseError("Could not parse escaped UTF-8 character in string starting at " \
                        "line #{startLine} column #{startCol}: #{err}")
      end
    end

    # Reads a double-quoted string from the stream and returns it as an
    # `RSScalar`.  The next character in the stream must be the opening `"`.
    #
    # Escapes handled inside the string:
    #
    # * `\"` produces a double quote.
    # * `\\` produces a single backslash.
    # * `\u{HEX}` produces the character with that hexadecimal codepoint.
    # * `\u` that is not followed by `{` produces a literal backslash and `u`.
    # * A backslash before any other character is dropped and the character
    #   itself is kept.
    #
    # Characters are read with `#readChar!`, so Return and Page break
    # characters are allowed inside the string body.
    #
    # Raises a parse error if the stream ends before the closing `"`.
    protected def readString : RSScalar
      startLine : UInt64 = @line
      startCol : UInt64 = @col
      backslash? : Bool = false
      RSScalar.new(
        String.build do |str|
          # This must be true since the caller should only have peeked at the next
          # character.
          raise "Unexpectedly tried to read a string" unless readChar == '"'

          loop do
            c = readChar!
            case
            when c.nil?
              raiseParseError("Unterminated string starting at line #{startLine} column #{startCol} "\
                              "(end of stream reached)")
            when c == '"'
              if backslash?
                str << c
                backslash? = false
              else
                break
              end
            when c == '\\'
              if backslash?
                # Write one backslash
                str << c
                backslash? = false
              else
                backslash? = true
              end
            when c == 'u'
              if backslash?
                # Only peek at what follows the #\u.  If it isn't a #\{ then it
                # is ordinary string content (possibly the closing quote) and
                # must be left for the next iteration of the loop.
                if peekChar == '{'
                  readChar!
                  str << readEscapedUTF8Char(startLine, startCol)
                else
                  # Write both the backslash and the #\u
                  str << '\\'
                  str << c
                end
                backslash? = false
              else
                str << c
              end
            else
              str << c
              backslash? = false
            end
          end
        end)
    end

    # Reads a key name, including the `:` that terminates it, and returns the
    # name.  Leading whitespace is skipped first.
    #
    # A key name is either a quoted string followed by optional spaces/tabs and
    # a colon, or a run of unquoted characters up to a colon.  Unquoted names
    # have surrounding whitespace stripped.  #\Return and #\Page characters are
    # rejected by the underlying read/skip helpers.
    #
    # Raises a parse error if the colon is missing, if an unquoted name
    # contains `"`, `{`, `}`, `[`, `]` or a newline, if the stream ends early,
    # or if the resulting name is blank.
    protected def readKeyName : String
      skipWhitespace

      name = if peekChar == '"'
               # Quoted key name: reuse the string reader, then demand a colon.
               quoted = readString.val.as(String)
               skipSpaces
               raiseParseError("Expected a colon after the key name") unless readChar == ':'
               quoted
             else
               # Unquoted key name: collect everything up to the colon.
               raw = String.build do |str|
                 loop do
                   c = readChar
                   if c.nil?
                     raiseParseError("Unexpected end of stream")
                   elsif c == ':'
                     break
                   elsif c == '"' || c == '{' || c == '}' || c == '[' || c == ']'
                     raiseParseError("Invalid character in unquoted key name: '#{c}'")
                   else
                     str << c
                   end
                 end
               end

               raiseParseError("Unquoted key names cannot contain newlines") if raw.includes?('\n')
               raw.strip
             end

      raiseParseError("Empty key name") if name.blank?
      name
    end

    # Reads a boolean literal (`true` or `false`) from the stream and returns
    # it as an `RSScalar`.  Every letter, including the first, is matched
    # case-insensitively.
    #
    # Raises a parse error if the stream ends inside the literal, if the first
    # character is neither `t` nor `f`, or if the remaining letters do not
    # spell out the literal.
    protected def readBoolean : RSScalar
      firstChar = readChar || raise "Can't read bool, but expected to"

      # The first letter is downcased so that it is treated the same way as the
      # remaining letters and as the dispatch in #readValue.
      case firstChar.downcase
      when 't'
        r = readChar || raiseParseError("Unexpected end of stream")
        u = readChar || raiseParseError("Unexpected end of stream")
        e = readChar || raiseParseError("Unexpected end of stream")
        if r.downcase == 'r' && u.downcase == 'u' && e.downcase == 'e'
          RSScalar.new(true)
        else
          raiseParseError("Bad boolean value")
        end
      when 'f'
        a = readChar || raiseParseError("Unexpected end of stream")
        l = readChar || raiseParseError("Unexpected end of stream")
        s = readChar || raiseParseError("Unexpected end of stream")
        e = readChar || raiseParseError("Unexpected end of stream")
        if a.downcase == 'a' && l.downcase == 'l' && s.downcase == 's' && e.downcase == 'e'
          RSScalar.new(false)
        else
          raiseParseError("Bad boolean value")
        end
      else
        raiseParseError("Unexpected character where a boolean was expected: '#{firstChar}'")
      end
    end

    # Reads a null literal (`nil`, matched case-insensitively) from the stream
    # and returns an `RSScalar` wrapping `nil`.
    #
    # Raises a parse error if the stream ends inside the literal or if the
    # three characters read do not spell `nil`.
    protected def readNull : RSScalar
      first = readChar || raise "Can't read nil, but expected to"
      second = readChar || raiseParseError("Unexpected end of stream")
      third = readChar || raiseParseError("Unexpected end of stream")

      spellsNil = first.downcase == 'n' && second.downcase == 'i' && third.downcase == 'l'
      raiseParseError("Bad null value") unless spellsNil

      RSScalar.new(nil)
    end

    # Skips spaces and tabs, consumes a single `,` if one comes next, then
    # skips any whitespace (including newlines) that follows.
    @[AlwaysInline]
    protected def maybeReadComma : Nil
      skipSpaces
      if peekChar == ','
        readChar
      end
      skipWhitespace
    end

    # Reads a single value from the stream.  Leading whitespace and `;`
    # comments are skipped first.  The kind of value is picked from its first
    # character: a number, an object, an array, a string, a boolean, or null.
    # An optional trailing comma and any whitespace after the value are
    # consumed as well.
    #
    # Raises a parse error if the stream ends before a value is found, or if
    # the first character cannot start any value.
    protected def readValue : RSValue?
      value : RSValue? = nil

      # Step over any comments that come before the value.
      skipWhitespace
      c = peekChar
      while c == ';'
        readComment
        skipWhitespace
        c = peekChar
      end
      raiseParseError("Expected a value") if c.nil?

      # Dispatch on the first character of the value.
      value = if c.to_i?(10) || c == '#' || c == '+' || c == '-'
                readNumber
              elsif c == '{'
                readObject
              elsif c == '['
                readArray
              elsif c == '"'
                readString
              elsif c.downcase == 't' || c.downcase == 'f'
                readBoolean
              elsif c.downcase == 'n'
                readNull
              else
                raiseParseError("Unexpected character where a value was expected: '#{c}'")
              end

      maybeReadComma
      skipWhitespace
      value
    end

    # Reads an object (`{ key: value ... }`) from the stream and returns it.
    # The next character in the stream must be the opening `{`.  The closing
    # `}`, an optional comma after it, and any following whitespace are
    # consumed.  Whitespace and `;` comments between entries are skipped.
    #
    # Raises a parse error if the stream ends before the closing `}`, if
    # illegal whitespace is found, or if something other than a key or the end
    # of the object is found.
    private def readObject : RSObject
      ret : RSObject = RSObject.new
      raise "Expected start of hash" unless readChar == '{'
      skipWhitespace

      loop do
        c = peekChar
        case
        when c.nil?
          # Reaching the end of the stream here means the object was never
          # closed, so report it rather than returning a truncated object.
          raiseParseError("Unterminated object (end of stream reached)")
        when c == '}'
          readChar
          maybeReadComma
          break
        when whitespaceChar(c)
          readChar
        when illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        when c == ';'
          readComment
          skipWhitespace
        when c == '"' || !(c == '{' || c == '}' || c == '[' || c == ']')
          key : String = readKeyName
          val : RSValue|Nil = readValue
          if val.nil?
            raiseParseError("Unexpected end of stream")
          else
            ret[key] = val.as(RSValue)
          end
        else
          raiseParseError("Expected a key or the end of an object")
        end
      end

      ret
    end

    # Reads an array (`[ value ... ]`) from the stream and returns it.  The
    # next character in the stream must be the opening `[`.  The closing `]`,
    # an optional comma after it, and any following whitespace are consumed.
    # Whitespace and `;` comments between elements are skipped.
    #
    # Raises a parse error if the stream ends before the closing `]`, if
    # illegal whitespace is found, or if something other than a value or the
    # end of the array is found.
    private def readArray : RSArray
      ret : RSArray = RSArray.new
      raise "Expected start of array" unless readChar == '['
      skipWhitespace

      loop do
        c = peekChar
        case
        when c.nil?
          # Reaching the end of the stream here means the array was never
          # closed, so report it rather than returning a truncated array.
          raiseParseError("Unterminated array (end of stream reached)")
        when c == ']'
          readChar
          maybeReadComma
          break
        when whitespaceChar(c)
          readChar
        when illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        when c == ';'
          readComment
          skipWhitespace
        when c == '"' || c == '{' || c == '}' || c == '[' || c == ']' ||
             c.to_i?(10) || c == '+' || c == '-' || c == '#' ||
             c.downcase == 't' || c.downcase == 'f' || c.downcase == 'n'
          # Booleans and null are matched case-insensitively here, the same way
          # #readValue dispatches on them.
          val : RSValue|Nil = readValue
          if val.nil?
            raiseParseError("Unexpected end of stream")
          else
            ret << val.as(RSValue)
          end
          skipWhitespace
        else
          raiseParseError("Expected a value or the end of an array")
        end
      end

      ret
    end

    # Peeks at the first character of the stream and raises an
    # `RSConfBOMError` if it looks like the start of a byte-order-mark, or is
    # otherwise junk that cannot begin a document.  Nothing is consumed.  This
    # has to run before any initial whitespace is skipped.
    #
    # The set of suspicious codepoints was determined empirically by writing
    # raw bytes into a byte stream, converting it to a string, and reading from
    # that string.  Since some of these characters would not be valid outside
    # of a string anyway, the check doubles as an early "is the first character
    # a bad one?" test.
    #
    # https://en.wikipedia.org/wiki/Byte_order_mark
    protected def testForBOM : Nil
      first = peekChar
      return if first.nil?

      suspicious = {
        0xFEFF, # Zero width no-break space
        0xFFFD, # Replacement character
        0x2B,   # +
        0x0E,   # So
      }

      if suspicious.includes?(first.ord)
        raise RSConfBOMError.new("Possible byte-order-mark detected, or junk in toplevel")
      end
    end

    # Reads a whole RSConf document from the stream and returns its toplevel
    # value.
    #
    # The toplevel is one of: an explicit object (`{ ... }`), an explicit array
    # (`[ ... ]`), or a bare sequence of `key: value` pairs that are collected
    # into an implicit object.  A document with no content yields an empty
    # object.  Whitespace and `;` comments at the toplevel are skipped.
    #
    # Raises an `RSConfBOMError` if the stream starts with a suspicious
    # character, and a parse error on illegal whitespace, on an explicit object
    # or array appearing after bare keys, on an unexpected toplevel character,
    # or on junk after the document.
    private def readDocument : RSTopLevel
      doc : RSObject|RSArray|Nil = nil
      implicitObject : Bool = false

      testForBOM
      skipWhitespace
      loop do
        c = peekChar
        if c.nil?
          break
        elsif whitespaceChar(c)
          skipWhitespace
        elsif illegalWhitespaceChar(c)
          raiseIllegalWhitespace(c)
        elsif c == ';'
          readComment
        elsif c == '{'
          raiseParseError("Unexpected start of document object") if implicitObject
          doc = readObject
          break
        elsif c == '['
          raiseParseError("Unexpected start of document array") if implicitObject
          doc = readArray
          break
        elsif c == '"' || c.alphanumeric?
          # A bare key at the toplevel: lazily create the implicit object the
          # first time one is seen.
          unless implicitObject
            implicitObject = true
            RemiLib.assert(doc.nil?)
            doc = RSObject.new
          end

          RemiLib.assert(doc.is_a?(RSObject))
          key : String = readKeyName
          val : RSValue|Nil = readValue
          if val.nil?
            raiseParseError("Unexpected end of stream")
          else
            doc[key] = val.as(RSValue)
          end
        else
          raiseParseError("Unexpected character at toplevel: '#{c}'")
        end
      end

      # Nothing except whitespace and comments may follow the document.
      skipWhitespaceAndComments
      raiseParseError("Unexpected junk at toplevel") unless readChar.nil?

      doc || RSObject.new
    end
  end
end