;;;; CL-SDM - Opinionated Extra Batteries for Common Lisp
;;;; Copyright (C) 2021-2025 Remilia Scarlet <remilia@posteo.jp>
;;;; Copyright 2012-2025 Manas Technology Solutions
;;;;
;;;; This program is free software: you can redistribute it and/or modify it
;;;; under the terms of the GNU Affero General Public License as published by
;;;; the Free Software Foundation, either version 3 of the License, or (at your
;;;; option) any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful, but WITHOUT
;;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;;;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
;;;; License for more details.
;;;;
;;;; You should have received a copy of the GNU Affero General Public License
;;;; along with this program.  If not, see <https://www.gnu.org/licenses/>.
(in-package :cl-sdm)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;
;;;; BINARY-UTF-8-STREAM Class and Methods
;;;;
;;;; This is partially based on the IO class in the Crystal standard library.
;;;;
;;;; Using this mixin can be somewhat complicated, and so most users will
;;;; probably want to use one of the other classes that use these mixins in
;;;; CL-SDM, such as the MEMORY-UTF-8-STREAM class.
;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; Conditions, DEFTYPEs, Generic Functions, and Classes
;;;

(define-condition utf-8-byte-sequence-error (stream-error simple-error)
  ())

(defmacro utf-8-byte-sequence-error (stream msg &rest fmt-args)
  `(error 'utf-8-byte-sequence-error :stream ,stream
                                     :format-arguments (list ,@fmt-args)
                                     :format-control ,msg))

(deftype t/utf-8-codepoint-byte ()
  '(integer 0 4))

(deftype t/utf-8-stream-back-buffer ()
  `(vector character))

(deftype t/utf-8-stream-back-size-buffer ()
  `(vector t/utf-8-codepoint-byte))

(defgeneric binary-utf-8-stream-reset (stream)
  (:documentation "Resets the internal state of STREAM."))

(defclass binary-utf-8-input-stream (trivial-gray-streams:fundamental-binary-input-stream)
  ((input-buffer
    :initform (new-vector character)
    :type t/utf-8-stream-back-buffer)

   (input-buffer-sizes
    :initform (new-vector t/utf-8-codepoint-byte)
    :type t/utf-8-stream-back-size-buffer))
  (:documentation "A mixin class that allows reading of character data from a binary stream.  It is
not meant to be created directly.  This implements STREAM-READ-CHAR and
STREAM-READ-LINE.  An example that uses this is the MEMORY-UTF-8-STREAM class.

Descendent classes that use this mixin must implement
TRIVIAL-GRAY-STREAMS:STREAM-READ-BYTE and
TRIVIAL-GRAY-STREAMS:STREAM-FILE-POSITION.

Implementing classes may optionally use the following functions for reading
tasks if they need to implement custom functionality, or reimplement part of the
Gray Streams protocol for their particular class.  These are the same ones used
internally by the Gray Stream protocol for BINARY-UTF-8-INPUT-STREAM and
READ-LINE* methods, and are all declared INLINE.

* BINARY-UTF-8-INPUT-STREAM-BACK-BUFFER-LENGTH
* BINARY-UTF-8-INPUT-STREAM-BACK-BUFFER-BYTE-LENGTH
* BINARY-UTF-8-INPUT-STREAM-DO-READ-CHAR
* BINARY-UTF-8-INPUT-STREAM-DO-READ-LINE
* BINARY-UTF-8-INPUT-STREAM-DO-UNREAD-CHAR"))

(defclass binary-utf-8-output-stream (trivial-gray-streams:fundamental-binary-output-stream)
  ()
  (:documentation "A mixin class that allows writing of character data from a binary stream.  It is
not meant to be created directly.  This implements STREAM-WRITE-CHAR and
STREAM-WRITE-LINE.  An example that uses this is the MEMORY-UTF-8-STREAM class.

Descendent classes that use this mixin must implement
TRIVIAL-GRAY-STREAMS:WRITE-SEQUENCE for (UNSIGNED-BYTE 8) sequences.

Implementing classes may optionally use the following functions for writing
tasks if they need to implement custom functionality, or reimplement part of the
Gry Streams protocol for their particular class.  These are the same ones used
internally by the Gray Stream protocol for BINARY-UTF-8-OUTPUT-STREAM methods,
and are all declared INLINE.

* BINARY-UTF-8-OUTPUT-STREAM-DO-WRITE-CHAR"))

;;;
;;; Implementation Functions
;;;
;;; The methods actually call these functions.  These are all separate, and
;;; INLINEd, so that descendent classes can implement customized behaviors
;;; as-needed.  See the MEMORY-UTF-8-STREAM class as an example, where it has to
;;; account for its internal position.
;;;

(define-typed-fn binary-utf-8-input-stream-back-buffer-length ((binary-utf-8-input-stream stream))
    (t/ufixnum t)
  "Returns the number of characters currently in the back buffer.  These are
characters that have been unread, or that were read but could not be returned as
part of a call to READ-LINE*.

This function is meant to be used by methods that operate on classes that
implement the BINARY-UTF-8-INPUT-STREAM class, not directly by users."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (length (the t/utf-8-stream-back-buffer (slot-value stream 'input-buffer))))

(define-typed-fn binary-utf-8-input-stream-back-buffer-byte-length ((binary-utf-8-input-stream stream))
    ((integer 0 #.(* most-positive-fixnum 4)) t)
  "Returns the number of bytes across all characters currently in the back buffer.
These are characters that have been unread, or that were read but could not be
returned as part of a call to READ-LINE*.

This function is meant to be used by methods that operate on classes that
implement the BINARY-UTF-8-INPUT-STREAM class, not directly by users."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0))
           #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note))
  (loop for elt of-type (integer 0 4)
          across (the t/utf-8-stream-back-size-buffer
                      (slot-value stream 'input-buffer-sizes))
        summing elt))

(define-typed-fn binary-utf-8-input-stream-do-read-char ((binary-utf-8-input-stream stream))
    ((values (or character (eql :eof)) (integer 0 4)) t)
  "Actually implements the code necessary for reading a UTF-8 character from
STREAM.  This returns two values: either a character or :EOF, and the number of
bytes that were read."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (labels
      ((read-masked (stream)
         (let ((ret (trivial-gray-streams:stream-read-byte stream)))
           (when (eq ret :eof)
             (utf-8-byte-sequence-error stream "Incomplete UTF-8 byte sequence"))
           (unless (= (logand ret #xC0) #x80)
             (utf-8-byte-sequence-error
              stream "Unexpected continuation byte #x~x in UTF-8 byte sequence" ret))
           ret)))

    ;; Check to see if we have anything in our input buffer.  This can happen if
    ;; UNREAD-CHAR was used, or if READ-LINE* was used and ended up straddling a
    ;; multi-byte character.
    (with-typed-slots ((t/utf-8-stream-back-buffer input-buffer)
                       (t/utf-8-stream-back-size-buffer input-buffer-sizes))
        stream
      (when (plusp (length input-buffer))
        (return-from binary-utf-8-input-stream-do-read-char
          (values (vector-pop input-buffer)
                  (vector-pop input-buffer-sizes)))))

    (let ((first (trivial-gray-streams:stream-read-byte stream))
          (second 0)
          (third 0)
          (fourth 0))
      (declare (type (or t/uint8 (eql :eof)) first)
               (type t/uint8 second third fourth))

      (cond
        ((eq first :eof)
         (return-from binary-utf-8-input-stream-do-read-char (values :eof 0)))
        ((< first #x80)
         (return-from binary-utf-8-input-stream-do-read-char (values (code-char first) 1)))
        ((< first #xC2)
         (utf-8-byte-sequence-error stream "Unexpected byte #x~x in UTF-8 byte sequence" first)))

      (setf second (read-masked stream))
      (when (< first #xE0)
        (return-from binary-utf-8-input-stream-do-read-char
          (values (code-char (+ (ash first 6) (- second #x3080)))
                  2)))

      (setf third (read-masked stream))
      (when (< first #xF0)
        (cond
          ((and (= first #xE0)
                (< second #xA0))
           (utf-8-byte-sequence-error stream "Overlong UTF-8 encoding"))
          ((and (= first #xED)
                (>= second #xA0))
           (utf-8-byte-sequence-error stream "Invalid UTF-8 codepoint")))
        (return-from binary-utf-8-input-stream-do-read-char
          (values (code-char (+ (ash first 12)
                                (ash second 6)
                                (- third #xE2080)))
                  3)))

      (when (< first #xF5)
        (cond
          ((and (= first #xF0)
                (< second #x90))
           (utf-8-byte-sequence-error stream "Overlong UTF-8 encoding"))
          ((and (= first #xF4)
                (>= second #x90))
           (utf-8-byte-sequence-error stream "Invalid UTF-8 codepoint")))

        (setf fourth (read-masked stream))
        (return-from binary-utf-8-input-stream-do-read-char
          (values (code-char (+ (ash first 18)
                                (ash second 12)
                                (ash third 6)
                                (- fourth #x3C82080)))
                  4)))

      (utf-8-byte-sequence-error stream "Unexpected byte #x~x in UTF-8 byte sequence"))))

(define-typed-fn binary-utf-8-input-stream-do-read-line ((binary-utf-8-input-stream stream))
    ((values simple-string boolean) t)
  "Actually implements the code necessary for reading a line of UTF-8 character fro
STREAM as a string.  Returns two values: the string that was read, and a boolean
indicating if it was stopped by an end-of-file (T) or by a newline (NIL)."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (let ((ret (with-output-to-string (out)
               (loop for char of-type (or (eql :eof) character) = (trivial-gray-streams:stream-read-char stream)
                     if (or (eq char :eof)
                            (char= char #\Newline))
                       do (loop-finish)
                     else do
                       (write-char char out)))))
    (if (zerop (length ret))
        (values ret t)
        (values ret nil))))

(define-typed-fn binary-utf-8-input-stream-do-unread-char ((binary-utf-8-input-stream stream) (character character))
    (null t)
  "Actually implements the code necessary for un-reading a UTF-8 character from
STREAM."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (with-typed-slots ((t/utf-8-stream-back-buffer input-buffer)
                     (t/utf-8-stream-back-size-buffer input-buffer-sizes))
      stream
    (vector-push-extend character input-buffer)
    (vector-push-extend (round (integer-length (char->utf-8 character)) 8) input-buffer-sizes))
  nil)

(define-typed-fn binary-utf-8-output-stream-do-write-char ((binary-utf-8-output-stream stream) (character character))
    (character t)
  "Actually implements the code necessary for writing a UTF-8 character to STREAM.
Returns CHARACTER."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (let ((bytes (sdm:uint->bytes (char->utf-8 character))))
    (declare (type list bytes)
             (dynamic-extent bytes))
    (trivial-gray-streams:stream-write-sequence stream bytes 0 (length bytes)))
  character)

(define-typed-fn binary-utf-8-output-stream-do-write-string ((binary-utf-8-output-stream stream) (string string)
                                                             (integer start) ((or null integer) end))
    (string t)
  "Actually implements the code necessary for writing a UTF-8 string to STREAM.
Returns STRING."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (muffling
    (loop for byte across (babel:string-to-octets (subseq string (or start 0) end) :encoding :utf-8)
          do (cl:write-byte byte stream)))
  string)

(define-typed-fn binary-utf-8-input-stream-do-read-line* ((binary-utf-8-input-stream stream) (t/ufixnum limit))
    ((values simple-string t/utf-8-codepoint-byte) t)
  "Actually implements the code necessary for doing READ-LINE* on a
BINARY-UTF-8-INPUT-STREAM.  Returns the string that was read, and the number of
incomplete bytes consumed (i.e. if LIMIT ended up straddling a character, so the
character couldn't be completely read, then INCOMPLETE-BYTES will say how many
bytes into that next character were read)."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))

  (let ((remaining limit)
        (incomplete 0)
        (out (make-string-output-stream)))
    (declare (type fixnum remaining)
             (type t/utf-8-codepoint-byte incomplete)
             (type string-stream out))

    (with-typed-slots ((t/utf-8-stream-back-buffer input-buffer)
                       (t/utf-8-stream-back-size-buffer input-buffer-sizes))
        stream
      (when (plusp (length input-buffer))
        ;; Pop off what we can by taking from the input buffer that we've
        ;; accumulated elsewhere.
        (loop while (and (plusp (length input-buffer))
                         (not (minusp (- remaining (the (integer 0 4) (vec-last input-buffer-sizes))))))
              do (write-char (vector-pop input-buffer) out)
                 (decf remaining (vector-pop input-buffer-sizes))))

      ;; This must be true, even if we've adjusted it by taking from the input
      ;; buffer.
      (assert (not (minusp remaining)))

      ;; Did we fulfill what we needed already?  If not, we can now start
      ;; reading characters normally.
      (unless (zerop remaining)
        (loop do
          (multiple-value-bind (char bytes-read)
              (binary-utf-8-input-stream-do-read-char stream)
            (declare (type (or character (eql :eof)) char))

            (cond
              ((eql char :eof)
               (loop-finish))

              ((minusp (- remaining bytes-read))
               ;; We've read a full character, but this is a multi-byte
               ;; character that puts us over LIMIT.  So, we store this byte
               ;; for later in the input buffer, then return what we've read
               ;; before now.
               (vector-push-extend char input-buffer)
               (vector-push-extend bytes-read input-buffer-sizes)
               (setf incomplete bytes-read)
               (loop-finish))

              (t
               (assert (not (minusp remaining)))
               (when (char= char #\Newline)
                 (loop-finish))
               (write-char char out)
               (when (zerop (decf remaining bytes-read))
                 (loop-finish))))))))
    (values (get-output-stream-string out) incomplete)))

;;;
;;; Methods
;;;

(defmethod trivial-gray-streams:stream-read-char ((stream binary-utf-8-input-stream))
  "Reads the next byte from STREAM and returns it.  Or, if there are no more bytes
to read, this returns :EOF."
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (nth-value 0 (binary-utf-8-input-stream-do-read-char stream)))

(defmethod trivial-gray-streams:stream-read-line ((stream binary-utf-8-input-stream))
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (binary-utf-8-input-stream-do-read-line stream))

(defmethod trivial-gray-streams:stream-unread-char ((stream binary-utf-8-input-stream) (character character))
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (binary-utf-8-input-stream-do-unread-char stream character))

(defmethod trivial-gray-streams:stream-write-char ((stream binary-utf-8-output-stream) (character character))
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (binary-utf-8-output-stream-do-write-char stream character))

(defmethod trivial-gray-streams:stream-write-string ((stream binary-utf-8-output-stream) (string string)
                                                     &optional start end)
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (assert (typep start '(or null integer)))
  (assert (typep end '(or null integer)))
  (binary-utf-8-output-stream-do-write-string stream string (or start 0) end))

(defmethod read-line* ((stream binary-utf-8-input-stream) (limit integer) &optional recursive-p)
  "Same as READ-LINE, but only reads up to LIMIT characters from STREAM.  The
newline, if encountered, is read from the stream, but is not counted towards and
is not included in the returned value.

This ignores the RECURSIVE-P argument."
  (declare (ignore recursive-p)
           (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (assert (typep limit 't/ufixnum))
  (nth-value 0 (binary-utf-8-input-stream-do-read-line* stream limit)))

(defmethod binary-utf-8-stream-reset ((stream binary-utf-8-input-stream))
  (declare (optimize speed (debug 1) (safety 1) (compilation-speed 0)))
  (setf (slot-value stream 'input-buffer) (new-vector character))
  (setf (slot-value stream 'input-buffer-sizes) (new-vector t/utf-8-codepoint-byte))
  t)
