-- Pandoc custom writer for TASVideos forum markup.
-- https://tasvideos.org/ForumMarkup
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/BbParser.cs
--
-- Usage (see https://pandoc.org/MANUAL.html#custom-readers-and-writers):
-- pandoc -t tasvideos_forum.lua input.md
-- tasvideos_forum.lua may be given as an absolute or relative path, or it can
-- be placed in the "custom" subdirectory of the user data directory:
-- https://pandoc.org/MANUAL.html#option--data-dir. Many input formats may be
-- used, see https://pandoc.org/MANUAL.html#option--from.
--
-- To output specific BBCode tags, including TASVideos-specific markup such as
-- [movie] and [post], represent them in Markdown with raw attribute tags and a
-- format name of "tasvideos_forum" (see
-- https://pandoc.org/MANUAL.html#generic-raw-attribute). Like so:
-- This is `[frames]100[/frames]`{=tasvideos_forum} faster than `[movie]1234[/movie]`{=tasvideos_forum}.
--
-- ```{=tasvideos_forum}
-- [note]
-- More details are in [post=1234]this post[/post].
-- [/note]
-- ```
--
-- The download filename for code blocks can be provided by a "filename"
-- attribute, like so:
-- ``` {filename=test.lua}
-- print("hello")
-- ```
--
-- Image paths/URLs are converted to data URIs when the embed_resources
-- extension is activated. This is similar to the --embed-resources option for
-- the pandoc program.
-- pandoc -t tasvideos_forum.lua+embed_resources input.md
-- The raw-attribute format name recognized by RawBlock and RawInline. Raw
-- elements with this format are copied to the output unescaped; raw elements
-- in any other format are skipped with a warning.
local MY_FORMAT = "tasvideos_forum"
-- Extensions understood by this writer. embed_resources is false here, so it
-- only takes effect when requested, e.g. `-t tasvideos_forum.lua+embed_resources`.
Extensions = {
-- Convert [img] tag paths/URLs to data URIs. We implement the flag as
-- an extension because we cannot access the Pandoc --embed-resources
-- option: https://github.com/jgm/pandoc/discussions/9978.
embed_resources = false,
}
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/BbParser.cs#L80
-- The important piece of information about each BBCode tag, for writing, is
-- whether child elements are allowed in it or not. Some tags, like [b], always
-- allow new child elements; some, like [code], never allow child elements; and
-- some, like [url], allow child elements only if a parameter is set on the
-- tag. For example, in:
-- [url]http://example.com/[b]path[/b][/url]
-- the `[b]...[/b]` is a literal part of the URL path. But with a parameter:
-- [url=http://example.com/path]text [b]label[/b][/url]
-- the `[b]...[/b]` results in bold text.
--
-- There are also "void" elements that are self-closing and never get an end
-- tag, like [hr]. The [*] element for list items is also treated as a void
-- tag, marking the beginning of items but not enclosing them.
-- Fields of each entry:
--   nesting            whether child elements are allowed inside the tag.
--   nesting_with_param if true, child elements are allowed when the start tag
--                      carries a parameter, even though nesting is false.
--   void               if true, the tag never gets an end tag and is never
--                      pushed onto the open-tag stack.
local TAGS = {
-- Inline formatting.
b = {nesting = true},
i = {nesting = true},
u = {nesting = true},
s = {nesting = true},
sub = {nesting = true},
sup = {nesting = true},
tt = {nesting = true},
-- Alignment and block containers.
left = {nesting = true},
right = {nesting = true},
center = {nesting = true},
quote = {nesting = true},
-- Contents are taken literally.
code = {nesting = false},
img = {nesting = false},
-- Literal contents unless a parameter is given.
url = {nesting = false, nesting_with_param = true},
size = {nesting = true},
hr = {nesting = false, void = true},
-- Lists: [*] only marks the start of an item.
list = {nesting = true},
["*"] = {nesting = false, void = true},
-- Tables.
table = {nesting = true},
tr = {nesting = true},
td = {nesting = true},
th = {nesting = true},
}
-- Pass x through unchanged, raising msg if x is nil. Unlike plain assert,
-- a value of false is accepted.
local function assert_not_nil(x, msg)
    local present = (x ~= nil)
    assert(present, msg)
    return x
end
-- Return the TAGS entry for the named tag, raising an error if the tag is
-- not listed in TAGS.
local function lookup_tag(tag)
    local info = TAGS[tag]
    local missing_msg = string.format("%q missing from TAGS", tag)
    return assert_not_nil(info, missing_msg)
end
-- We first process the Pandoc AST into a linear sequence of tokens, where a
-- token is one of the yield_* types below: a start tag, an end tag, text to be
-- escaped, raw text to be output without escaping, a carriage return, a blank
-- line, or a special token that cancels blank lines.
--
-- The reason to emit a preliminary sequence of tokens, rather than just having
-- each AST node produce some text directly, is for proper escaping of text
-- that may be broken across nodes--notably of hyperlinks, which are supposed
-- to be escaped to prevent auto-linkification by the BBCode parser. Consider
-- this HTML input:
-- <code><span>http:</span><span>//example.com</span></code>
-- Because HTML <span> has no representation in BBCode, if we were to have each
-- pandoc.Span node simply return its escaped text contents, the above would
-- render to the BBCode:
-- [tt]http://example.com[/tt]
-- Because the text was not a link in the input, it should not be a link in the
-- output. But this output text is not escaped and will wrongly be linkified by
-- the BBCode parser.
--
-- The intermediate sequence of tokens is a fix for this problem. For this
-- example, we first produce the tokens:
-- {type = "start_tag", tag = "tt"}
-- {type = "text", text = "http:"}
-- {type = "text", text = "//example.com"}
-- {type = "end_tag", tag = "tt"}
-- Then, there is an intermediate step that consolidates adjacent "text"
-- tokens:
-- {type = "start_tag", tag = "tt"}
-- {type = "text", text = "http://example.com"}
-- {type = "end_tag", tag = "tt"}
-- With adjacent text tokens being joined into complete strings, the BBCode
-- output can be properly escaped:
-- [tt]http[noparse]://[/noparse]example.com[/tt]
--
-- The cancel_blankline token is a workaround for a visual issue with list
-- rendering on TASVideos. In short, after a [list] element, we want only a
-- carriage return, not a blank line. By itself, a cancel_blankline token does
-- nothing. Adjacent to any number of blankline tokens (before or after), it
-- cancels the blanklines and turns them into a single cr token.
-- Token emitters. Each one builds a single token table and hands it to the
-- consumer of the enclosing coroutine.

-- Start tag, with an optional parameter (`[tag]` or `[tag=param]`).
local function yield_start_tag(tag, param)
    local token = {type = "start_tag", tag = tag, param = param}
    coroutine.yield(token)
end

-- End tag (`[/tag]`).
local function yield_end_tag(tag)
    local token = {type = "end_tag", tag = tag}
    coroutine.yield(token)
end

-- Text that is subject to escaping when rendered.
local function yield_text(text)
    local token = {type = "text", text = text}
    coroutine.yield(token)
end

-- Text that is output as is, without escaping.
local function yield_raw_text(text)
    local token = {type = "raw_text", text = text}
    coroutine.yield(token)
end

-- Carriage return.
local function yield_cr()
    local token = {type = "cr"}
    coroutine.yield(token)
end

-- Blank line.
local function yield_blankline()
    local token = {type = "blankline"}
    coroutine.yield(token)
end

-- Marker that turns adjacent blankline tokens into a single cr.
local function yield_cancel_blankline()
    local token = {type = "cancel_blankline"}
    coroutine.yield(token)
end
-- Keep track of footnote bodies in this table, in order to output the note
-- bodies at the bottom of the document. Inlines.Note appends to the table, and
-- tokenize_doc reads it after the main document body has been tokenized.
local footnotes = {}
-- Tables of tokenization functions for Pandoc node types. This is like the
-- pandoc.scaffolding.Writer machinery, though we do not actually use
-- pandoc.scaffolding.Writer. tokenize_doc is called on the top-level
-- pandoc.Pandoc, and in turn recursively calls tokenize_blocks and
-- tokenize_inlines, which consult the Blocks and Inlines tables of per-node
-- tokenization functions.
--
-- Both tables are keyed by the element's tag (el.tag), and every function in
-- them is called as fn(el, opts). A missing entry is a hard error.
local Blocks = {}
local Inlines = {}
-- Tokenize a list of block elements, separating consecutive blocks with a
-- blankline token. Raises an error for a block type with no Blocks handler.
local function tokenize_blocks(blocks, opts)
    assert_not_nil(opts, "tokenize_blocks opts")
    local first = true
    for _, block in ipairs(blocks) do
        if first then
            first = false
        else
            yield_blankline()
        end
        local handler = Blocks[block.tag]
        assert(handler, string.format("missing Blocks[%q]", block.tag))
        handler(block, opts)
    end
end
-- Tokenize a list of inline elements in order, with nothing inserted between
-- them. Raises an error for an inline type with no Inlines handler.
local function tokenize_inlines(inlines, opts)
    assert_not_nil(opts, "tokenize_inlines opts")
    for _, inline in ipairs(inlines) do
        local handler = Inlines[inline.tag]
        assert(handler, string.format("missing Inlines[%q]", inline.tag))
        handler(inline, opts)
    end
end
-- Put start and end BBCode tags around some other tokens. This function yields
-- a start tag/param, then calls fn with no arguments, then yields an end tag.
--
-- param is optional, and for that matter tag is also optional: if tag is nil,
-- this function just calls fn, without adding any start or end tags. sep is an
-- optional function that returns tokens to insert between the tags and the
-- result of calling fn.
local function enclose(tag, param, fn, sep)
    -- No tag: just run fn. A param without a tag makes no sense.
    if tag == nil then
        assert(param == nil, param)
        fn()
        return
    end
    -- start tag, separator, contents, separator, end tag.
    yield_start_tag(tag, param)
    if sep then sep() end
    fn()
    if sep then sep() end
    yield_end_tag(tag)
end
-- Convenience wrappers around enclose for the three common kinds of
-- contents. As with enclose, a nil tag emits the contents with no tags.

-- Enclose the tokenization of a list of blocks.
local function enclose_blocks(tag, param, blocks, opts)
    local function body()
        tokenize_blocks(blocks, opts)
    end
    enclose(tag, param, body)
end

-- Enclose the tokenization of a list of inlines.
local function enclose_inlines(tag, param, inlines, opts)
    local function body()
        tokenize_inlines(inlines, opts)
    end
    enclose(tag, param, body)
end

-- Enclose a single text token.
local function enclose_text(tag, param, text)
    local function body()
        yield_text(text)
    end
    enclose(tag, param, body)
end
-- Helper function for pandoc.BulletList and pandoc.OrderedList. param should
-- be either nil (for a bullet list) or "1" (for an ordered list).
local function tokenize_list(param, items, opts)
    -- Each item is a [*] marker followed by the item's blocks; items are
    -- separated by a carriage return.
    local function emit_items()
        local first = true
        for _, item in ipairs(items) do
            if not first then yield_cr() end
            first = false
            yield_start_tag("*")
            tokenize_blocks(item, opts)
        end
    end
    -- A carriage return also goes just inside the [list] and [/list] tags.
    enclose("list", param, emit_items, yield_cr)
    -- Bootstrap, as used by TASVideos, puts a large margin-bottom after
    -- top-level lists:
    -- https://github.com/twbs/bootstrap/blob/v5.3.1/dist/css/bootstrap.css#L294
    -- A blank line after [/list] would leave too much vertical space, so ask
    -- for the blankline that tokenize_blocks inserts between blocks to be
    -- reduced to a carriage return.
    --
    -- Nested lists have no margin-bottom:
    -- https://github.com/twbs/bootstrap/blob/v5.3.1/dist/css/bootstrap.css#L301
    -- but they render the same with a carriage return or a blank line, so
    -- they need no special treatment.
    yield_cancel_blankline()
end
-- Block quote: the quoted blocks inside [quote]...[/quote].
function Blocks.BlockQuote(el, opts)
    local quoted = el.content
    enclose("quote", nil, function ()
        tokenize_blocks(quoted, opts)
    end)
end

-- Bullet list: a [list] with no parameter.
function Blocks.BulletList(el, opts)
    local items = el.content
    tokenize_list(nil, items, opts)
end
-- CodeBlock element classes that are known not to be language tags.
-- Used as a set: class name -> true.
local NON_LANGUAGE_CLASSES = {
    numberLines = true,
    sourceCode = true,
}
-- Code block: [code]...[/code], with an optional parameter.
--
-- The [code] parameter is read by the forum as a filename if it contains a
-- dot, and as a language name otherwise:
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/Node.cs#L300
-- Only one of the two can be expressed. The "filename" attribute wins when
-- it contains a dot (so that it will be recognized as a filename);
-- otherwise the language is taken from the element's classes.
function Blocks.CodeBlock(el, opts)
    -- A class can serve as the language tag if it is not one of the known
    -- non-language classes (numberLines, sourceCode, ...) and has no dot.
    -- With the ```haskell shortcut syntax the language is the first class.
    local function is_language_class(class)
        return not NON_LANGUAGE_CLASSES[class] and not string.match(class, "%.")
    end

    local param = el.attr.attributes["filename"]
    local usable_filename = param ~= nil and string.find(param, ".", 1, true) ~= nil
    if not usable_filename then
        -- First plausible language class, or nil if there is none.
        param = el.attr.classes:find_if(is_language_class)
    end
    enclose_text("code", param, el.text)
end
-- Definition list: each term in bold on its own line, followed by its
-- definitions as a bullet list. Entries are separated by a carriage return.
function Blocks.DefinitionList(el, opts)
    for index, entry in ipairs(el.content) do
        local term, definitions = entry[1], entry[2]
        if index ~= 1 then yield_cr() end
        enclose_inlines("b", nil, term, opts)
        yield_cr()
        local as_list = pandoc.BulletList(definitions)
        tokenize_blocks({as_list}, opts)
    end
end
-- Div: the wrapper is not represented; only its child blocks are emitted.
function Blocks.Div(el, opts)
    local children = el.content
    tokenize_blocks(children, opts)
end
-- Figure: the figure's blocks, followed on the next line by the caption in
-- bold. A figure with an empty caption gets no caption line at all, matching
-- how Blocks.Table treats an empty caption (otherwise an empty `[b][/b]`
-- would be emitted).
function Blocks.Figure(el, opts)
    -- Figure
    tokenize_blocks(el.content, opts)
    -- Caption
    if next(el.caption.long) then
        yield_cr()
        enclose_blocks("b", nil, el.caption.long, opts)
    end
end
-- Header: BBCode has no heading elements, so imitate one with bold text in
-- a [size] tag. The sizes follow the HTML rendering defaults for h1..h6:
-- https://html.spec.whatwg.org/multipage/rendering.html#sections-and-headings
function Blocks.Header(el, opts)
    -- Index is the header level; anything past level 5 uses the h6 size.
    local sizes_by_level = {"2em", "1.5em", "1.17em", "1em", "0.83em"}
    local size = sizes_by_level[el.level] or "0.67em"
    local heading = pandoc.Inlines({pandoc.Strong(el.content)})
    enclose_inlines("size", size, heading, opts)
end
-- Horizontal rule: the void [hr] tag, which has no end tag.
function Blocks.HorizontalRule(el, opts)
    local tag = "hr"
    yield_start_tag(tag)
end
-- Line block: the lines in order, joined by hard line breaks.
function Blocks.LineBlock(el, opts)
    local first = true
    for _, line in ipairs(el.content) do
        if not first then
            tokenize_inlines({pandoc.LineBreak()}, opts)
        end
        first = false
        tokenize_inlines(line, opts)
    end
end
-- Ordered list: a [list=1]. Only lists that start at 1 are handled; any
-- other start number raises an error with a readable message. The numbering
-- style and delimiter of the list are ignored.
function Blocks.OrderedList(el, opts)
    local start = el.listAttributes.start
    assert(start == 1, string.format(
        "unsupported ordered list start value: %s (expected 1)", tostring(start)))
    -- ignore el.listAttributes.style
    -- ignore el.listAttributes.delimiter
    tokenize_list("1", el.content, opts)
end
-- Para and Plain are both emitted as just their inline contents; paragraph
-- separation comes from the blankline that tokenize_blocks puts between
-- blocks.
function Blocks.Para(el, opts)
    local inlines = el.content
    tokenize_inlines(inlines, opts)
end

function Blocks.Plain(el, opts)
    local inlines = el.content
    tokenize_inlines(inlines, opts)
end
-- Raw block: passed through unescaped if it is in our own format;
-- otherwise skipped with a warning.
function Blocks.RawBlock(el, opts)
    if el.format ~= MY_FORMAT then
        pandoc.log.warn(string.format("not rendered: %q", tostring(el)))
        return
    end
    yield_raw_text(el.text)
end
-- We can represent most features of the pandoc.Table type, such as mid-table
-- headings, so we don't have to go to the extremity of calling
-- pandoc.utils.to_simple_table. But one thing we cannot represent is the
-- row_span and col_span of cells. This function breaks each m×n cell into 1×1
-- cells, with the original content in the upper left cell.
-- Return a new pandoc.Table built from el in which every cell has
-- row_span == 1 and col_span == 1. The original content of a spanning cell
-- stays in its upper left position; every other position it covered is
-- filled with a blank cell, so that each row keeps one cell per column.
-- Raises an error if a row_span is still pending after the last row.
local function despan_table(el)
    -- Maps a column index to the number of rows (including the row that
    -- set the entry) for which that column is occupied by a cell that
    -- started in an earlier row. A column with no entry is free for new
    -- cells.
    local pending = {}

    -- Decrement each element of pending by 1, and remove elements that
    -- become 0. Modifies pending in place. (Assigning to existing keys,
    -- including assigning nil, is permitted during a pairs traversal.)
    local function age_pending()
        for col in pairs(pending) do
            pending[col] = pending[col] - 1
            if pending[col] == 0 then
                pending[col] = nil
            end
        end
    end

    -- Return a list of new rows in which col_span cells are chopped up
    -- and columns occupied by earlier row_span cells are filled with
    -- blanks. Updates pending according to the row_spans of the cells in
    -- each row.
    local function despan_rows(rows)
        local new_rows = {}
        for _, row in ipairs(rows) do
            local new_cells = {}
            -- Age pending with each new row.
            age_pending()
            local col = 1
            for _, cell in ipairs(row.cells) do
                -- Insert blank 1×1 cells until finding a column that
                -- is not already pending.
                while pending[col] ~= nil do
                    table.insert(new_cells, pandoc.Cell({}, "AlignDefault", 1, 1, cell.attr))
                    col = col + 1
                end
                -- Insert this cell.
                table.insert(new_cells, pandoc.Cell(cell.contents, cell.alignment, 1, 1, cell.attr))
                pending[col] = cell.row_span
                col = col + 1
                -- Insert blanks up to this cell's col_span. Any columns
                -- touched become pending for the next row.
                for _ = 2, cell.col_span do
                    table.insert(new_cells, pandoc.Cell({}, "AlignDefault", 1, 1, cell.attr))
                    if pending[col] == nil then
                        pending[col] = cell.row_span
                    else
                        pending[col] = math.max(pending[col], cell.row_span)
                    end
                    col = col + 1
                end
            end
            -- Columns after the row's last cell may still be occupied by
            -- a row_span cell from an earlier row (for example, a
            -- spanning cell in the last column). Fill those with blanks
            -- too; otherwise this row would come out short. Entries at
            -- these columns can only date from earlier rows, because
            -- this row has only set entries for columns before col.
            while pending[col] ~= nil do
                table.insert(new_cells, pandoc.Cell({}, "AlignDefault", 1, 1))
                col = col + 1
            end
            table.insert(new_rows, pandoc.Row(new_cells, row.attr))
        end
        return new_rows
    end

    local head = pandoc.TableHead(despan_rows(el.head.rows), el.head.attr)
    local bodies = {}
    for _, body in ipairs(el.bodies) do
        table.insert(bodies, {
            head = despan_rows(body.head),
            body = despan_rows(body.body),
            row_head_columns = body.row_head_columns,
            attr = body.attr,
        })
    end
    local foot = pandoc.TableFoot(despan_rows(el.foot.rows), el.foot.attr)
    -- Pandoc should have normalized the table it gave us, such that there
    -- are no row_spans that extend past the final row.
    age_pending()
    assert(next(pending) == nil, "pending row_span at end of table")
    return pandoc.Table(el.caption, el.colspecs, head, bodies, foot, el.attr)
end
-- Return the BBCode alignment tag name ("left", "right" or "center") for a
-- table cell, or nil if no alignment tag is needed. A cell whose own
-- alignment is "AlignDefault" inherits the alignment of its column. Raises
-- an error for an unrecognized alignment.
local function table_cell_align_tag(cell_alignment, col_alignment)
    local effective = cell_alignment
    if effective == "AlignDefault" then
        effective = col_alignment
    end
    if effective == "AlignDefault" then
        return nil
    end
    local tag_names = {
        AlignLeft = "left",
        AlignRight = "right",
        AlignCenter = "center",
    }
    return assert(tag_names[effective], string.format("unknown alignment %q", effective))
end
-- Tokenize one table cell as cell_tag ("th" or "td") around the cell's
-- blocks, which are themselves wrapped in an alignment tag when one
-- applies. The cell must be 1×1.
local function tokenize_table_cell(cell, cell_tag, col_alignment, opts)
    local col_span, row_span = cell.col_span, cell.row_span
    assert(col_span == 1, string.format("col_span == %d", col_span))
    assert(row_span == 1, string.format("row_span == %d", row_span))
    local align_tag = table_cell_align_tag(cell.alignment, col_alignment)
    local function emit_contents()
        -- A nil align_tag emits the contents with no wrapper.
        enclose_blocks(align_tag, nil, cell.contents, opts)
    end
    enclose(cell_tag, nil, emit_contents)
end
-- If row_head_columns is nil, this is a header row: use [th] for every cell.
-- Otherwise row_head_columns is an integer that tells how many initial columns
-- to use [th] for; the rest will use [td].
local function tokenize_table_row(row, row_head_columns, col_alignments, opts)
    local function emit_cells()
        -- Column index of the current cell.
        local col = 1
        for index, cell in ipairs(row.cells) do
            if index ~= 1 then yield_cr() end
            local is_header = row_head_columns == nil or col <= row_head_columns
            local cell_tag = is_header and "th" or "td"
            tokenize_table_cell(cell, cell_tag, col_alignments[col], opts)
            col = col + cell.col_span
        end
    end
    -- [tr] and [/tr] each go on their own line.
    enclose("tr", nil, emit_cells, yield_cr)
end
-- Table: an optional bold caption line, then [table] containing the rows of
-- the head, of each body (intermediate head rows, then body rows), and of
-- the foot, in that order. Spanning cells are first broken into 1×1 cells.
function Blocks.Table(el, opts)
    el = despan_table(el)

    -- Caption, only if there is one.
    local caption = el.caption.long
    if next(caption) then
        enclose_blocks("b", nil, caption, opts)
        yield_cr()
    end

    -- The alignment is the first component of each colspec.
    local col_alignments = {}
    for i, colspec in ipairs(el.colspecs) do
        col_alignments[i] = colspec[1]
    end

    -- Emit a run of rows, with a cr before every row of the table except
    -- the very first. A nil row_head_columns marks header rows.
    local rows_emitted = 0
    local function emit_rows(rows, row_head_columns)
        for _, row in ipairs(rows) do
            if rows_emitted > 0 then yield_cr() end
            rows_emitted = rows_emitted + 1
            tokenize_table_row(row, row_head_columns, col_alignments, opts)
        end
    end

    enclose("table", nil, function ()
        emit_rows(el.head.rows, nil)
        for _, body in ipairs(el.bodies) do
            emit_rows(body.head, nil)
            emit_rows(body.body, body.row_head_columns)
        end
        emit_rows(el.foot.rows, nil)
    end, yield_cr)
end
-- Citation: only the rendered citation text is emitted.
-- You might want to set the link-citations metadata field to false
-- (`-M link-citations=false` on the command line) to avoid #ref links to
-- nowhere from appearing in citation markers.
function Inlines.Cite(el, opts)
    local rendered = el.content
    tokenize_inlines(rendered, opts)
end

-- Inline code: [tt]...[/tt].
function Inlines.Code(el, opts)
    local code = el.text
    enclose("tt", nil, function ()
        yield_text(code)
    end)
end

-- Emphasis: [i]...[/i].
function Inlines.Emph(el, opts)
    local content = el.content
    enclose("i", nil, function ()
        tokenize_inlines(content, opts)
    end)
end
-- Image: [img]src[/img], or [img=W]/[img=WxH] when integer dimensions are
-- available. The image's caption and title are ignored.
function Inlines.Image(el, opts)
    local attributes = el.attr.attributes

    -- The named attribute parsed as a base-10 integer, or nil if the
    -- attribute is absent or does not parse.
    local function int_attr(name)
        local value = attributes[name]
        return value and tonumber(value, 10)
    end

    -- A size param needs at least a width: width and height give "WxH",
    -- width alone gives "W", and height alone gives no param.
    local w, h = int_attr("width"), int_attr("height")
    local size
    if w then
        size = h and string.format("%dx%d", w, h) or string.format("%d", w)
    end

    -- With the embed_resources extension, replace the path/URL with a
    -- data URI holding the fetched contents.
    local src = el.src
    if opts.extensions:includes("embed_resources") then
        local mime_type, data = pandoc.mediabag.fetch(src)
        assert(mime_type, el.src)
        src = pandoc.mediabag.make_data_uri(mime_type, data)
    end
    enclose_text("img", size, src)
end
-- Hard line break: a literal newline in a text token, rather than a
-- collapsing "cr" token. The TASVideos forum CSS uses
-- `white-space: pre-wrap`:
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos/wwwroot/css/partials/_customizations.scss#L367
-- so every line break counts, and several in a row must be preserved.
function Inlines.LineBreak(el, opts)
    local newline = "\n"
    yield_text(newline)
end
-- Link: [url]target[/url] for a bare URL, [url=target]content[/url]
-- otherwise.
function Inlines.Link(el, opts)
    -- A bare URL link has the class "uri". Use the parameterless form only
    -- if the target and the visible text really are equal; they may differ
    -- because of percent escaping, for example.
    local is_bare_uri = el.attr.classes:includes("uri")
        and el.target == pandoc.utils.stringify(el.content)
    local param = nil
    if not is_bare_uri then
        -- The target must pass the param syntax check made when the start
        -- tag is rendered, or an error is raised there. We expect Pandoc
        -- to have hex-escaped the target so that it does.
        param = el.target
    end
    enclose_inlines("url", param, el.content, opts)
end
-- Convert a pandoc.Math element into plain Pandoc inlines, if possible. This
-- only works for math that is not too fancy. Returns the inlines if the
-- conversion is possible; otherwise returns nil. (In this respect it differs
-- from the Pandoc function texMathToInlines, which returns a fallback in case
-- of error.)
local function tex_math_to_inlines(mathtype, inp)
    -- Pandoc's texMathToInlines function is not exposed to Lua, so we reach
    -- it indirectly: the HTML writer uses it when html_math_method is
    -- "plain". We write a document holding only the Math element to HTML
    -- and read the HTML back to recover the inlines.
    -- https://github.com/jgm/pandoc/discussions/11399
    --
    -- The HTML route gives no explicit error when the conversion fails, so
    -- failure is detected beforehand with the plain writer, whose fallback
    -- rendering starts with '$'.
    local writer_opts = {html_math_method = "plain", wrap_text = "wrap-none"}
    local math_doc = pandoc.Pandoc({pandoc.Plain({pandoc.Math(mathtype, inp)})})

    local as_plain = pandoc.write(math_doc, "plain", writer_opts)
    if string.sub(as_plain, 1, 1) == "$" then
        -- Fallback rendering: the conversion failed.
        return nil
    end

    -- The plain writer succeeded; assume the HTML writer will too.
    local as_html = pandoc.write(math_doc, "html", writer_opts)
    local reparsed = pandoc.read(as_html, "html")
    return pandoc.utils.blocks_to_inlines(reparsed.blocks)
end
-- Math: rendered with plain inlines when the TeX is simple enough, and as
-- the LaTeX source in a Code/CodeBlock element otherwise. Display math is
-- set off from the surrounding text with blank lines (plain rendering) or
-- by being a code block (fallback rendering).
function Inlines.Math(el, opts)
    local inlines = tex_math_to_inlines(el.mathtype, el.text)
    local kind = el.mathtype
    if kind ~= "InlineMath" and kind ~= "DisplayMath" then
        error(kind)
    end
    local is_inline = (kind == "InlineMath")

    if inlines then
        -- Plain rendering.
        if is_inline then
            tokenize_inlines(inlines, opts)
        else
            yield_blankline()
            tokenize_blocks({pandoc.Plain(inlines)}, opts)
            yield_blankline()
        end
        return
    end

    -- Fallback: the LaTeX source, with its delimiters, as code.
    if is_inline then
        local code = pandoc.Code("$" .. el.text .. "$", {class = "latex"})
        tokenize_inlines({code}, opts)
    else
        local block = pandoc.CodeBlock("$$" .. el.text .. "$$", {class = "latex"})
        tokenize_blocks({block}, opts)
    end
end
-- Return the inlines for footnote marker number n: the number as
-- superscript. opts is accepted but not used.
local function footnote_marker(n, opts)
    local label = pandoc.Str(n)
    local raised = pandoc.Superscript(label)
    return pandoc.Inlines({raised})
end
-- Footnote: store the note body in the footnotes table, for output after
-- the document body, and emit only the numbered marker here.
function Inlines.Note(el, opts)
    local number = #footnotes + 1
    footnotes[number] = el.content
    tokenize_inlines(footnote_marker(number), opts)
end
-- Quoted text: the contents between straight quote characters. The same
-- character opens and closes the quotation. Raises an error for an unknown
-- quote type.
function Inlines.Quoted(el, opts)
    local marks = {
        SingleQuote = [[']],
        DoubleQuote = [["]],
    }
    local mark = assert(marks[el.quotetype], el.quotetype)
    yield_text(mark)
    tokenize_inlines(el.content, opts)
    yield_text(mark)
end
-- Raw inline: passed through unescaped if it is in our own format;
-- otherwise skipped with a warning.
function Inlines.RawInline(el, opts)
    if el.format ~= MY_FORMAT then
        pandoc.log.warn(string.format("not rendered: %q", tostring(el)))
        return
    end
    yield_raw_text(el.text)
end
-- Small caps: no special markup is emitted; only the contents.
function Inlines.SmallCaps(el, opts)
    local content = el.content
    tokenize_inlines(content, opts)
end

-- Soft break: a single space.
function Inlines.SoftBreak(el, opts)
    local space = " "
    yield_text(space)
end

-- Space: a literal space character, not the reflowable
-- pandoc.layout.space. Line breaks in the input are always interpreted as
-- line breaks by the parser, so long lines are retained.
function Inlines.Space(el, opts)
    local space = " "
    yield_text(space)
end

-- Span: the wrapper is not represented; only its contents are emitted.
function Inlines.Span(el, opts)
    local content = el.content
    tokenize_inlines(content, opts)
end

-- String: text, escaped as needed when rendered.
function Inlines.Str(el, opts)
    local text = el.text
    yield_text(text)
end
-- Simple inline formatting: each element's contents inside the matching
-- BBCode tag.

-- [s]...[/s]
function Inlines.Strikeout(el, opts)
    local content = el.content
    enclose("s", nil, function ()
        tokenize_inlines(content, opts)
    end)
end

-- [b]...[/b]
function Inlines.Strong(el, opts)
    local content = el.content
    enclose("b", nil, function ()
        tokenize_inlines(content, opts)
    end)
end

-- [sub]...[/sub]
function Inlines.Subscript(el, opts)
    local content = el.content
    enclose("sub", nil, function ()
        tokenize_inlines(content, opts)
    end)
end

-- [sup]...[/sup]
function Inlines.Superscript(el, opts)
    local content = el.content
    enclose("sup", nil, function ()
        tokenize_inlines(content, opts)
    end)
end

-- [u]...[/u]
function Inlines.Underline(el, opts)
    local content = el.content
    enclose("u", nil, function ()
        tokenize_inlines(content, opts)
    end)
end
-- We maintain a stack to keep track of what BBCode tags are open as we write
-- the output. The context of open tags affects how the BBCode is parsed, and
-- therefore affects how we must do escaping in what we output. The main
-- consideration is whether the element at the top of the stack (the most
-- recent start tag) permits nested child elements.
--
-- If nested child elements are permitted, then the parser will look for and
-- interpret start tags and URLs in the input, and we must escape them (using
-- [noparse]) to prevent such interpretation.
--
-- If nested child elements are not permitted, then the parser will not
-- interpret anything that looks like a start tag or a URL, but copy it
-- verbatim to the output. Therefore we must *not* escape them. Most end tags
-- are copied to the output verbatim in the same way, with the sole exception
-- of an end tag that matches the element at the top of the tag stack. That one
-- end tag is looked for so that that open element can be closed. If we are
-- asked to output text that matches such an end tag, we raise an error,
-- because we have no way to escape it.
--
-- Before https://github.com/TASVideos/tasvideos/pull/2248, end tags in
-- no-children-allowed contexts would match anywhere on the stack, not just at
-- the top. But we don't have to worry about that.
-- Tell whether child elements are allowed at the current position, given
-- the stack of open tags. The result is meant to be used as a truth value.
local function nesting_allowed(stack)
    local top = stack[#stack]
    if not top then
        -- Empty stack means top level, tags are allowed.
        return true
    end
    local info = lookup_tag(top.tag)
    local nesting = assert_not_nil(info.nesting)
    if nesting then
        return nesting
    end
    -- Not allowed in general, but perhaps allowed because the open tag
    -- has a parameter.
    return top.param ~= nil and info.nesting_with_param
end
-- Return false if a [url] tag is open anywhere on the stack, and true
-- otherwise.
local function url_autolinking(stack)
    local inside_url = false
    for _, frame in ipairs(stack) do
        if frame.tag == "url" then
            inside_url = true
            break
        end
    end
    return not inside_url
end
-- Strings to escape with [noparse]. Generally we have to escape BBCode tags
-- and URLs. In the special case of being inside a [url] tag already, we can
-- get away with escaping just BBCode tags.
-- Matches a run of one or more "[" characters, or the string "://".
local ESCAPE_BBCODE_URL_RE = re.compile([[ "["+ / "://" ]])
-- Matches only a run of one or more "[" characters.
local ESCAPE_BBCODE_RE = re.compile([[ "["+ ]])
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/BbParser.cs#L457
-- Technically the character class should also exclude \p{C} (control and
-- "other" characters).
-- A Lua pattern (not an LPeg re) matching an end tag such as "[/code]" and
-- capturing the tag name.
local CLOSING_TAG_PATTERN = "%[/([^%[%]=/]+)%]"
-- Return text in a form that is safe to output with the given stack of open
-- tags. Raises an error if text contains an end tag that cannot be escaped.
local function escape(text, stack)
    if not nesting_allowed(stack) then
        -- Nested children are not allowed here, so noparse escaping is not
        -- available. Start tags and URLs need no escaping, and end tags are
        -- safe too unless they match the tag at the top of the stack: that
        -- one would be taken by the parser as closing the open element, and
        -- there is no way to prevent it.
        local top = assert_not_nil(stack[#stack])
        for name in string.gmatch(text, CLOSING_TAG_PATTERN) do
            if name == top.tag then
                error(string.format("cannot escape [/%s] in %q", name, text))
            end
        end
        return text
    end

    -- Nested child elements are allowed here, so noparse escaping works.
    -- BBCode tags must always be escaped to keep the parser from
    -- interpreting them. URL-like strings must be escaped to keep them from
    -- being autolinked, except inside a [url] tag, where autolinking is
    -- disabled: escaping them there would do no harm, but leaving them
    -- alone makes for cleaner markup.
    local pattern = url_autolinking(stack) and ESCAPE_BBCODE_URL_RE or ESCAPE_BBCODE_RE

    -- Escaping just the "[" of a BBCode tag, like so:
    --   [noparse][[/noparse]/center]
    -- is more robust than escaping the whole tag:
    --   [noparse][/center][/noparse]
    -- because it also works for the strings "[noparse]" and "[/noparse]"
    -- themselves. Escaping every "[" is easier than matching tags
    -- precisely, at the possible cost of some harmless unnecessary
    -- escaping.
    return re.gsub(text, pattern, "[noparse]%0[/noparse]")
end
-- Check that a tag name is syntactically valid.
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/BbParser.cs#L439
-- Returns tag itself (a true value) if it is non-empty and free of the
-- characters "[", "]", "=" and "/"; returns nil otherwise.
local function tag_is_valid(tag)
    local pattern = "^[^%[%]=/]+$"
    local whole = string.match(tag, pattern)
    return whole
end
-- A BBCode parameter can contain '[' and ']' characters, but only if they are
-- in balanced pairs.
-- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/BbParser.cs#L441
-- Technically the character class should also exclude \p{C} (control and
-- "other" characters).
-- Grammar: a full match is a `seq` that extends to the end of the subject
-- (`!.`). A `seq` is any run of characters other than "[" and "]",
-- interleaved with `balanced` groups; a `balanced` group is a `seq` wrapped
-- in one "[" "]" pair. An unmatched bracket therefore makes the match fail.
local PARAM_RE = re.compile([[
full <- seq !.
seq <- ([^][] / balanced)*
balanced <- "[" seq "]"
]])
-- Return true if the whole of param matches PARAM_RE, false otherwise.
local function param_is_valid(param)
    local matched = re.match(param, PARAM_RE)
    return matched ~= nil
end
-- Render a start tag: "[tag]" when param is nil, "[tag=param]" otherwise.
-- Raises an error if the tag name or the param is not syntactically valid.
local function start_tag(tag, param)
    assert(tag_is_valid(tag), tag)
    if param == nil then
        return table.concat({"[", tag, "]"})
    end
    -- Check that param is syntactically valid.
    assert(param_is_valid(param), string.format("cannot escape param: %q", param))
    -- The parser strips one pair of quote characters that begins and ends
    -- a param, so protect such quotes with an extra pair.
    -- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/BbParser.cs#L287
    local already_quoted = string.match(param, "^\".*\"$")
    if already_quoted then
        param = "\"" .. param .. "\""
    end
    return table.concat({"[", tag, "=", param, "]"})
end
-- Render an end tag: "[/tag]". Raises an error if the tag name is not
-- syntactically valid.
local function end_tag(tag)
    assert(tag_is_valid(tag), tag)
    return table.concat({"[/", tag, "]"})
end
-- Render a single token to a string or a pandoc.layout element. stack is
-- the stack of open tags; it matters only for text tokens, whose escaping
-- depends on context. Raises an error for an unknown token type.
local function render_token(token, stack)
    local kind = token.type
    if kind == "text" then
        return escape(token.text, stack)
    end
    if kind == "raw_text" then
        return token.text
    end
    if kind == "start_tag" then
        return start_tag(token.tag, token.param)
    end
    if kind == "end_tag" then
        return end_tag(token.tag)
    end
    if kind == "cr" then
        return pandoc.layout.cr
    end
    if kind == "blankline" then
        return pandoc.layout.blankline
    end
    error(kind)
end
-- Render a sequence of tokens (given as an iterator function) into one
-- pandoc.layout document, keeping a stack of open tags along the way.
-- Raises an error on an end tag that does not match the innermost open tag,
-- or on a non-void start tag where child elements are not allowed.
local function render_tokens(tokens)
    local open_tags = {}
    local pieces = {}
    for token in tokens do
        local kind = token.type
        -- An end tag is popped before it is rendered.
        if kind == "end_tag" then
            local top = assert(table.remove(open_tags), string.format("empty stack for %q", token.tag))
            assert(token.tag == top.tag, string.format("popping %q, found %q", token.tag, top.tag))
        end
        table.insert(pieces, render_token(token, open_tags))
        -- A start tag is pushed after it is rendered. Void tags have no
        -- end tag and are never pushed.
        if kind == "start_tag" and not lookup_tag(token.tag).void then
            assert(nesting_allowed(open_tags), token.tag)
            open_tags[#open_tags + 1] = {tag = token.tag, param = token.param}
        end
    end
    return pandoc.layout.concat(pieces)
end
-- Merge adjacent text tokens and let cancel_blankline tokens cancel blankline
-- tokens.
--
-- tokens is an iterator function producing the input tokens. The output
-- tokens are yielded from the enclosing coroutine. Tokens other than text,
-- blankline, and cancel_blankline are passed through unchanged. No
-- cancel_blankline token appears in the output.
local function consolidate_tokens(tokens)
-- Texts of the current run of adjacent text tokens.
local text_buf = {}
-- Yield the buffered texts, if any, as a single text token.
local function flush_text()
if next(text_buf) then
coroutine.yield({type = "text", text = table.concat(text_buf)})
text_buf = {}
end
end
-- State of the current run of blankline/cancel_blankline tokens: how many
-- blanklines it contains, and whether it contains a cancel_blankline.
local blanklines_count = 0
local cancel_blankline = false
-- Yield the outcome of the current run, then reset the run state. A run
-- with a cancel_blankline yields a single cr if it had any blanklines, and
-- nothing otherwise. A run without one yields its blanklines unchanged.
local function flush_blanklines()
if cancel_blankline then
if blanklines_count > 0 then
yield_cr()
end
else
for i = 1, blanklines_count do
yield_blankline()
end
end
blanklines_count = 0
cancel_blankline = false
end
for token in tokens do
-- Anything but a text token ends the current run of text.
if token.type ~= "text" then
flush_text()
end
-- Anything but a blankline or cancel_blankline ends the current run of
-- blanklines.
if token.type ~= "blankline" and token.type ~= "cancel_blankline" then
flush_blanklines()
end
if token.type == "text" then
table.insert(text_buf, token.text)
elseif token.type == "blankline" then
blanklines_count = blanklines_count + 1
elseif token.type == "cancel_blankline" then
cancel_blankline = true
else
coroutine.yield(token)
end
end
-- Emit whatever is still buffered at the end of the input.
flush_text()
flush_blanklines()
end
-- Tokenize a whole document: the document's blocks, then, if any footnotes
-- were collected along the way, a horizontal rule and the footnote bodies
-- as an ordered list in a smaller font.
local function tokenize_doc(doc, opts)
    assert_not_nil(opts)
    tokenize_blocks(doc.blocks, opts)
    if next(footnotes) == nil then
        -- No footnotes, nothing more to output.
        return
    end
    yield_blankline()
    tokenize_blocks({pandoc.HorizontalRule()}, opts)
    yield_cr()
    -- The default font size is 12 = 1em:
    -- https://github.com/TASVideos/tasvideos/blob/b54ece055c14d7e0c2a2eb61603e42067ffd1912/TASVideos.ForumEngine/Node.cs#L463
    -- Take inspiration from TeX \footnotesize, which for 12pt is 10pt.
    local notes = pandoc.OrderedList(footnotes)
    enclose_blocks("size", "0.83em", {notes}, opts)
end
-- Entry point called by Pandoc: render doc to a string of TASVideos forum
-- markup. The work is a three-stage pipeline, with the first two stages
-- running as coroutines that feed the next: tokenize the document,
-- consolidate the tokens, and render them.
function Writer(doc, opts)
    -- The footnotes table lives at file level. Empty it in place, so that
    -- if Writer is called more than once in the same Lua state, notes from
    -- an earlier document are not output again and numbering restarts at 1.
    for i = #footnotes, 1, -1 do
        footnotes[i] = nil
    end
    local raw_tokens = coroutine.wrap(function () tokenize_doc(doc, opts) end)
    local tokens = coroutine.wrap(function () consolidate_tokens(raw_tokens) end)
    return render_tokens(tokens):render()
end