-- Module:User:Theknightwho/wikitext parser
-- (Wiki page chrome: "Jump to navigation", "Jump to search".)
-- Note: this module sandbox lacks a documentation subpage. You may create it.
-- Useful links: root page • root page's subpages • links • transclusions • testcases • user page • user talk page • userspace
-- This is a private module sandbox of Theknightwho, for their own experimentation.
-- Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.
local anchor_encode = mw.uri.anchorEncode
local byte = string.byte
local char = string.char
local concat = table.concat
local explode = require("Module:string utilities").explode_utf8
local format = string.format
local insert = table.insert
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local pairs = pairs
local rawset = rawset
local remove = table.remove
local rep = string.rep
local require = require
local select = select
local setmetatable = setmetatable
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ulower = string.ulower
local umatch = mw.ustring.match
local unpack = unpack
local upper = string.upper
local uupper = string.uupper
local m_parser = require("Module:parser")
local d = load_data("Module:User:Theknightwho/wikitext parser/data")
local Parser, Node = m_parser.new()
local export = {}
------------------------------------------------------------------------------------
--
-- Helper functions
--
------------------------------------------------------------------------------------
-- Like tostring, but no character escapes are applied.
local function rawstring(this)
return type(this) == "table" and this:__rawstring() or tostring(this)
end
------------------------------------------------------------------------------------
--
-- Nodes
--
------------------------------------------------------------------------------------
local Proxy = {}
function Proxy:__index(k)
return Proxy[k] or self.__chars[k]
end
function Proxy:__newindex(k, v)
local key = self.__keys[k]
if key then
self.__chars[k] = v
self.__parents[k][key] = v
elseif key == false then
error("Character is immutable.")
else
error("Invalid key.")
end
end
function Proxy:build(a, b, c)
insert(self.__chars, a)
insert(self.__parents, b)
insert(self.__keys, c)
end
function Proxy:iter(i)
i = i + 1
local char = self.__chars[i]
if char then
return i, self[i], self, self.__keys[i], self.__parents[i]
end
end
-- Creates an empty Proxy bound to this node; the caller fills it via Proxy:build.
function Node:new_proxy()
	local proxy = {
		__node = self,
		__chars = {},
		__keys = {},
		__parents = {},
	}
	return setmetatable(proxy, Proxy)
end
-- Iterates over display characters.
-- Walks the tree with the "next_display" stepper, keeps only string items, and
-- returns a generic-for triple (Proxy.iter, proxy, 0) over them.
function Node:pairs_display()
	local view = self:new_proxy()
	for item, holder, slot in self:__pairs("next_display") do
		if type(item) == "string" then
			view:build(item, holder, slot)
		end
	end
	return Proxy.iter, view, 0
end
-- Iterates over raw wikitext characters.
-- Walks the tree with the "next_raw" stepper, keeping string items and
-- "apostrophes" nodes. Items flagged as not mutable by the stepper are stored
-- with a key of false, which Proxy treats as immutable.
function Node:pairs_raw()
	local view = self:new_proxy()
	for item, holder, slot, mutable in self:__pairs("next_raw") do
		local keep = type(item) == "string" or item.type == "apostrophes"
		if keep then
			view:build(item, holder, mutable and slot or false)
		end
	end
	return Proxy.iter, view, 0
end
do
-- Constant subtracted after folding the bytes of an n-byte UTF-8 sequence
-- (six bits per step) to strip the lead/continuation marker bits.
local UTF8_OFFSETS = {0, 0x3080, 0xE2080, 0x3C82080}

-- Converts a single UTF-8 character (1-4 bytes) into a decimal numeric
-- character reference such as "&#97;". Inputs of any other byte length are
-- wrapped unchanged between "&#" and ";".
local function escape(this)
	local n = #this
	local code = this
	if n >= 1 and n <= 4 then
		code = 0
		for pos = 1, n do
			code = code * 0x40 + byte(this, pos)
		end
		code = code - UTF8_OFFSETS[n]
	end
	return "&#" .. code .. ";"
end
-- True when `this` marks a line boundary: a missing value (start of input),
-- "\n" or "\r".
local function is_newline(this)
	if not this then
		return true
	end
	return this == "\n" or this == "\r"
end
-- Decides whether the raw character `this` at proxy index `i` must be emitted
-- as a numeric character reference.
-- `key` is falsy for immutable characters, which are always returned untouched.
-- `word` is the list of word characters seen immediately before `this`;
-- `wikilink` counts consecutive "]" characters seen immediately before `this`.
local function iterate(i, this, proxy, key, word, wikilink)
	if not key then
		return this
	end
	-- Always-escaped characters, and a letter directly after "]]".
	if d.NOWIKI[this] or (wikilink == 2 and match(this, "^%a$")) then
		return escape(this)
	end
	local prev = proxy[i - 1]
	local at_line_start = is_newline(prev)
	-- Line-start markup characters, and the second of two underscores.
	if (at_line_start and d.NOWIKI_START[this]) or (prev == "_" and this == "_") then
		return escape(this)
	end
	local nxt, nxt2 = proxy[i + 1], proxy[i + 2]
	local needs_escape
	if this == ":" then
		-- "://", or ":" after an unslashed URL scheme.
		needs_escape = (nxt == "/" and nxt2 == "/") or d.EL_SCHEMES_UNSLASHED[concat(word)]
	elseif this == "-" then
		-- Four hyphens at the start of a line.
		needs_escape = at_line_start and nxt == "-" and nxt2 == "-" and proxy[i + 3] == "-"
	end
	if not needs_escape then
		-- A space separator straight after a magic-link keyword.
		needs_escape = d.SPACE_SEPARATOR[this] and d.MAGIC_LINKS[concat(word)]
	end
	if needs_escape then
		return escape(this)
	end
	return this
end
-- Serialises the node to wikitext, escaping characters via `iterate`.
-- Tracks the run of word characters ([%w_]) preceding the current position and
-- the number of consecutive "]" just emitted, both of which `iterate` consults.
function Node:__tostring()
	local out, out_len = {}, 0
	local word, word_len = {}, 0
	local closing_brackets = 0
	for i, this, proxy, key in self:pairs_raw() do
		this = iterate(i, this, proxy, key, word, closing_brackets)
		out_len = out_len + 1
		out[out_len] = this
		if match(this, "[%w_]") then
			word_len = word_len + 1
			word[word_len] = this
		else
			-- Any non-word character ends the current word: empty the buffer in place.
			while word_len > 0 do
				word[word_len] = nil
				word_len = word_len - 1
			end
		end
		-- Raw "]" only appears at the end of a bracketed external link or wikilink. If there are two in a row, then it must be the end of a wikilink.
		if this == "]" then
			closing_brackets = closing_brackets + 1
		else
			closing_brackets = 0
		end
	end
	return concat(out)
end
-- Serialises the node to wikitext with no escaping: the raw characters are
-- concatenated exactly as pairs_raw yields them.
function Node:__rawstring()
	local pieces, count = {}, 0
	for _, piece in self:pairs_raw() do
		count = count + 1
		pieces[count] = piece
	end
	return concat(pieces)
end
end
-- Wikitext: a node holding a sequence of characters and child nodes.
local Wikitext = Node:new_class("wikitext")
do
	local base_new = Wikitext.new
	-- Constructor wrapper: a string argument is first split into a table with
	-- `explode`; everything else is handed to the inherited constructor as-is.
	function Wikitext:new(t, force_wrapper)
		local tokens = t
		if type(tokens) == "string" then
			tokens = explode(tokens)
		end
		return base_new(self, tokens, force_wrapper)
	end
end
-- Stepper over the array part: returns the item after index `i`, its index,
-- and true as the third value.
function Wikitext:next_display(i)
	local nxt = i + 1
	return self[nxt], nxt, true
end
-- Raw and display traversal are the same for plain wikitext.
Wikitext.next_raw = Wikitext.next_display
Wikitext.__tostring = Node.__tostring
-- Stepper for nodes with nothing to traverse: always returns nothing, so
-- iteration ends immediately.
Node.next_no_op = function(self)
end
-- Apostrophes: a run of apostrophes acting as bold/italic markup. It exposes no
-- children to any traversal and prints as `num` apostrophes.
local Apostrophes = Node:new_class("apostrophes")
Apostrophes.next_raw = Node.next_no_op
Apostrophes.next_display = Node.next_no_op
Apostrophes.next = Node.next_no_op
function Apostrophes:__tostring()
	local count = self.num
	return rep("'", count)
end
-- ExternalLink: fields are visited in the order scheme (if present), url, display.
local ExternalLink = Node:new_class("external link")
-- Stepper over the link's fields; `i` is 0 to start, otherwise the previous key.
function ExternalLink:next(i)
	local key
	if i == 0 then
		key = self.scheme and "scheme" or "url"
	elseif i == "scheme" then
		key = "url"
	elseif i == "url" then
		key = "display"
	end
	if key == nil then
		return nil
	end
	return self[key], key
end
-- Display traversal only ever visits the "display" field, once.
-- FIXME: need to return something immutable if there's no display
function ExternalLink:next_display(i)
	if i ~= 0 then
		return
	end
	return self.display, "display"
end
-- Raw-wikitext stepper for an external link. `i` is the state returned by the
-- previous step (0 to start). Numeric states stand for literal punctuation
-- positions and string states for the field just yielded. Each step returns
-- (value, next_state, flag); the flag is false for the literal punctuation
-- emitted here and true for field values.
-- Order produced: "[" (bracketed only), scheme, ":", url, then either
-- " " + display + "]" or a bare "]" (bracketed without display).
function ExternalLink:next_raw(i)
if i == 0 then
i = 1
if self.bracketed then
return "[", i, false
end
end
-- Deliberately not an elseif: an unbracketed link falls straight through from
-- state 0 to state 1.
if i == 1 then
i = self.scheme and "scheme" or "url"
elseif i == "scheme" then
i = 2
return ":", i, false
elseif i == 2 then
i = "url"
elseif i == "url" then
if self.display then
i = 3
return " ", i, false
elseif self.bracketed then
-- A nil next-state is returned alongside the closing bracket.
return "]", nil, false
end
return nil
elseif i == 3 then
i = "display"
elseif i == "display" and self.display then
return "]", nil, false
else
return nil
end
return self[i], i, true
end
-- HTMLEntity: the array part is stepped over by raw traversal, while `char`
-- is what display traversal and stringification use.
local HTMLEntity = Node:new_class("html entity")
-- Display traversal visits only the "char" field, once.
function HTMLEntity:next_display(i)
	if i ~= 0 then
		return
	end
	return self.char, "char"
end
-- Raw traversal steps through the array part; every item is flagged false.
function HTMLEntity:next_raw(i)
	local nxt = i + 1
	return self[nxt], nxt, false
end
-- Both string forms delegate to the `char` node.
function HTMLEntity:__tostring()
	local decoded = self.char
	return decoded:__tostring()
end
function HTMLEntity:__rawstring()
	local decoded = self.char
	return decoded:__rawstring()
end
-- HTMLTag: an HTML tag node. Instances elsewhere in this file are built with a
-- `name` and an optional `end` flag. Generic traversal yields no children.
local HTMLTag = Node:new_class("html tag")
HTMLTag.next = Node.next_no_op
--[==[
function HTMLTag:__tostring()
local str = "<"
if self["end"] then
str = str .. "/"
end
str = str .. tostring(self.name)
if self.attributes then
for i = 1, #self.attributes, 2 do
str = str .. " " .. tostring(self.attributes[i]) .. "=\"" .. tostring(self.attributes[i + 1]) .. "\""
end
end
if self.self_closing then
str = str .. "/"
end
return str .. ">"
end
]==]
-- PercentEncoding: a percent-encoded sequence. Traversal is shared with
-- HTMLEntity; both string forms return the `char` field directly.
local PercentEncoding = Node:new_class("percent-encoding")
PercentEncoding.next_raw = HTMLEntity.next_raw
PercentEncoding.next_display = HTMLEntity.next_display
function PercentEncoding:__tostring()
	local decoded = self.char
	return decoded
end
PercentEncoding.__rawstring = PercentEncoding.__tostring
-- StripMarker: a strip marker node. Both traversals reuse Wikitext's steppers,
-- i.e. they walk the node's array part in order.
local StripMarker = Node:new_class("strip marker")
StripMarker.next_display = Wikitext.next_display
StripMarker.next_raw = Wikitext.next_raw
-- Wikilink: fields are visited in the order prefix (if present), title,
-- fragment (if present), display.
local Wikilink = Node:new_class("wikilink")
-- Stepper over the link's fields; `i` is 0 to start, otherwise the previous key.
function Wikilink:next(i)
	local key
	if i == 0 then
		key = self.prefix and "prefix" or "title"
	elseif i == "prefix" then
		key = "title"
	elseif i == "title" then
		key = self.fragment and "fragment" or "display"
	elseif i == "fragment" then
		key = "display"
	else
		return nil
	end
	return self[key], key
end
-- Display traversal is the same as for external links: only "display" is visited.
Wikilink.next_display = ExternalLink.next_display
-- Raw-wikitext stepper for a wikilink. `i` is the state returned by the
-- previous step (0 to start). Numeric states stand for literal punctuation
-- positions and string states for the field just yielded. Each step returns
-- (value, next_state, flag); the flag is false for literal punctuation and
-- true for field values.
-- Order produced: "[", "[", ":", prefix (if any), title, "#" + fragment (if
-- any), "|", display, "]", "]".
-- NOTE(review): the ":" at state 2 is emitted unconditionally, and a missing
-- `display` makes the final `self[i]` nil — confirm both are intended.
function Wikilink:next_raw(i)
if i == 0 or i == 1 then
i = i + 1
return "[", i, false
elseif i == 2 then
i = 3
return ":", i, false
elseif i == 3 then
i = self.prefix and "prefix" or "title"
elseif i == "prefix" then
i = "title"
elseif i == "title" then
-- Go to state 4 ("#" emitted) when there is a fragment, else state 5 ("|" emitted).
i = self.fragment and 4 or 5
return self.fragment and "#" or "|", i, false
elseif i == 4 then
i = "fragment"
elseif i == "fragment" then
i = 5
return "|", i, false
elseif i == 5 then
i = "display"
elseif i == "display" then
i = 6
return "]", i, false
elseif i == 6 then
i = nil
return "]", i, false
else
return nil
end
return self[i], i, true
end
-- Prefix: a list of prefix parts, each followed by ":" in raw wikitext.
local Prefix = Node:new_class("prefix")
-- Raw stepper: odd steps yield the next part (flag true), even steps yield the
-- ":" separator (flag false).
function Prefix:next_raw(i)
	local pos = i + 1
	if pos % 2 ~= 1 then
		return ":", pos, false
	end
	return self[(pos + 1) / 2], pos, true
end
--[==[
function Prefix:__tostring()
local output = {}
for i = 1, #self do
insert(output, tostring(self[i]))
end
return concat(output, ":") .. ":"
end
]==]
-- Category: node class for category links. No methods are defined on it here;
-- its stringifier below is currently commented out.
local Category = Node:new_class("category")
--[==[
function Category:__tostring()
local sortkey = self.sortkey and tostring(self.sortkey) or nil
return "[[Category:" .. tostring(self.title) .. (sortkey and "|" .. sortkey or "") .. "]]"
end
]==]
-- Multipart: node class for multi-part items. No methods are defined on it
-- here; its stringifier below is currently commented out.
local Multipart = Node:new_class("multipart")
--[==[
function Multipart:__tostring()
local output = {}
for i = 1, #self do
local v = self[i]
insert(output, type(v) == "table" and v:__tostring() or tostring(v))
end
return concat(output, "<span class=\"Zsym mention\" style=\"font-size:100%;\">/</span>")
end
]==]
------------------------------------------------------------------------------------
--
-- Parser
--
------------------------------------------------------------------------------------
-- Pushes a fresh, empty layer onto the parser stack and makes it current
-- (`self.n`). The layer records the current head and the given route, and has
-- Wikitext as its metatable.
function Parser:push(route)
	local depth = self.len + 1
	local layer = setmetatable({head = self.head, route = route, len = 0}, Wikitext)
	self[depth] = layer
	self.n = layer
	self.len = depth
end
-- Pushes a sublayer on top of the current layer. The current layer is turned
-- into the sublayer's metatable: reads of missing keys and writes of new keys
-- on the sublayer go to the layer, and the layer's __concat/__pairs/__tostring
-- metamethods are copied onto it so they also apply to the sublayer.
function Parser:push_sublayer(handler)
	local layer = self.n
	-- Copy the metamethods the layer currently resolves onto the layer itself.
	rawset(layer, "__concat", layer.__concat)
	rawset(layer, "__pairs", layer.__pairs)
	rawset(layer, "__tostring", layer.__tostring)
	-- Route lookups and new assignments on the sublayer to the layer.
	rawset(layer, "__index", layer)
	rawset(layer, "__newindex", layer)
	local depth = self.len + 1
	local sublayer = setmetatable({handler = handler, sublayer = true, len = 0}, layer)
	self[depth] = sublayer
	self.n = sublayer
	self.len = depth
end
------------------------------------------------------------------------------------
--
-- Italics and bold
--
------------------------------------------------------------------------------------
-- A direct copy of doQuotes in Parser.php:
-- (1) '''''...''''' is treated as <i><b>...</b></i>, but an open ''''' is implicitly closed as <b><i>...</i></b>.
-- (2) A lone '' or ''' at the end of a line is treated as <i></i> or <b></b> respectively, but a lone ''''' is completely ignored.
-- adjust_style_apostrophes is run if the number of StyleApostrophes2 and StyleApostrophes3 on the line are both odd, which converts one of the StyleApostrophes3 into an apostrophe followed by StyleApostrophes2. Parsoid uses the following priorities, and picks the first occurrence of the highest priority found:
-- (1) After a single ASCII character after a normal space (" X'''").
-- (2) After multiple non-space characters ("XXX'''") or a non-ASCII character ("É'''").
-- (3) After a space (" '''").
-- Otherwise, no adjustment.
-- If the new apostrophe is added straight after a free external link then the new apostrophe becomes part of the link, so the preceding FreeExternalLinkClose must be moved ahead of it. Note that this will cause any trailing punctuation between the old end and the apostrophe to be included as part of the link as well, since the new apostrophe means it is no longer trailing: for example, ''https://example.com/!''' goes from being https://example.com/ followed by "!" to https://example.com/!' if adjusted (but remember that only one adjustment is made, even if the adjusted sequence is repeated).
-- If the line held in `layer` has an odd number of italic markers AND an odd
-- number of bold markers, converts one three-apostrophe token into a literal
-- "'" followed by a two-apostrophe token. Candidates are ranked as described in
-- the comment block above; at most one adjustment is made.
function Parser:handle_odd_number_italics_and_bold(layer)
local italics, bold = 0, 0
for token in pairs(layer) do
if token.type == "apostrophes" then
-- A token of 5 counts towards both totals; 2 is italics only, 3 is bold only.
if token.num ~= 2 then
bold = bold + 1
end
if token.num ~= 3 then
italics = italics + 1
end
end
end
-- Nothing to do unless both counts are odd.
if italics % 2 == 0 or bold % 2 == 0 then
return
end
-- First candidate of each lower-priority kind, remembered while scanning.
local word_token, word_parent, word_key,
space_token, space_parent, space_key
for i, token, proxy, key, parent in layer:pairs_raw() do
if token.type == "apostrophes" and token.num == 3 then
if proxy[i - 1] == " " then
-- Lowest priority: directly after a space.
space_token = space_token or token
space_parent = space_parent or parent
space_key = space_key or key
elseif proxy[i - 1] and proxy[i - 2] == " " then
-- Highest priority: after a single character that follows a space.
-- Adjust immediately and stop.
token.num = 2
insert(parent, key, "'")
parent.len = parent.len + 1
return
else
-- Middle priority: any other position.
word_token = word_token or token
word_parent = word_parent or parent
word_key = word_key or key
end
end
end
if word_token then
word_token.num = 2
insert(word_parent, word_key, "'")
word_parent.len = word_parent.len + 1
-- Scan backwards from the new apostrophe over trailing-punctuation characters
-- to find what precedes them.
-- NOTE(review): this indexes `layer` with word_key, i.e. it assumes
-- word_parent is `layer` itself — confirm.
local i, prev = 0
repeat
i = i + 1
prev = layer[word_key - i]
until not (
type(prev) == "string" and
match(prev, "^[!%),%.:;%?\\]$")
)
-- Only a free (unbracketed) external link needs the further fix-up below.
if not (
type(prev) == "table" and
prev.type == "external link" and
not prev.bracketed
) then
return
end
-- i is now the number of punctuation characters between the link and the new
-- apostrophe. Move them, plus the apostrophe, onto the end of the link's URL.
i = i - 1
local token
for _ = 1, i + 1 do
token = remove(layer, word_key - i)
layer.len = layer.len - 1
insert(prev.url, token)
prev.url.len = prev.url.len + 1
end
elseif space_token then
space_token.num = 2
insert(space_parent, space_key, "'")
space_parent.len = space_parent.len + 1
end
return
end
-- Replaces every "apostrophes" token in `layer` with <i>/<b> HTMLTag nodes.
-- `state` records which tags are currently open, in opening order: nil or ""
-- (none), "i", "b", "bi" (b opened first), "ib" (i opened first), or "both"
-- (a five-apostrophe token whose order is not decided yet; its position is
-- kept in both_parent/both_key and filled in once the next token decides it).
-- Tags still open at the end of the line are closed via self:emit.
function Parser:substitute_apostrophes(layer)
local state, both_parent, both_key, final_parent, final_key
for token, parent, key in pairs(layer) do
-- final_parent/final_key survive the loop only if the very last token visited
-- was an apostrophes token.
final_parent = false
final_key = false
if token.type == "apostrophes" then
if token.num == 5 then
-- Five apostrophes: toggles both bold and italics.
if state == "b" then
parent[key] = HTMLTag:new{
name = "b",
["end"] = true
}
insert(parent, key + 1, HTMLTag:new{
name = "i",
})
parent.len = parent.len + 1
key = key + 1
state = "i"
elseif state == "i" then
parent[key] = HTMLTag:new{
name = "i",
["end"] = true
}
insert(parent, key + 1, HTMLTag:new{
name = "b",
})
parent.len = parent.len + 1
key = key + 1
state = "b"
elseif state == "bi" then
parent[key] = HTMLTag:new{
name = "i",
["end"] = true
}
insert(parent, key + 1, HTMLTag:new{
name = "b",
["end"] = true
})
parent.len = parent.len + 1
key = key + 1
state = ""
elseif state == "ib" then
parent[key] = HTMLTag:new{
name = "b",
["end"] = true
}
insert(parent, key + 1, HTMLTag:new{
name = "i",
["end"] = true
})
parent.len = parent.len + 1
key = key + 1
state = ""
elseif state == "both" then
-- Closes the pending five-apostrophe opener: this token becomes </b></i> and
-- the pending one becomes <i><b>.
parent[key] = HTMLTag:new{
name = "b",
["end"] = true
}
insert(parent, key + 1, HTMLTag:new{
name = "i",
["end"] = true
})
parent.len = parent.len + 1
key = key + 1
both_parent[both_key] = HTMLTag:new{
name = "i",
}
insert(both_parent, both_key + 1, HTMLTag:new{
name = "b",
})
both_parent.len = both_parent.len + 1
-- The insertion at the opener shifted this token when both share a parent.
if both_parent == parent then
key = key + 1
end
both_parent = nil
both_key = nil
state = ""
else
-- Nothing open: remember the position and decide the tag order later.
both_parent = parent
both_key = key
state = "both"
end
else
-- Two apostrophes toggle italics; any other count here is treated as bold.
local this = token.num == 2 and "i" or "b"
local other = this == "i" and "b" or "i"
if state == this then
parent[key] = HTMLTag:new{
name = this,
["end"] = true
}
state = ""
elseif state == other .. this then
-- `this` is the innermost open tag: just close it.
parent[key] = HTMLTag:new{
name = this,
["end"] = true
}
state = other
elseif state == this .. other then
-- `this` is the outer tag: close `other`, close `this`, then reopen `other`.
parent[key] = HTMLTag:new{
name = other,
["end"] = true
}
insert(parent, key + 1, HTMLTag:new{
name = this,
["end"] = true
})
insert(parent, key + 2, HTMLTag:new{
name = other,
})
parent.len = parent.len + 2
key = key + 2
state = other
elseif state == "both" then
-- Resolves the pending opener so that `this` is the inner tag, then closes it.
parent[key] = HTMLTag:new{
name = this,
["end"] = true
}
both_parent[both_key] = HTMLTag:new{
name = other,
}
insert(both_parent, both_key + 1, HTMLTag:new{
name = this,
})
both_parent.len = both_parent.len + 1
if both_parent == parent then
key = key + 1
end
both_parent = nil
both_key = nil
state = other
else
-- Open `this`, either on its own or nested inside an already-open `other`.
parent[key] = HTMLTag:new{
name = this,
}
state = state == other and other .. this or this
end
end
final_parent = parent
final_key = key
end
end
-- No open tags at the end of a line.
-- If the last token left an opening tag (or an unresolved five-apostrophe
-- token) as the final item, drop it and take it out of `state`.
if final_parent and (
final_parent[final_key].type == "apostrophes" or
not final_parent[final_key]["end"]
) then
final_parent[final_key] = nil
final_parent.len = final_parent.len - 1
if state == "i" or state == "b" or state == "both" then
return
end
state = sub(state, 1, 1)
end
-- Close whatever is still open, innermost first.
if state == "b" or state == "ib" then
self:emit(HTMLTag:new{
name = "b",
["end"] = true
})
end
if state == "i" or state == "bi" or state == "ib" then
self:emit(HTMLTag:new{
name = "i",
["end"] = true
})
end
if state == "bi" then
self:emit(HTMLTag:new{
name = "b",
["end"] = true
})
elseif state == "both" then
-- An unclosed five-apostrophe opener is resolved as <b><i>...</i></b>.
self:emit(HTMLTag:new{
name = "i",
["end"] = true
})
self:emit(HTMLTag:new{
name = "b",
["end"] = true
})
both_parent[both_key] = HTMLTag:new{
name = "b",
}
insert(both_parent, both_key + 1, HTMLTag:new{
name = "i",
})
both_parent.len = both_parent.len + 1
end
return
end
-- Runs the end-of-line passes on the current layer (`self.n`):
-- (1) if the line contains apostrophe markup, balances odd italic/bold counts
--     and substitutes the apostrophes with HTML tags;
-- (2) replaces HTML entity and percent-encoding nodes with their decoded
--     characters, and drops empty external link display text.
function Parser:finalize_line()
	if self.n.apos then
		self:handle_odd_number_italics_and_bold(self.n)
		self:substitute_apostrophes(self.n)
	end
	-- Conversions that need to be done after apostrophes have been processed.
	for token, parent, key in pairs(self.n) do
		local token_type = token.type
		if token_type == "html entity" then
			-- A single-item `char` is unwrapped to that item; otherwise the whole
			-- `char` node is stored.
			local decoded = token.char
			parent[key] = #decoded == 1 and decoded[1] or decoded
		elseif token_type == "percent-encoding" then
			-- Use the token's own decoded character. A bare `char` here would refer
			-- to the file-level alias of string.char, because the local declared in
			-- the branch above is not in scope in this branch.
			parent[key] = token.char
		elseif (
			token_type == "external link" and
			token.display and
			#token.display == 0
		) then
			token.display = nil
		end
	end
end
------------------------------------------------------------------------------------
--
-- Apostrophes
--
------------------------------------------------------------------------------------
do
-- Handler for a run of apostrophes. Each further "'" records the current head
-- in the layer's `apos` list. The first other character ends the run, and the
-- total (recorded entries + 1) decides the result:
--   1        not markup: the route fails
--   2, 3, 5  used as-is
--   4        one literal "'" is emitted, the rest counts as 3
--   6+       the surplus is emitted as literal "'", the rest counts as 5
local function handle_apostrophes(self, this)
	local layer = self.n
	if this == "'" then
		local seen = layer.apos or {}
		layer.apos = seen
		insert(seen, self.head)
		return
	end
	local total = layer.apos and #layer.apos + 1 or 1
	if total == 1 then
		return self:fail_route()
	end
	if total == 2 or total == 3 or total == 5 then
		layer.num = total
	elseif total == 4 then
		self:emit("'")
		layer.num = 3
	else
		for _ = 1, total - 5 do
			self:emit("'")
		end
		layer.num = 5
	end
	return self:pop()
end
-- Route entry point for apostrophe runs: installs handle_apostrophes as the
-- handler, then advances the head once before handling begins.
function Parser:do_apostrophes()
self:set("handler", handle_apostrophes)
self:advance()
end
-- Tries to parse a run of apostrophes at the current position.
-- Returns nil if the route failed; otherwise emits the tokens gathered by the
-- route into the current layer, steps the head back by one, and returns the
-- result wrapped as an Apostrophes node.
function Parser:apostrophes()
	local result = self:get("do_apostrophes")
	if result == self.n.bad_route then
		return nil
	end
	self:emit_tokens(result)
	self:advance(-1)
	return Apostrophes:new(result)
end
end
------------------------------------------------------------------------------------
--
-- Carriage return
--
------------------------------------------------------------------------------------
-- "\r" and "\r\n" are both treated as "\n".
-- First call (on "\r"): installs itself as the layer's override so that it
-- also sees the following character. Second call: clears the override, steps
-- back by one unless that character is "\n", and consumes a single "\n".
function Parser:carriage_return(this)
	if self.n.override ~= self.carriage_return then
		self.n.override = self.carriage_return
		return
	end
	self.n.override = nil
	if this ~= "\n" then
		self:advance(-1)
	end
	return self:consume("\n")
end
------------------------------------------------------------------------------------
--
-- Comment
--
------------------------------------------------------------------------------------
do
-- Handlers.
local handle_start
local traverse_comment
local handle_end
-- Matches the remainder of the comment opener "<!--" one character per call,
-- using the layer's counter `i` as the position within it. Any mismatch fails
-- the route; once the fourth character matches, scanning of the body begins.
function handle_start(self, this)
	local pos = self.n.i + 1
	self.n.i = pos
	if this ~= sub("<!--", pos, pos) then
		return self:fail_route()
	end
	if pos == 4 then
		self:advance()
		return traverse_comment(self)
	end
end
-- Skips over the comment body. A "-" hands over to handle_end to test for the
-- closer; an empty read (end of input) ends the comment by popping the layer.
function traverse_comment(self)
	while true do
		local this = self:read()
		if this == "-" then
			self.n.i = 1
			self.n.handler = handle_end
			self:advance()
			return self:traverse()
		elseif this == "" then
			return self:pop()
		end
		self:advance()
	end
end
-- Matches the remainder of the comment closer "-->" after traverse_comment has
-- seen its first "-". The layer's counter `i` is the position reached within
-- the closer. On completion the layer is popped; on a mismatch, scanning of
-- the comment body resumes after the mismatched character.
function handle_end(self, this)
	local pos = self.n.i + 1
	if this == sub("-->", pos, pos) then
		self.n.i = pos
		if pos == 3 then
			return self:pop()
		end
	elseif this ~= "-" then
		self:advance()
		return traverse_comment(self)
	end
	-- Otherwise `this` is a further "-" after "--" (only reachable at pos 3).
	-- The last two hyphens still form the start of a closer, so the counter is
	-- left at 2; this lets a run such as "--->" close the comment instead of
	-- being skipped over.
end
-- Route entry point for comments: installs handle_start, flags the layer with
-- no_magic_word, and sets the opener counter to 1 (the "<" that triggered the
-- route counts as already matched) before advancing past it.
function Parser:do_comment()
self:set("handler", handle_start)
self.n.no_magic_word = true
self.n.i = 1
self:advance()
end
-- Tries to parse a comment at the current position. A successfully parsed
-- comment produces no output; if the route fails, the current character is
-- passed to self:consume() instead.
function Parser:comment()
	local result = self:get("do_comment")
	if result ~= self.n.bad_route then
		return
	end
	return self:consume()
end
end
------------------------------------------------------------------------------------
--
-- External link
--
------------------------------------------------------------------------------------
-- Note: the Parsoid implementation of URLs is pretty crude, and doesn't respect the URL spec at https://url.spec.whatwg.org/ in many cases.
do
-- True for characters that cannot appear in an external link: the empty string
-- (end of input), U+FFFD Replacement Character, and C0 control characters
-- other than tab.
local function is_invalid(this)
	if this == "" or this == "\239\191\189" then
		return true
	end
	if this == "\t" then
		return false
	end
	return byte(this) <= 0x1F
end
-- Handlers.
local handle_bracketed_start
local handle_double_bracketed_start
local handle_bracketed_scheme
local handle_free_scheme
local handle_slashes
local handle_after_scheme
local handle_ip
local handle_decoded_ip
local handle_uri
local handle_free_uri_trail
local handle_bracketed_uri_whitespace
local handle_uri_end
local handle_bracketed_text
-- First character after the opening "[".
-- If another "[" is found, hand over to handle_double_bracketed_start, which records the position after it as wikilink_on_fail (used as the head of a wikilink if this route fails).
-- A "/" starts a slashes-only (scheme-less) URL; anything else starts a scheme, which is gathered in a sublayer. In both of those cases the current character is then re-consumed by the new handler.
function handle_bracketed_start(self, this)
	if this == "[" then
		self.n.handler = handle_double_bracketed_start
		return
	elseif this == "/" then
		self.n.handler = handle_slashes
		self.n.i = 0
	else
		self:push_sublayer(handle_bracketed_scheme)
	end
	return self:consume()
end
-- Character after "[[". Records the current head as wikilink_on_fail, fails
-- the route on a third "[", and otherwise re-consumes the character with
-- handle_bracketed_start.
function handle_double_bracketed_start(self, this)
	self.n.wikilink_on_fail = self.head
	if this ~= "[" then
		self.n.handler = handle_bracketed_start
		return self:consume()
	end
	return self:fail_route()
end
-- Gathers the scheme of a bracketed link. Scheme characters (alphanumerics,
-- "+", "-", ".") are emitted into the sublayer; ":" ends the scheme, which
-- must then be a known slashed or unslashed scheme (compared in lowercase).
-- Any other character, or an unknown scheme, fails the route.
function handle_bracketed_scheme(self, this)
	if this ~= ":" then
		if match(this, "^[%w%+%-%.]$") then
			self:emit(this)
			return
		end
		return self:fail_route()
	end
	local scheme = self:pop_sublayer()
	local normalized = lower(concat(scheme))
	if d.EL_SCHEMES_SLASHED[normalized] then
		-- Slashed schemes must be followed by "//", counted in `i`.
		self.n.handler = handle_slashes
		self.n.i = 0
	elseif d.EL_SCHEMES_UNSLASHED[normalized] then
		self.n.handler = handle_after_scheme
	else
		return self:fail_route()
	end
	self.n.scheme = Wikitext:new(scheme)
end
-- Entry handler for a free (unbracketed) link, triggered at a ":". Walks
-- backwards through the already-emitted tokens to find where the run of scheme
-- characters starts, then checks that the run begins with a letter, is not
-- itself preceded by a word character, and names a known scheme. On success
-- the scheme's start offset is stored in scheme_pos and the initial trail
-- pattern is set; otherwise the route fails.
function handle_free_scheme(self)
	local i, this, nxt = 0, ":"
	repeat
		i = i - 1
		nxt = this
		this = self:emitted(i)
	until type(this) ~= "string" or not match(this, "^[%w%+%-%.]$")
	-- `this` is now the token before the scheme run, `nxt` its first character.
	if not match(nxt, "^%a$") then -- Schemes must start with a letter.
		return self:fail_route()
	end
	if type(this) == "string" and umatch(this, "^%w$") then
		return self:fail_route()
	end
	local first = i + 1
	local scheme = self:concat(-1, first)
	local normalized = lower(scheme)
	self.n.scheme_pos = first
	self.n.pattern = "^[!%),%.:;%?\\]$"
	if d.EL_SCHEMES_SLASHED[normalized] then
		self:push_sublayer(handle_slashes)
		self.n.i = 0
	elseif d.EL_SCHEMES_UNSLASHED[normalized] then
		self:push_sublayer(handle_after_scheme)
	else
		return self:fail_route()
	end
	self.n.scheme = Wikitext:new(scheme)
end
-- Requires two consecutive "/" characters, emitting each and counting them in
-- the layer's `i`. Anything else fails the route; after the second slash,
-- handling moves on to handle_after_scheme.
function handle_slashes(self, this)
	if this ~= "/" then
		return self:fail_route()
	end
	self:emit(this)
	local count = self.n.i + 1
	self.n.i = count
	if count == 2 then
		self.n.handler = handle_after_scheme
	end
end
-- First character after the scheme (and slashes, where applicable). Always
-- opens the URI sublayer, then checks whether the URI starts with an IP
-- address in square brackets, given either raw ("[") or percent-encoded.
-- Parsoid bugs:
-- (1) Entities for "[" (e.g. &#91;) aren't treated as the start of an IP address.
-- (2) Only "%5B" is converted to "[", not "%5b".
function handle_after_scheme(self, this)
self:push_sublayer(handle_uri)
if this == "%" then
-- Falls back to a literal "%" if no percent-encoding could be parsed.
this = self:percent_encoding() or "%"
if rawstring(this) == "[" and this.code == "%5B" then
self:emit(this)
self:push_sublayer(handle_decoded_ip)
return
end
-- NOTE(review): presumably moves the head to where the percent-encoding was
-- parsed from before re-consuming; when `this` is the plain "%" string,
-- `this.head` is nil and the head is left unchanged — confirm.
self.head = this and this.head or self.head
elseif this == "[" then
self:emit("[")
self:push_sublayer(handle_ip)
return
end
return self:consume()
end
-- IP URLs starting with "[" must have a matching "]". Fails if a non-IP character is found, since "[" is otherwise invalid.
-- IP characters (hex digits, "." and ":") are gathered in a sublayer. On "]"
-- an empty address fails the route; otherwise the gathered characters and the
-- "]" are emitted, the layer is flagged as an IP URL, and handle_uri takes over.
function handle_ip(self, this)
	if this ~= "]" then
		if match(this, "^[%x%.:]$") then
			self:emit(this)
			return
		end
		return self:fail_route()
	end
	if #self.n == 0 then
		return self:fail_route()
	end
	self:emit_tokens(self:pop_sublayer())
	self:emit("]")
	self.n.ip = true
	self.n.handler = handle_uri
end
-- IP URLs starting with "%5B" must have a matching "%5D". If a non-IP character is found, "[" is converted back to "%5B".
-- Parsoid bug: Only "%5D" is converted to "]", not "%5d".
function handle_decoded_ip(self, this)
if this == "%" then
-- Falls back to a literal "%" if no percent-encoding could be parsed.
this = self:percent_encoding() or "%"
if (
#self.n > 0 and
rawstring(this) == "]" and
this.code == "%5D"
) then
-- Non-empty address closed by "%5D": emit the gathered characters and the
-- closing token, and flag the layer as an IP URL.
self:emit_tokens(self:pop_sublayer())
self:emit(this)
self.n.ip = true
return
end
self.head = this and this.head or self.head
elseif match(this, "^[%x%.:]$") then
self:emit(this)
return
end
-- Not an IP address after all: flush what was gathered, then search backwards
-- for the decoded "[" and rewrite it as the literal characters "%", "5", "B".
self:emit_tokens(self:pop_sublayer())
local i = 0
repeat
i = i - 1
this = self:emitted(i)
until rawstring(this) == "["
self:replace(i, "%")
self:emit(i + 1, "5")
self:emit(i + 1, "B")
return self:consume()
end
-- Main handler for the body of a URI, for both bracketed and free links.
-- Note: Some valid wikitext characters which are invalid in URLs resolve to percent-encoding.
-- Parsoid bugs:
-- (1) In bracketed links, "<", ">" (and corresponding entities &lt; and &gt;) end the URI and start the text even if they come straight after the scheme, resulting in invalid targets like "https://".
-- (2) In free links, the entities for "<", ">" and the non-breaking space are supposed to end the URI, but Parsoid doesn't account for &lt; &gt; and &nbsp;.
-- NOTE(review): the entity comparisons below use the raw entity text ("&lt;", "&gt;", "&nbsp;") in `this.code`, by analogy with percent-encodings whose code is "%5B". The previous literals ("<", ">", " ") contradicted the notes above and looked like entity-decoding damage — confirm against the html_entity implementation.
function handle_uri(self, this)
	-- If trail characters are pending, this is the length the layer must reach
	-- for the trail to be confirmed as part of the URI.
	local trail_pos = self.n.trail and #self.n + 1
	if this == "&" then
		this = self:html_entity()
		if not this then
			self:emit("&")
		elseif not this.char then
			this.char = Wikitext:new("\239\191\189") -- U+FFFD Replacement Character
			self:emit(this)
		else
			local decoded = rawstring(this)
			if decoded == " " then
				self:emit("+")
			elseif self.n.bracketed and (
				this.code == "&lt;" or
				this.code == "&gt;"
			) or not self.n.bracketed and (
				decoded == "<" and this.code ~= "&lt;" or
				decoded == ">" and this.code ~= "&gt;" or
				decoded == "\194\160" and this.code ~= "&nbsp;"
			) then
				self.head = this.head
				return handle_uri_end(self, trail_pos)
			elseif match(decoded, "^[\t\n\"<>%[%]|]$") then
				-- Characters that are invalid in a URL become percent-encoded.
				this = format("%02X", byte(decoded))
				self:emit("%")
				self:emit(sub(this, 1, 1))
				self:emit(sub(this, 2, 2))
			else
				self:emit(this)
			end
		end
	elseif this == "'" then
		-- Apostrophe markup ends the URI; a lone apostrophe is part of it.
		this = self:apostrophes()
		if this then
			self.n.apos = true
			self.head = this.head
			return handle_uri_end(self, trail_pos)
		end
		self:emit("'")
	elseif this == "]" then
		return handle_uri_end(self, trail_pos, true)
	elseif not self.n.bracketed and this == "(" then -- Remove ")" from the trail pattern.
		self.n.pattern = "^[!,%.:;%?\\]$"
		self:emit("(")
	elseif this == "|" then
		self:emit("%")
		self:emit("7")
		self:emit("C")
	elseif this == "\127" then
		this = self:strip_marker()
		if this then
			self.head = this.head
			return handle_uri_end(self, trail_pos)
		end
		self:emit("?")
	elseif not self.n.bracketed and match(this, self.n.pattern) then
		-- Possible trailing punctuation: gather it separately until we know
		-- whether the URI continues after it.
		self:push_sublayer(handle_free_uri_trail)
		self.n.trail_head = self.head
		return self:consume()
	elseif d.SPACE_SEPARATOR[this] then
		if not self.n.bracketed then
			return handle_uri_end(self, trail_pos)
		end
		self.n.handler = handle_bracketed_uri_whitespace
	elseif match(this, "^[\"<>%[]$") then
		return handle_uri_end(self, trail_pos)
	elseif is_invalid(this) then
		if self.n.bracketed then
			return self:fail_route()
		end
		return handle_uri_end(self, trail_pos)
	elseif #this > 1 and (
		d.IGNORED_IN_URI[this] or
		match(this, "^\243\160[\128-\191][\128-\191]$") -- U+E0000–E0FFF
	) then
		-- Ignored characters are dropped without ending the URI.
		return
	else
		self:emit(this)
	end
	-- Something was emitted after the pending trail, so the trail belongs to the
	-- URI: splice it in at its original position.
	if self.n.trail and #self.n >= trail_pos then
		self:emit_tokens(trail_pos, self.n.trail)
		self.n.trail = nil
	end
end
-- Gather any trail characters and save them. Later, they will be added to the URI if we know that the end doesn't come straight after them. If it does, they'll be discarded and the head set to the start of the trail. Note: Parsoid never adds decoded entities to the trail.
-- Parsoid bug: If "(" is given as an entity, it does not cause ")" to be excluded from the trail characters.
-- The first non-trail character pops the sublayer, stores its contents as the
-- layer's `trail`, and is then re-consumed by the restored handler.
function handle_free_uri_trail(self, this)
	if match(this, self.n.pattern) then
		self:emit(this)
		return
	end
	local gathered = self:pop_sublayer()
	self.n.trail = gathered
	return self:consume()
end
-- Skips the run of space separators between a bracketed link's URI and its
-- text; the first other character ends the URI.
function handle_bracketed_uri_whitespace(self, this)
	if d.SPACE_SEPARATOR[this] then
		return
	end
	return handle_uri_end(self)
end
-- Finishes the URI. `trail_pos` is the value computed at the top of handle_uri
-- (nil when no trail is pending); `force_pop` is true when the URI was ended by
-- "]", in which case a bracketed link has no display text.
-- Fail if end comes straight after the scheme (+ slashes where applicable). If wikilink_on_fail is set (e.g. [[https://]]), then reset it to nil, since Parsoid won't parse it as a wikilink either (even though "https://" is a valid title!).
-- For free links, we also still need to determine if the trail needs to be added (e.g. there may be excess apostrophes after it).
-- IP square brackets use percent-encoding if the URI continues after "]", even if entered as raw characters.
function handle_uri_end(self, trail_pos, force_pop)
if #self.n == 0 then
self.n.wikilink_on_fail = nil
return self:fail_route()
elseif self.n.trail then
if #self.n >= trail_pos then
-- Something followed the trail, so it is part of the URI.
self:emit_tokens(trail_pos, self.n.trail)
else
-- The URI ends straight after the trail: discard it and rewind the head to
-- where the trail started.
self.head = self.n.trail_head
end
end
-- Merge the URI sublayer into the layer below it.
self:emit_tokens(self:pop_sublayer())
if self.n.ip and rawstring(self:emitted()) ~= "]" then
-- The URI continues after the IP's "]": walk backwards, rewriting "]" as
-- "%5D" and then "[" as "%5B", stopping at the "[".
local i, this = 0
repeat
i = i - 1
this = self:emitted(i)
if rawstring(this) == "]" then
self:replace(i, "%")
self:emit(i + 1, "5")
self:emit(i + 1, "D")
elseif rawstring(this) == "[" then
self:replace(i, "%")
self:emit(i + 1, "5")
self:emit(i + 1, "B")
break
end
until not this
end
-- A second sublayer is popped here; its contents become the link's URL.
local url = Wikitext:new(self:pop_sublayer())
self.n.url = url
if not self.n.bracketed or force_pop then
return self:pop()
end
-- Bracketed link with more to come: gather the display text next.
self:push_sublayer(handle_bracketed_text)
return self:consume()
end
-- Gathers the display text of a bracketed link in a sublayer. "]" closes the
-- link: the gathered text becomes `display` and the layer is popped. Entities,
-- apostrophe markup, HTML tags and strip markers are parsed as nested items;
-- an invalid character fails the route.
function handle_bracketed_text(self, this)
	if this == "]" then
		local raw_display = self:pop_sublayer()
		self.n.display = Wikitext:new(raw_display)
		return self:pop()
	elseif this == "<" then
		self:html_tag()
	elseif this == "&" then
		self:emit(self:html_entity() or "&")
	elseif this == "'" then
		local markup = self:apostrophes()
		-- Flag the layer once any apostrophe markup has been seen.
		self.n.apos = self.n.apos or markup and true or nil
		self:emit(markup or "'")
	elseif this == "\127" then
		self:emit(self:strip_marker() or "?")
	elseif is_invalid(this) then
		return self:fail_route()
	else
		self:emit(this)
	end
end
-- Route entry point for bracketed external links: installs
-- handle_bracketed_start, flags the layer as bracketed, advances past the
-- opening "[", and opens a handler-less sublayer (handle_uri_end later turns
-- the contents of a popped sublayer into the link's URL).
function Parser:do_bracketed_external_link()
self:set("handler", handle_bracketed_start)
self.n.bracketed = true
self:advance()
self:push_sublayer()
end
-- Tries to parse a bracketed external link at the current position.
-- On failure: parses a wikilink from wikilink_on_fail if that was recorded,
-- otherwise emits a literal "[".
-- On success: emits the link, preceded by a literal "[" if a second opening
-- bracket had been seen.
function Parser:bracketed_external_link()
	local link = self:get("do_bracketed_external_link")
	local wikilink_head = link.wikilink_on_fail
	if link ~= self.n.bad_route then
		if wikilink_head then
			self:emit("[")
		end
		self:emit(ExternalLink:new(link))
	elseif wikilink_head then
		self:wikilink(wikilink_head)
	else
		self:emit("[")
	end
end
-- Route entry point for free external links: installs handle_free_scheme as
-- the handler. The head is not advanced here.
function Parser:do_free_external_link()
self:set("handler", handle_free_scheme)
end
-- Tries to parse a free external link at a ":".
-- On failure the ":" is emitted, as a DescriptionListSeparator if the layer's
-- `dl` flag is set (which is then cleared), otherwise literally.
-- On success the already-emitted scheme characters are removed, the link is
-- emitted in their place, and the head is stepped back by one.
function Parser:free_external_link()
	local link = self:get("do_free_external_link")
	if link ~= self.n.bad_route then
		-- Account for already-emitted scheme.
		for _ = -1, link.scheme_pos, -1 do
			self:remove()
		end
		self:emit(ExternalLink:new(link))
		self:advance(-1)
	elseif self.n.dl then
		self:emit(self.DescriptionListSeparator)
		self.n.dl = nil
	else
		self:emit(":")
	end
end
end
------------------------------------------------------------------------------------
--
-- Heading
--
------------------------------------------------------------------------------------
do
-- Handlers.
local handle_start
local handle_start_whitespace
local handle_start_excess
local handle_only_equals_signs
local handle_body
local handle_body_whitespace
local handle_end
local handle_end_whitespace
-- Counts the opening "=" signs of a heading in the layer's `eq`. Whitespace
-- moves on to handle_start_whitespace; a newline or end of input means the
-- line consists only of "=" signs; anything else starts the heading body and
-- is re-consumed by handle_body.
function handle_start(self, this)
	if this == "=" then
		self.n.eq = self.n.eq + 1
	elseif this == " " or this == "\t" then
		self.n.handler = handle_start_whitespace
	elseif this == "\n" or this == "" then
		return handle_only_equals_signs(self)
	else
		handle_start_excess(self)
		self.n.handler = handle_body
		return self:consume()
	end
end
-- Skips spaces and tabs after the opening "=" signs. A newline or end of input
-- means the line held only "=" signs (plus whitespace); any other character
-- starts the heading body and is re-consumed by handle_body.
function handle_start_whitespace(self, this)
	if this == " " or this == "\t" then
		return
	end
	if this == "\n" or this == "" then
		return handle_only_equals_signs(self)
	end
	handle_start_excess(self)
	self.n.handler = handle_body
	return self:consume()
end
-- Emit any excess = signs once we know it's a conventional heading. Up till now, we couldn't know if the heading is just a string of = signs (e.g. ========), so it wasn't guaranteed that the heading text starts after the 6th.
-- Every "=" beyond the sixth is emitted as heading text, and `eq` is capped at 6.
function handle_start_excess(self)
	local excess = self.n.eq - 6
	if excess <= 0 then
		return
	end
	for _ = 1, excess do
		self:emit("=")
	end
	self.n.eq = 6
end
-- Handles a line consisting solely of = signs.
-- ===== is "=" as an L2; ======== is "==" as an L3 etc.
-- Fewer than three signs cannot form a heading, so the route fails.
function handle_only_equals_signs(self)
	local total = self.n.eq
	if total < 3 then
		return self:fail_route()
	end
	-- At least one sign must remain as text; the rest are paired off (largest
	-- even number below the total), capped at 12 (six on each side).
	local used = total - 1
	used = used - used % 2
	if used > 12 then
		used = 12
	end
	-- Whatever isn't part of the delimiters is the heading text.
	for _ = 1, total - used do
		self:emit("=")
	end
	self.n.level = used / 2
	return self:pop()
end
-- Heading handler for the heading text itself.
-- On "=", the "do_heading_end" route is tried; it yields the number of closing = signs if the rest of the line is a valid heading end (see handle_end). The heading level is the smaller of the opening and closing counts, and the surplus signs on the longer side are emitted as text.
-- A newline or end of input before a valid end fails the route. Other characters are dispatched to the relevant inline parsers or emitted as-is.
function handle_body(self, this)
if this == "=" then
local end_eq = self:get("do_heading_end")
if end_eq == self.n.bad_route then -- = signs are just part of the heading.
-- Skip over the tokens collected by the failed route and emit them as text.
self:advance(#self.n.bad_route)
self:emit_tokens(self.n.bad_route)
return self:consume()
elseif end_eq > self.n.eq then
-- More closing signs than opening ones: the surplus is appended as text.
for _ = 1, end_eq - self.n.eq do
self:emit("=")
end
self.n.level = self.n.eq
return self:pop()
end
-- More opening signs than closing ones (or equal): the surplus is inserted at position 1.
for _ = 1, self.n.eq - end_eq do
self:emit(1, "=")
end
-- Remove already-emitted whitespace before end.
local this = self:emitted()
while this == " " or this == "\t" do
self:remove()
this = self:emitted()
end
self.n.level = end_eq
return self:pop()
elseif this == " " or this == "\t" then
-- Emit this whitespace character; handle_body_whitespace then skips any that directly follow.
self:emit(this)
self.n.override = handle_body_whitespace
elseif this == "\n" or this == "" then
return self:fail_route()
elseif this == "&" then
self:emit(self:html_entity() or "&")
elseif this == "'" then
this = self:apostrophes()
-- Record (stickily) that apostrophe markup was found on this layer.
self.n.apos = self.n.apos or this and true or nil
self:emit(this or "'")
elseif this == ":" then
self:free_external_link()
elseif this == "<" then
self:html_tag()
elseif this == "I" or this == "P" or this == "R" then
-- Possible start of ISBN/PMID/RFC.
self:magic_link(this)
elseif this == "[" then
self:bracketed_external_link()
elseif this == "\127" then
self:emit(self:strip_marker() or "?")
else
self:emit(this)
end
end
-- Override handler installed by handle_body after it emits a space or tab:
-- skips any further spaces/tabs, then removes itself and re-consumes the first
-- other character with the regular handler.
function handle_body_whitespace(self, this)
	if this == " " or this == "\t" then
		return
	end
	self.n.override = nil
	return self:consume()
end
-- Handler for a candidate heading end: a run of "=" signs, optionally followed
-- by spaces/tabs, up to a newline or end of input. On success, returns the
-- number of tokens in the popped layer (one per "=" emitted here).
function handle_end(self, this)
	if this == "=" then
		self:emit("=")
		return
	end
	if this == "\n" or this == "" then
		return #self:pop()
	end
	if this ~= " " and this ~= "\t" then
		return self:fail_route()
	end
	self.n.handler = handle_end_whitespace
end
-- Handler for trailing spaces/tabs after the closing "=" signs of a heading.
-- Only whitespace may precede the newline or end of input; anything else fails
-- the route. On success, returns the length of the popped layer.
function handle_end_whitespace(self, this)
	if this == " " or this == "\t" then
		return
	end
	if this == "\n" or this == "" then
		return #self:pop()
	end
	return self:fail_route()
end
-- Route entry point for a heading: installs handle_start, counts the "=" at the current position (eq = 1) and advances past it.
function Parser:do_heading()
self:set("handler", handle_start)
self.n.eq = 1
self:advance()
end
-- Route entry point for a candidate heading end: installs handle_end. The head is not advanced here, so handle_end sees the current character first.
function Parser:do_heading_end()
self:set("handler", handle_end)
end
-- Attempts the "do_heading" route. On success, emits an <hN> tag wrapping a
-- <span class="mw-headline" id="..."> around the heading tokens, where the id is
-- the anchor-encoded string form of the heading. In all cases the head is moved
-- back by one afterwards.
function Parser:heading()
	local heading = self:get("do_heading")
	if heading == self.n.bad_route then
		self:advance(-1)
		return
	end
	local h_open = HTMLTag:new{
		name = Wikitext:new("h" .. heading.level)
	}
	self:emit(h_open)
	local span_open = HTMLTag:new{
		name = Wikitext:new("span"),
		attributes = {
			Wikitext:new("class"),
			Wikitext:new("mw-headline"),
			Wikitext:new("id"),
			export.parse_nowiki(anchor_encode(tostring(heading)))
		}
	}
	self:emit(span_open)
	self:emit_tokens(heading)
	local span_close = HTMLTag:new{
		name = Wikitext:new("span"),
		["end"] = true
	}
	self:emit(span_close)
	local h_close = HTMLTag:new{
		name = Wikitext:new("h" .. heading.level),
		["end"] = true
	}
	self:emit(h_close)
	self:advance(-1)
end
end
------------------------------------------------------------------------------------
--
-- Horizontal rule
--
------------------------------------------------------------------------------------
do
	-- Counts consecutive "-" characters in self.n.i. The first other character
	-- ends the run: with four or more hyphens the layer is popped and true is
	-- returned; with fewer the route fails.
	local function handle_horizontal_rule(self, this)
		if this == "-" then
			self.n.i = self.n.i + 1
			return
		end
		if self.n.i < 4 then
			return self:fail_route()
		end
		self:pop()
		return true
	end

	-- Route entry point: the "-" at the current position counts as the first
	-- hyphen, and the head is advanced past it.
	function Parser:do_horizontal_rule()
		self:set("handler", handle_horizontal_rule)
		self.n.i = 1
		self:advance()
	end

	-- Attempts the horizontal rule route, emitting a self-closing <hr> tag on
	-- success. The head is moved back by one either way.
	function Parser:horizontal_rule()
		local result = self:get("do_horizontal_rule")
		if result ~= self.n.bad_route then
			local tag = HTMLTag:new{
				name = Wikitext:new("hr"),
				self_closing = true
			}
			self:emit(tag)
		end
		self:advance(-1)
	end
end
------------------------------------------------------------------------------------
--
-- HTML entity
--
------------------------------------------------------------------------------------
-- Parsoid regex: &([A-Za-z0-9\x80-\xff]+;)|&\#([0-9]+)|&\#[xX]([0-9A-Fa-f]+)|(&)
-- If the route decodes to an invalid entity (e.g. &#xD800;), then the route still succeeds, but the output is the original wikitext. This matches Parsoid, which processes such entities but makes the output the same as the input string. This means that inputs such as [[&#xD800;]] are treated as attempted links to pages with an HTML entity in the title (invalid), and not as a link to "&" with the fragment "xD800;".
-- Characters which are never valid in HTML entities. Note that non-ASCII characters are treated as valid in entity names by the Parsoid regex, since it supports some nonstandard entities that use them.
do
-- Returns true if `this` can never be part of an HTML entity: the empty string
-- (end of input), or a single byte that is not alphanumeric. Multi-byte strings
-- (i.e. non-ASCII characters) are never treated as invalid.
local function is_invalid(this)
	if this == "" then
		return true
	end
	if #this ~= 1 then
		return false
	end
	return match(this, "^%w$") == nil
end
-- Converts a codepoint to the equivalent UTF-8 character. Characters which aren't decoded by Parsoid return nil.
-- `cp` must be a non-negative integer-valued number.
-- The byte values are derived with exact integer arithmetic (subtracting the
-- remainder before dividing), so that string.char is never handed a fractional
-- number: how fractions are converted to bytes varies between Lua builds (some
-- round rather than truncate, and newer versions raise an error).
local function utf8_char(cp)
	if (
		cp <= 0x08 or
		cp >= 0x0B and cp <= 0x1F or
		cp >= 0x7F and cp <= 0x9F or
		cp >= 0xD800 and cp <= 0xDFFF or
		cp == 0xFFFE or cp == 0xFFFF or
		cp > 0x10FFFF
	) then
		return nil
	elseif cp < 0x80 then
		return char(cp)
	end
	-- Peel off six bits at a time, lowest first.
	local b4 = cp % 0x40
	cp = (cp - b4) / 0x40
	if cp < 0x20 then -- Original codepoint below 0x800: two bytes.
		return char(0xC0 + cp, 0x80 + b4)
	end
	local b3 = cp % 0x40
	cp = (cp - b3) / 0x40
	if cp < 0x10 then -- Original codepoint below 0x10000: three bytes.
		return char(0xE0 + cp, 0x80 + b3, 0x80 + b4)
	end
	local b2 = cp % 0x40
	cp = (cp - b2) / 0x40
	return char(0xF0 + cp, 0x80 + b2, 0x80 + b3, 0x80 + b4)
end
-- Handlers.
local handle_start
local handle_numeric
local handle_numeric_code
local handle_dec_code
local handle_hex_code
local handle_named
local handle_percent_encoding
-- HTML entity handler for the first character after "&".
-- "#" starts a numeric entity; "%" is passed to the percent-encoding handler;
-- any character that can't occur in an entity fails the route; everything else
-- starts a named entity.
function handle_start(self, this)
	if this == "%" then
		return handle_percent_encoding(self)
	end
	local next_handler
	if this == "#" then
		next_handler = handle_numeric
	elseif is_invalid(this) then
		return self:fail_route()
	else
		next_handler = handle_named
	end
	self:emit(this)
	self.n.handler = next_handler
end
-- HTML entity handler for the character after "&#".
-- "x"/"X" selects the hexadecimal handler and a decimal digit selects the
-- decimal one; "%" is passed to the percent-encoding handler; anything else
-- fails the route.
function handle_numeric(self, this)
	if this == "%" then
		return handle_percent_encoding(self)
	end
	local next_handler
	if this == "X" or this == "x" then
		next_handler = handle_hex_code
	elseif match(this, "^%d$") then
		next_handler = handle_dec_code
	else
		return self:fail_route()
	end
	self:emit(this)
	self.n.handler = next_handler
end
-- Shared implementation of the decimal and hexadecimal numeric entity handlers.
-- `pattern` matches a single valid digit, `start` is the index of the first
-- digit among the layer's tokens, and `base` is passed to tonumber (nil for
-- decimal).
-- On ";", the digits are converted to a character: if there are no digits at
-- all (e.g. "&#x;") the route simply fails, since that isn't an entity; if the
-- codepoint is one that doesn't get decoded, the route fails with `no_char` set
-- so that the caller can tell the two cases apart.
function handle_numeric_code(self, this, pattern, start, base)
	if this == "%" then
		return handle_percent_encoding(self)
	elseif this == ";" then
		local cp = tonumber(self:concat(start), base)
		if not cp then
			-- Nothing to convert; passing nil on to utf8_char would raise an error.
			return self:fail_route()
		end
		local decoded = utf8_char(cp)
		if not decoded then
			self.n.no_char = true
			return self:fail_route()
		end
		self:emit(";")
		self.n.char = Wikitext:new(decoded)
		return self:pop()
	elseif not match(this, pattern) then
		return self:fail_route()
	end
	self:emit(this)
end
-- Decimal numeric entity handler: digits match %d and start at token 3 (after "&" and "#"); no base is passed, so tonumber uses its default.
function handle_dec_code(self, this)
return handle_numeric_code(self, this, "^%d$", 3)
end
-- Hexadecimal numeric entity handler: digits match %x and start at token 4 (after "&", "#" and "x"/"X"); converted in base 16.
function handle_hex_code(self, this)
return handle_numeric_code(self, this, "^%x$", 4, 16)
end
-- HTML entity handler for named entities.
-- Name characters are accumulated until ";", at which point the name (tokens
-- from index 2, i.e. after "&") is looked up in Module:data/entities. An unknown
-- name fails the route with `no_char` set; a character that can't occur in an
-- entity fails it without the flag.
function handle_named(self, this)
	if this == "%" then
		return handle_percent_encoding(self)
	end
	if this ~= ";" then
		if is_invalid(this) then
			return self:fail_route()
		end
		self:emit(this)
		return
	end
	local decoded = load_data("Module:data/entities")[self:concat(2)]
	if not decoded then
		self.n.no_char = true
		return self:fail_route()
	end
	self:emit(";")
	self.n.char = Wikitext:new(decoded)
	return self:pop()
end
-- Handles a "%" found inside a candidate HTML entity.
-- Only applies when the layer's `decode_percent` flag is set. The
-- percent-encoding is decoded and the resulting raw string is consumed in its
-- place; if the result is just "%" (nothing decoded, or a literal "%", which is
-- rejected to avoid double-decoding), the route fails.
function handle_percent_encoding(self)
	if self.n.decode_percent then
		local decoded = rawstring(self:percent_encoding() or "%")
		if decoded ~= "%" then
			return self:consume(decoded)
		end
	end
	return self:fail_route()
end
-- `decode_percent` denotes underlying contexts in which percent-decoding should be attempted, since Parsoid decodes percent-encoding then HTML entities in that order (e.g. "%26%79%65%6E%3B" → "&yen;" → "¥").
-- Route entry point for an HTML entity: installs handle_start, stores the `decode_percent` flag on the layer, emits the leading "&" as the first token and advances past it.
function Parser:do_html_entity(decode_percent)
self:set("handler", handle_start)
self.n.decode_percent = decode_percent
self:emit("&")
self:advance()
end
-- Returns nil if the parse fails (e.g. "&exam ple;" or "&#123x;"), and false if no character can be decoded (e.g. "&notvalid;" is not associated with any character, "&#xD800;" is a codepoint that doesn't get resolved, and "&#x110000;" is a codepoint that's too high). This is because the second type will cause wikilinks to fail, whereas the first will not.
-- Attempts the "do_html_entity" route.
-- Success: stores the concatenated tokens as `code` and returns an HTMLEntity.
-- Failure: returns false if the failed layer was flagged `no_char`, else nil.
function Parser:html_entity(decode_percent)
	local entity = self:get("do_html_entity", decode_percent)
	if entity ~= self.n.bad_route then
		entity.code = concat(entity)
		return HTMLEntity:new(entity)
	elseif self.n.bad_route.no_char then
		return false
	end
	return nil
end
end
------------------------------------------------------------------------------------
--
-- HTML tag
--
------------------------------------------------------------------------------------
do
-- HTML whitespace.
-- Returns true for space, tab, newline and form feed; false for anything else.
local function is_space(this)
	return this == " " or this == "\t" or this == "\n" or this == "\f"
end
-- Handlers.
local handle_start
local handle_open_tag_name
local handle_before_attribute_name
local handle_attribute_name
local handle_after_attribute_name
local handle_before_attribute_value
local handle_quoted_attribute_value
local handle_unquoted_attribute_value
local handle_self_closing_tag
local handle_end_tag_start
local handle_end_tag_name
local handle_end_tag_remainder
-- HTML tag handler for the first character after "<".
-- "/" begins an end tag and an ASCII letter begins a tag name (stored
-- lowercased); both are collected in a sublayer. Anything else fails the route.
function handle_start(self, this)
	if this == "/" then
		self:push_sublayer(handle_end_tag_start)
		return
	end
	if not match(this, "^%a$") then
		return self:fail_route()
	end
	self:push_sublayer(handle_open_tag_name)
	self:emit(lower(this))
end
-- HTML tag handler for the name of an opening tag.
-- "/", ">" and whitespace end the name, which is popped from its sublayer and
-- stored as `name`; what follows depends on the terminator. End of input fails
-- the route. Other characters are added to the name, with ASCII uppercase
-- letters lowercased.
function handle_open_tag_name(self, this)
	if this == "" then
		return self:fail_route()
	end
	if this == "/" or this == ">" or is_space(this) then
		-- The sublayer must be popped before self.n is indexed for the assignment.
		local name = Wikitext:new(self:pop_sublayer())
		self.n.name = name
		if this == "/" then
			self.n.handler = handle_self_closing_tag
		elseif this == ">" then
			return self:pop()
		else
			self:push_sublayer(handle_before_attribute_name)
		end
		return
	end
	if match(this, "^%u$") then
		self:emit(lower(this))
	else
		self:emit(this)
	end
end
-- HTML tag handler for the gap before an attribute name; whitespace is skipped.
function handle_before_attribute_name(self, this)
if this == "/" then
self.n.handler = handle_self_closing_tag
elseif this == "=" then
-- A "=" in this position becomes the first character of the attribute name.
self:push_sublayer(handle_attribute_name)
self:emit("=")
elseif this == ">" then
-- End of tag: the attributes collected so far are only stored if there are any.
local attributes = self:pop_sublayer()
if #attributes > 0 then
self.n.attributes = attributes
end
return self:pop()
elseif this == "" then
return self:fail_route()
elseif not is_space(this) then
-- Start of an attribute name: re-consume the character with handle_attribute_name.
self:push_sublayer(handle_attribute_name)
return self:consume()
end
end
-- HTML tag handler for an attribute name.
-- "/", ">" and whitespace end the name, which is emitted and the terminator
-- re-consumed by handle_after_attribute_name; "=" ends the name and moves on to
-- the value. End of input fails the route. Other characters are added to the
-- name, with ASCII uppercase letters lowercased.
function handle_attribute_name(self, this)
	if this == "" then
		return self:fail_route()
	end
	local ends_name = this == "/" or this == ">" or is_space(this)
	if ends_name or this == "=" then
		self:emit(Wikitext:new(self:pop_sublayer()))
		if ends_name then
			self.n.handler = handle_after_attribute_name
			return self:consume()
		end
		self.n.handler = handle_before_attribute_value
		return
	end
	if match(this, "^%u$") then
		self:emit(lower(this))
	else
		self:emit(this)
	end
end
-- HTML tag handler for the gap after an attribute name; whitespace is skipped.
-- Whenever the attribute turns out to have no value, an empty Wikitext node is emitted in the value's place.
function handle_after_attribute_name(self, this)
if this == "/" then
self:emit(Wikitext:new{})
self.n.handler = handle_self_closing_tag
elseif this == "=" then
self.n.handler = handle_before_attribute_value
elseif this == ">" then
self:emit(Wikitext:new{})
-- End of tag: store the collected attributes and finish.
local attributes = self:pop_sublayer()
self.n.attributes = attributes
return self:pop()
elseif this == "" then
return self:fail_route()
elseif not is_space(this) then
-- Another attribute name starts here.
self:emit(Wikitext:new{})
self:push_sublayer(handle_attribute_name)
return self:consume()
end
end
-- HTML tag handler for the gap between "=" and an attribute value; whitespace is skipped.
function handle_before_attribute_value(self, this)
if this == "\"" or this == "'" then
-- Quoted value: remember which quote character closes it.
self:push_sublayer(handle_quoted_attribute_value)
self:set("quoter", this)
elseif this == ">" then
-- Tag ends before any value: emit an empty value, store the attributes and finish.
self:emit(Wikitext:new{})
local attributes = self:pop_sublayer()
self.n.attributes = attributes
return self:pop()
elseif not is_space(this) then
-- Anything else (including end of input) is re-consumed by the unquoted value handler.
self:push_sublayer(handle_unquoted_attribute_value)
return self:consume()
end
end
-- HTML tag handler for a quoted attribute value.
-- The value runs until the quote character stored as `quoter`. HTML entities
-- are parsed (a failed one leaves a literal "&"); end of input fails the route.
function handle_quoted_attribute_value(self, this)
	if this == self.n.quoter then
		self:emit(Wikitext:new(self:pop_sublayer()))
		self.n.handler = handle_before_attribute_name
		return
	end
	if this == "" then
		return self:fail_route()
	end
	if this == "&" then
		this = self:html_entity() or "&"
	end
	self:emit(this)
end
-- HTML tag handler for an unquoted attribute value, which runs until whitespace or ">".
function handle_unquoted_attribute_value(self, this)
if this == "&" then
-- A failed entity leaves a literal "&".
self:emit(self:html_entity() or "&")
elseif this == ">" then
-- End of both the value and the tag: emit the value, then store the attributes and finish.
self:emit(Wikitext:new(self:pop_sublayer()))
local attributes = self:pop_sublayer()
self.n.attributes = attributes
return self:pop()
elseif this == "" then
return self:fail_route()
elseif is_space(this) then
self:emit(Wikitext:new(self:pop_sublayer()))
self.n.handler = handle_before_attribute_name
else
self:emit(this)
end
end
-- HTML tag handler for the character after a "/" inside a tag.
-- ">" completes a self-closing tag: `self_closing` is set, the popped sublayer
-- is stored as `attributes` if non-empty, and the layer is popped. Any other
-- character is re-consumed by handle_before_attribute_name.
function handle_self_closing_tag(self, this)
	if this ~= ">" then
		self.n.handler = handle_before_attribute_name
		return self:consume()
	end
	self.n.self_closing = true
	local attributes = self:pop_sublayer()
	if #attributes > 0 then
		self.n.attributes = attributes
	end
	return self:pop()
end
-- HTML tag handler for the first character after "</".
-- Must be an ASCII letter, which is stored lowercased as the start of the name
-- and marks the tag as an end tag; anything else fails the route.
function handle_end_tag_start(self, this)
	if not match(this, "^%a$") then
		return self:fail_route()
	end
	self.n["end"] = true
	self:emit(lower(this))
	self.n.handler = handle_end_tag_name
end
-- HTML tag handler for the name of an end tag.
-- "/", whitespace and ">" end the name, which is popped from its sublayer and
-- stored as `name`; ">" also finishes the tag, while the others leave the rest
-- to handle_end_tag_remainder. End of input fails the route. Other characters
-- are added to the name, with ASCII uppercase letters lowercased.
function handle_end_tag_name(self, this)
	if this == "" then
		return self:fail_route()
	end
	local closes_tag = this == ">"
	if closes_tag or this == "/" or is_space(this) then
		-- The sublayer must be popped before self.n is indexed for the assignment.
		local name = Wikitext:new(self:pop_sublayer())
		self.n.name = name
		if closes_tag then
			return self:pop()
		end
		self.n.handler = handle_end_tag_remainder
		return
	end
	if match(this, "^%u$") then
		self:emit(lower(this))
	else
		self:emit(this)
	end
end
-- HTML tag handler for whatever follows the name in an end tag: everything is
-- ignored up to ">", which finishes the tag. End of input fails the route.
function handle_end_tag_remainder(self, this)
	if this == "" then
		return self:fail_route()
	end
	if this == ">" then
		return self:pop()
	end
end
-- Route entry point for an HTML tag: installs handle_start and advances past the "<".
function Parser:do_html_tag()
self:set("handler", handle_start)
self:advance()
end
-- Attempts the "do_html_tag" route, emitting an HTMLTag node on success and a
-- literal "<" on failure.
function Parser:html_tag()
	local tag = self:get("do_html_tag")
	local token = "<"
	if tag ~= self.n.bad_route then
		token = HTMLTag:new(tag)
	end
	self:emit(token)
end
end
------------------------------------------------------------------------------------
--
-- Magic link
--
------------------------------------------------------------------------------------
-- Parsoid regexes:
-- ISBN: \bISBN$spaces((?:97[89]$spdash?)?(?:[0-9]$spdash?){9}[0-9Xx]\b
-- PMID/RFC: \b(?:RFC|PMID)$spaces([0-9]+)\b
-- where:
-- $spaces is (?:\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})++
-- $spdash is (?:-|\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})
do
-- Handlers.
local handle_prefix
local handle_whitespace
local handle_isbn13_number_first
local handle_isbn_number
local handle_isbn_spdash
local handle_isbn_end
local handle_other_number
-- Magic link handler which matches the input against the expected prefix
-- (self.n.prefix), one character per call. Once the whole prefix has matched,
-- the route fails if the token emitted before the link (self.n.prev) is a single
-- alphanumeric character; otherwise whitespace is expected next.
function handle_prefix(self, this)
	local pos, prefix = self.n.i + 1, self.n.prefix
	self.n.i = pos
	if sub(prefix, pos, pos) ~= this then
		return self:fail_route()
	end
	self:emit(this)
	if pos ~= #prefix then
		return
	end
	local prev = self.n.prev
	if type(prev) == "string" and umatch(prev, "^%w$") then
		return self:fail_route()
	end
	self.n.handler = handle_whitespace
end
-- Magic link handler for the mandatory whitespace between the prefix and the
-- number ($spaces in the Parsoid regexes).
-- A no-break space is accepted when written as a numeric entity, but not as the
-- named entity "&nbsp;". The entity's `code` is its full source text, so that is
-- what the named form is compared against.
-- The number may only start once at least one spacing character has been seen
-- (`ws_found`); all the whitespace is normalized to a single emitted space.
function handle_whitespace(self, this)
	if this == "&" then
		this = self:html_entity()
		if (
			not this or
			this.code == "&nbsp;" or
			rawstring(this) ~= "\194\160"
		) then
			return self:fail_route()
		end
		self.n.ws_found = true
	elseif d.SPACE_SEPARATOR[this] then
		self.n.ws_found = true
	elseif self.n.ws_found and match(this, "^%d$") then
		self:emit(" ")
		if self.n.prefix == "ISBN" then
			-- ISBNs have their own grammar, which is parsed as a sub-route.
			local number = self:get("do_isbn_number")
			if number == self.n.bad_route then
				return self:fail_route()
			end
			self:emit_tokens(number)
			return self:pop()
		end
		self.n.handler = handle_other_number
		return self:consume()
	else
		-- Includes a digit directly after the prefix (e.g. "RFC123").
		return self:fail_route()
	end
end
-- Handler for the first three digits of an ISBN13 number, which must be "978" or
-- "979". spdash is not allowed between these digits, so they are matched
-- strictly one after another; the layer is popped after the third.
function handle_isbn13_number_first(self, this)
	local pos = self.n.i + 1
	self.n.i = pos
	local mismatch = false
	if pos == 1 then
		mismatch = this ~= "9"
	elseif pos == 2 then
		mismatch = this ~= "7"
	elseif pos == 3 then
		mismatch = this ~= "8" and this ~= "9"
	end
	if mismatch then
		return self:fail_route()
	end
	self:emit(this)
	if pos == 3 then
		return self:pop()
	end
end
-- Handler for the ten-character run of an ISBN.
-- Characters 1-9 must be digits, and each may be followed by one spdash (dealt
-- with by the handle_isbn_spdash override). The 10th may be a digit or "X"/"x",
-- after which handle_isbn_end checks the word boundary.
function handle_isbn_number(self, this)
	local pos = self.n.i + 1
	self.n.i = pos
	if pos > 10 then
		return self:fail_route()
	end
	if pos == 10 then
		if not match(this, "^[%dXx]$") then
			return self:fail_route()
		end
		self:emit(this)
		self.n.handler = handle_isbn_end
		return
	end
	if not match(this, "^%d$") then
		return self:fail_route()
	end
	self:emit(this)
	self.n.override = handle_isbn_spdash
end
-- One-shot override handler for the optional separator ($spdash in the Parsoid
-- regex) after an ISBN digit: a hyphen, a spacing character or a no-break space
-- written as a numeric entity.
-- The named entity "&nbsp;" is not accepted. The entity's `code` is its full
-- source text, so that is what the named form is compared against.
-- Any other character is not a separator, and is re-consumed by the regular
-- handler.
function handle_isbn_spdash(self, this)
	self.n.override = nil
	if this == "&" then
		this = self:html_entity()
		if (
			not this or
			this.code == "&nbsp;" or
			rawstring(this) ~= "\194\160"
		) then
			return self:fail_route()
		end
		self:emit("\194\160")
	elseif this == "-" or d.SPACE_SEPARATOR[this] then
		self:emit(this)
	else
		return self:consume()
	end
end
-- Word boundary check after the last character of an ISBN: the route fails if
-- an alphanumeric character follows directly, and succeeds otherwise.
function handle_isbn_end(self, this)
	if umatch(this, "^%w$") then
		return self:fail_route()
	end
	return self:pop()
end
-- Handler for the number of a PMID or RFC magic link: ASCII digits are
-- collected until a non-digit is found. If that character is alphanumeric there
-- is no word boundary, so the route fails; otherwise the link is complete.
function handle_other_number(self, this)
	if match(this, "^%d$") then
		self:emit(this)
		return
	end
	if umatch(this, "^%w$") then
		return self:fail_route()
	end
	return self:pop()
end
-- Route entry point for a magic link. `this` is the character which triggered the attempt ("I", "P" or "R").
-- Installs handle_prefix, records the last emitted token as `prev` (used for the word boundary check in handle_prefix), and selects the expected prefix; `prefix` is false if `this` is none of the three letters.
-- The head is not advanced, so handle_prefix sees the triggering character first.
function Parser:do_magic_link(this)
self:set("handler", handle_prefix)
self.n.prev = self:emitted()
self.n.prefix = this == "I" and "ISBN" or
this == "P" and "PMID" or
this == "R" and "RFC"
self.n.i = 0
end
-- Route entry point for the number part of an ISBN.
-- An ISBN13 is tried first, as two sub-routes: the "978"/"979" prefix, then the remaining ten characters. If both succeed, their tokens are emitted and the layer is popped immediately.
-- Otherwise the number is parsed as an ISBN10 by handle_isbn_number, starting from a count of 0.
function Parser:do_isbn_number()
self:set("handler", handle_isbn_number)
local isbn13_first = self:get("do_isbn13_number_first")
if isbn13_first ~= self.n.bad_route then
local isbn13_rem = self:get("do_isbn13_number_remainder")
if isbn13_rem ~= self.n.bad_route then
self:emit_tokens(isbn13_first)
self:emit_tokens(isbn13_rem)
return self:pop()
end
-- Only the remainder failed: reset the head from the value stored on the first sub-route's layer (presumably rewinding to the start of the number - TODO confirm), so that the ISBN10 attempt can see the digits already matched.
self.head = isbn13_first.head
end
self.n.i = 0
end
-- Route entry point for the three-digit ISBN13 prefix: installs handle_isbn13_number_first with a count of 0. The head is not advanced.
function Parser:do_isbn13_number_first()
self:set("handler", handle_isbn13_number_first)
self.n.i = 0
end
-- Route entry point for the ten characters after an ISBN13 prefix: installs handle_isbn_number with a count of 0, and starts with the handle_isbn_spdash override so that a separator is allowed straight after the prefix. The head is advanced by one first.
function Parser:do_isbn13_number_remainder()
self:set("handler", handle_isbn_number)
self.n.i = 0
self.n.override = handle_isbn_spdash
self:advance()
end
-- Attempts the "do_magic_link" route for the character `this`.
-- Failure: `this` is emitted as plain text.
-- ISBN: emits a Wikilink to Special:BookSources/<number>, where the number is
-- made up of the digits and "X" found from token 6 onwards (uppercased).
-- PMID/RFC: emits a bracketed ExternalLink, with the number tokens inserted
-- into the relevant URL template.
-- The display text is always the matched tokens. On success the head is moved
-- back by one.
function Parser:magic_link(this)
	local result = self:get("do_magic_link", this)
	if result == self.n.bad_route then
		self:emit(this)
		return
	end
	local node
	if result.prefix == "ISBN" then
		local prefix = Prefix:new{Wikitext:new("Special")}
		local title = Wikitext:new("BookSources/")
		for i = 6, #result do
			local token = result[i]
			if match(token, "^[%dXx]$") then
				insert(title, upper(token))
			end
		end
		node = Wikilink:new{
			prefix = prefix,
			title = title,
			display = Wikitext:new(result)
		}
	else
		-- `first` is the index of the first number token (after the prefix and
		-- the space); `offset` converts a token index into the position in the
		-- URL at which that token is inserted.
		local url, scheme, first, offset
		if result.prefix == "PMID" then
			url = explode("//www.ncbi.nlm.nih.gov/pubmed/?dopt=Abstract")
			first, offset = 6, 25
		else
			url = explode("//tools.ietf.org/html/rfc")
			first, offset = 5, 21
			scheme = "https"
		end
		for n = first, #result do
			insert(url, n + offset, result[n])
		end
		node = ExternalLink:new{
			scheme = scheme and Wikitext:new(scheme) or nil,
			url = Wikitext:new(url),
			display = Wikitext:new(result),
			bracketed = true
		}
	end
	self:emit(node)
	self:advance(-1)
end
end
------------------------------------------------------------------------------------
--
-- Magic word
--
------------------------------------------------------------------------------------
do
-- Handlers.
local handle_start
local handle_body
local handle_end
-- Magic word handler for the second character, which must be "_" (completing
-- the opening "__").
function handle_start(self, this)
	if this == "_" then
		self.n.handler = handle_body
		return
	end
	return self:fail_route()
end
-- Magic word handler for the name: ASCII letters are collected, "_" may be the
-- start of the closing "__", and anything else fails the route.
function handle_body(self, this)
	if this == "_" then
		self.n.handler = handle_end
		return
	end
	if not match(this, "^%a$") then
		return self:fail_route()
	end
	self:emit(this)
end
-- Magic word handler for the character after a "_" inside the name.
-- A second "_" closes the magic word: the uppercased name is returned if the
-- name as written is in MAGIC_WORDS_CS, or if the uppercased name is in
-- MAGIC_WORDS_NOT_CS; otherwise the route fails.
-- A letter means the "_" was part of the name, so both are emitted and the name
-- continues. Anything else fails the route.
function handle_end(self, this)
	if this ~= "_" then
		if not match(this, "^%a$") then
			return self:fail_route()
		end
		self:emit("_")
		self:emit(this)
		self.n.handler = handle_body
		return
	end
	local written = self:concat()
	if d.MAGIC_WORDS_CS[written] then -- Case sensitive.
		return uupper(written)
	end
	local uppercased = uupper(written)
	if d.MAGIC_WORDS_NOT_CS[uppercased] then -- Case insensitive.
		return uppercased
	end
	return self:fail_route()
end
-- Route entry point for a magic word: installs handle_start, sets `no_magic_word` on the layer (checked at the top of Parser:magic_word) and advances past the first "_".
function Parser:do_magic_word()
self:set("handler", handle_start)
self.n.no_magic_word = true
self:advance()
end
-- Attempts to parse a magic word (e.g. __TOC__) at the current position.
-- If the current layer is flagged `no_magic_word`, or the route fails, the
-- current character is consumed normally instead.
-- On success, the layer is popped and the magic word is appended to
-- self.magic_words, which is created on first use. The existence check has to
-- look at the same table that is appended to, or else the list would be reset
-- each time and only the last magic word would survive.
function Parser:magic_word()
	if self.n.no_magic_word then
		return self:consume()
	end
	local magic_word = self:get("do_magic_word")
	if magic_word == self.n.bad_route then
		return self:consume()
	end
	self:pop()
	if not self.magic_words then
		self.magic_words = {}
	end
	insert(self.magic_words, magic_word)
end
end
------------------------------------------------------------------------------------
--
-- Newline
--
------------------------------------------------------------------------------------
-- If a newline is found, the current layer is retained as the main layer for the current parse, but sublayers are used for each subsequent newline. This allows finalize_line to do line-by-line postprocessing (matching Parsoid), which can then be emitted to the main layer once finalised.
-- Handles a newline: strips spaces and tabs already emitted at the end of the
-- line, finalizes the line, merges the tokens of any current sublayer into the
-- layer below, emits "\n" and opens a fresh sublayer for the next line.
function Parser:newline()
	-- Remove already-emitted whitespace before end.
	while true do
		local last = self:emitted()
		if last ~= " " and last ~= "\t" then
			break
		end
		self:remove()
	end
	self:finalize_line()
	if self.n.sublayer then
		self:emit_tokens(self:pop_sublayer())
	end
	self:emit("\n")
	self:push_sublayer()
end
------------------------------------------------------------------------------------
--
-- Multipart
--
------------------------------------------------------------------------------------
-- Parses input which may consist of several sections, by running Parser:parse repeatedly until the returned parser has its "end" field set.
-- `data` is the argument table for Parser:parse; `on_fail`, if given, is an alternative argument table which is used for any section where parsing with `data` does not succeed.
-- Returns the tokens of the only section if there is just one, or else a Multipart node containing every section.
function Parser:multipart(data, on_fail)
data.route[2] = true -- multipart
data.route[3] = 1 -- head
if on_fail then
data.allow_fail = true
on_fail.route[2] = true -- multipart
end
-- NOTE(review): `parser` starts out as 1, but is always overwritten before it is read.
local parser, ok, tokens, sections = 1
while true do
ok, tokens, parser = Parser:parse(data)
if not ok then
-- NOTE(review): this indexes `on_fail`, so it assumes `ok` can only be false when `on_fail` was given (i.e. when `allow_fail` was set above) - TODO confirm against Parser:parse.
on_fail.route[3] = data.route[3] -- head
tokens, parser = select(2, Parser:parse(on_fail))
end
if parser["end"] then
break
end
-- Another section follows: the next parse starts just after where this one stopped.
data.route[3] = parser.head + 1
sections = sections or {}
insert(sections, tokens)
end
-- `sections` only exists if the loop went round more than once; the final section still needs to be added to it.
if sections then
insert(sections, tokens)
return Multipart:new(sections)
end
return tokens
end
------------------------------------------------------------------------------------
--
-- Percent-encoding
--
------------------------------------------------------------------------------------
-- If decoding fails, this will normally cause the containing wikilink to fail, since any bytes decoded up to that point would decode to an invalid UTF-8 sequence on their own, which is invalid anywhere in a link. However, if decoding fails on the leading byte due to an invalid raw character, then the wikilink will not fail, because the link will not contain any valid percent-encodings. e.g. [[%0G]] is a valid link, but [[%C2%0G]] and [[foo#%80]] will both fail, since "%C2" must have a trailing byte and "%80" can't be a leading byte in UTF-8.
do
-- Handlers.
local handle_leading_byte
local handle_trailing_byte
local handle_digit
-- Percent-encoding handler for the first encoded byte.
-- If the two hex digits can't be parsed, the route fails with
-- `no_fail_wikilink` set, as there was no valid percent-encoding to begin with.
-- A byte that can't start a UTF-8 sequence (0x80-0xC1, or above 0xF4) fails the
-- route without the flag.
-- An ASCII byte is decoded straight away; otherwise the length of the sequence
-- is determined from the leading byte, and handle_trailing_byte collects the
-- rest.
-- The result of the "do_digit" route is compared against the bad route
-- directly, in the same way as in handle_trailing_byte.
function handle_leading_byte(self)
	local digits = self:get("do_digit")
	if digits == self.n.bad_route then
		self.n.no_fail_wikilink = true
		return self:fail_route()
	end
	local val = digits.val
	if val > 0x7F and val < 0xC2 or val > 0xF4 then
		return self:fail_route()
	end
	self:emit_tokens(digits)
	if val < 0x80 then
		self.n.char = char(val)
		return self:pop()
	end
	self.n.bytes = {val}
	self.n.num = val < 0xE0 and 2 or val < 0xF0 and 3 or 4
	self.n.handler = handle_trailing_byte
end
-- Percent-encoding handler for the continuation bytes of a multi-byte UTF-8
-- sequence, each of which must itself be percent-encoded.
-- Every continuation byte has to be in the range 0x80-0xBF, and the first one
-- is restricted further after the leading bytes 0xE0, 0xED, 0xF0 and 0xF4. Once
-- the expected number of bytes has been collected, they are combined into the
-- decoded character.
function handle_trailing_byte(self, this)
	if this ~= "%" then
		return self:fail_route()
	end
	local digits = self:get("do_digit")
	if digits == self.n.bad_route then
		return self:fail_route()
	end
	local val = digits.val
	if val < 0x80 or val > 0xBF then
		return self:fail_route()
	end
	if #self.n.bytes == 1 then
		local lead = self.n.bytes[1]
		if (
			lead == 0xE0 and val < 0xA0 or
			lead == 0xED and val > 0x9F or
			lead == 0xF0 and val < 0x90 or
			lead == 0xF4 and val > 0x8F
		) then
			return self:fail_route()
		end
	end
	self:emit_tokens(digits)
	local bytes = self.n.bytes
	insert(bytes, val)
	if #bytes == self.n.num then
		self.n.char = char(unpack(bytes))
		return self:pop()
	end
end
-- Handler for the two hex digits after a "%". After the second digit, the value
-- of the byte (from token 2 onwards, i.e. after the "%") is stored as `val` and
-- the layer is popped. A non-hex character fails the route.
function handle_digit(self, this)
	if not match(this, "^%x$") then
		return self:fail_route()
	end
	self:emit(this)
	local count = self.n.i + 1
	self.n.i = count
	if count ~= 2 then
		return
	end
	self.n.val = tonumber(self:concat(2), 16)
	return self:pop()
end
-- Route entry point for a percent-encoded character: installs handle_leading_byte. The head is not advanced.
function Parser:do_percent_encoding()
self:set("handler", handle_leading_byte)
end
-- Route entry point for a single percent-encoded byte: installs handle_digit with a count of 0, emits the "%" as the first token and advances past it.
function Parser:do_digit()
self:set("handler", handle_digit)
self.n.i = 0
self:emit("%")
self:advance()
end
-- Attempts the "do_percent_encoding" route.
-- Success: stores the concatenated tokens as `code` and returns a
-- PercentEncoding node.
-- Failure: returns "%" if the failed layer was flagged `no_fail_wikilink`, and
-- nil otherwise.
function Parser:percent_encoding()
	local percent = self:get("do_percent_encoding")
	if percent ~= self.n.bad_route then
		percent.code = concat(percent)
		return PercentEncoding:new(percent)
	end
	if self.n.bad_route.no_fail_wikilink then
		return "%"
	end
	return nil
end
end
------------------------------------------------------------------------------------
--
-- Strip marker
--
------------------------------------------------------------------------------------
do
local unstrip_nowiki = mw.text.unstripNoWiki
-- Handlers.
local handle_prefix
local handle_tag
local handle_hex_code
local handle_dec_code
local handle_suffix
-- Strip marker handler which matches the nine characters expected after the
-- opening "\127", one per call. Once they have all matched, the tag name
-- follows.
function handle_prefix(self, this)
	local pos = self.n.i + 1
	self.n.i = pos
	if sub("'\"`UNIQ--", pos, pos) ~= this then
		return self:fail_route()
	end
	self:emit(this)
	if pos == 9 then
		self.n.handler = handle_tag
	end
end
-- Strip marker handler for the tag name, which consists of ASCII lowercase
-- letters and ends at "-". The name (tokens from index 11) is stored as `tag`,
-- and decides whether a hexadecimal or decimal code follows; an unrecognised
-- name fails the route.
function handle_tag(self, this)
	if this ~= "-" then
		if not match(this, "^%l$") then
			return self:fail_route()
		end
		self:emit(this)
		return
	end
	local tag = self:concat(11)
	self.n.tag = tag
	self:emit("-")
	if d.STRIP_MARKERS_HEX[tag] then
		self.n.i = 0
		self.n.handler = handle_hex_code
	elseif d.STRIP_MARKERS_DEC[tag] then
		self.n.handler = handle_dec_code
	else
		return self:fail_route()
	end
end
-- Strip marker handler for the code of a hex-coded marker: exactly eight
-- characters, each a digit or an uppercase letter, followed by "-".
function handle_hex_code(self, this)
	if this ~= "-" then
		if not match(this, "^[%d%u]$") then
			return self:fail_route()
		end
		self:emit(this)
		self.n.i = self.n.i + 1
		return
	end
	if self.n.i ~= 8 then
		return self:fail_route()
	end
	self:emit("-")
	-- Ends -QINU`\"'\127 (one dash), which has just been matched, so the suffix
	-- is checked from its second character.
	self.n.i = 1
	self.n.handler = handle_suffix
end
-- Strip marker handler for the code of a decimal-coded marker: any number of
-- digits, followed by "-".
function handle_dec_code(self, this)
	if this == "-" then
		self:emit("-")
		-- Ends --QINU`\"'\127 (two dashes): only the first has been matched, so
		-- the suffix is checked from its first character.
		self.n.i = 0
		self.n.handler = handle_suffix
		return
	end
	if not match(this, "^%d$") then
		return self:fail_route()
	end
	self:emit(this)
end
-- Strip marker handler which matches the closing sequence, one character per
-- call, starting from wherever self.n.i was left by the code handler. The
-- marker is complete once the ninth character has matched.
function handle_suffix(self, this)
	local pos = self.n.i + 1
	self.n.i = pos
	if sub("-QINU`\"'\127", pos, pos) ~= this then
		return self:fail_route()
	end
	self:emit(this)
	if pos == 9 then
		return self:pop()
	end
end
-- Route entry point for a strip marker: installs handle_prefix with a count of 0, emits the opening "\127" as the first token and advances past it.
function Parser:do_strip_marker()
self:set("handler", handle_prefix)
self.n.i = 0
self:emit("\127")
self:advance()
end
-- Attempts the "do_strip_marker" route, returning a StripMarker node, or nil on
-- failure.
-- For nowiki markers, the marker text is unstripped and the result parsed as
-- nowiki content, which then replaces the marker's own tokens; the `tag` and
-- `head` fields are carried over to the replacement.
function Parser:strip_marker()
	local marker = self:get("do_strip_marker")
	if marker == self.n.bad_route then
		return nil
	end
	if marker.tag == "nowiki" then
		local head = marker.head
		local unstripped = unstrip_nowiki(concat(marker))
		marker = export.parse_nowiki(unstripped)
		marker.tag = "nowiki"
		marker.head = head
	end
	return StripMarker:new(marker)
end
end
------------------------------------------------------------------------------------
--
-- Wikilink
--
------------------------------------------------------------------------------------
do
-- Returns true if `this` is not permitted in a wikilink target: nil/false, the
-- empty string, U+FFFD (the replacement character), or anything matching
-- `pattern`.
local function is_invalid_target(this, pattern)
	if not this or this == "" then
		return true
	end
	if this == "\239\191\189" then -- U+FFFD Replacement Character
		return true
	end
	return match(this, pattern) ~= nil
end
do
local handle_target_decoding
local handle_target
local handle_target_whitespace
local handle_target_escape
local handle_capitalizer
local handle_multipart
local handle_end_after_target
local handle_default_display_text
local handle_after_pipe
local handle_rsqb_after_pipe
local handle_text
local handle_text_after_newline
local handle_end_after_text
local handle_end_after_extra_rsqb
local handle_trail
-- Resolves percent-encoding and HTML entities in a wikilink target.
-- Returns two values: the token to emit, and its decoded string form for validity checks. The second value is nil if the link should fail.
function handle_target_decoding(self, this)
if this == "%" then
this = self:percent_encoding()
if type(this) == "table" then
-- Feed the decoded string back in, so that a percent-encoded "&" or "%" is itself processed.
return handle_target_decoding(self, rawstring(this))
end
-- Either "%" (kept as a literal) or nil (failure).
return this, this
elseif this == "&" then
-- Entities in targets are parsed with percent-decoding enabled.
this = self:html_entity(true)
-- false means the entity has no character, which makes the link fail.
if this == false then
return nil
end
end
-- At this point, `this` is an entity node, an ordinary string, or nil (entity parse failed, leaving a literal "&").
return this or "&", type(this) == "table" and rawstring(this) or this or "&"
end
-- Main handler for the target part of a wikilink (first pass).
-- Deals with the characters that have a structural role, then validates and emits everything else. The fragment (after "#") has slightly looser rules on which characters are valid.
function handle_target(self, this)
if this == "'" then
this = self:apostrophes()
-- Record (stickily) that apostrophe markup was found on this layer.
self.n.apos = self.n.apos or this and true or nil
self:emit(this or "'")
return
elseif this == "\\" then
self.n.override = handle_target_escape
return
elseif this == "^" then
-- NOTE(review): unlike the branches above, this one does not return, so the "^" goes on to be validated and emitted below - confirm this is intended (handle_capitalizer is still a TODO).
self.n.override = handle_capitalizer
elseif self.unembedded_link then
if this == "/" then
self.n.override = handle_multipart
return
elseif this == "" then
-- End of input ends the target.
local ret = handle_default_display_text(self, true)
if ret then
return ret
end
self["end"] = true
return self:pop()
end
-- Only if not self.unembedded_link.
elseif this == "]" then
-- An empty target is not a link.
if #self.n == 0 then
return self:fail_route()
end
self.n.handler = handle_end_after_target
return
elseif this == "|" then
if #self.n == 0 then
return self:fail_route()
end
local wikilink = self:wikilink_target(Wikitext:new(self:pop_sublayer(), true))
if not wikilink then
return self:fail_route()
elseif wikilink.other then
return self:pop()
end
self.n.handler = handle_after_pipe
return
end
local decoded
if self.n.fragment then
-- "<" and ">" are valid as literals in fragments.
-- Here the raw character is checked before it is decoded.
if is_invalid_target(this, "^[%z\1-\31%[%]{|}\127]$") then
return self:fail_route()
end
this, decoded = handle_target_decoding(self, this)
if not decoded then
return self:fail_route()
end
else
-- Outside of the fragment, it's the decoded character which is checked.
this, decoded = handle_target_decoding(self, this)
if is_invalid_target(decoded, "^[%z\1-\31<>%[%]{|}\127]$") then
return self:fail_route()
end
end
if decoded == "#" then
self:emit(this)
self.n.fragment = true
elseif d.BIDI[decoded] then
-- Characters listed in d.BIDI are dropped.
return
elseif d.WIKILINK_SPACE[decoded] then
self:emit(this)
self.n.override = handle_target_whitespace
else
self:emit(this)
end
end
-- Override handler installed after a spacing character is emitted in a
-- wikilink target: further plain spaces are dropped, other spacing characters
-- are emitted, and the first non-spacing character removes the override and is
-- re-consumed by the regular handler.
function handle_target_whitespace(self, this)
	if this == " " then
		return
	end
	if not d.WIKILINK_SPACE[this] then
		self.n.override = nil
		return self:consume()
	end
	self:emit(this)
end
-- One-shot override handler for the character after a backslash in a wikilink
-- target. End of input is passed back to the regular handler. The escaped
-- character is emitted, and for "#", ":" and "\" the backslash is kept in front
-- of it, as it is still needed by the second pass.
function handle_target_escape(self, this)
	self.n.override = nil
	if this == "" then
		return self:consume()
	end
	-- Retain escape for second pass.
	local keep_escape = this == "#" or this == ":" or this == "\\"
	if keep_escape then
		self:emit("\\")
	end
	self:emit(this)
end
-- One-shot override handler for the character after "^" in a wikilink target. Not yet implemented: for now it only removes itself, so the character it receives is dropped.
function handle_capitalizer(self, this)
self.n.override = nil
-- TODO
end
-- One-shot override handler for the character after a "/" in an unembedded
-- link. A second "/" ends the current part of the link; otherwise the first "/"
-- was an ordinary character, so it is emitted and the current character is
-- consumed as normal.
function handle_multipart(self, this)
	self.n.override = nil
	if this ~= "/" then
		self:emit("/")
		return self:consume(this)
	end
	local ret = handle_default_display_text(self, true)
	if ret then
		return ret
	end
	return self:pop()
end
-- Handler for the character after a "]" which followed the target: a second "]" is required to close the link, which has no pipe and so uses default display text.
function handle_end_after_target(self, this)
if this ~= "]" then
return self:fail_route()
end
-- Generates the target and self.n.display; a non-nil return value means the route has already been ended (failed, or popped as a non-standard link).
local ret = handle_default_display_text(self)
if ret then
return ret
end
local display = self.n.display
-- Push self.n.display onto the stack for the trail.
-- This is done by hand: the display node is given the fields of a layer, placed at the top of the stack, and made the current layer, so that handle_trail can append to it.
display.handler = handle_trail
display.head = self.head
-- NOTE(review): `route` is set to the handler function here - confirm this is what the parser expects of a layer.
display.route = handle_trail
local len = self.len + 1
self[len] = display
self.n = display
self.len = len
end
-- Finishes a link with no pipe, where the raw target doubles as the display text.
-- Pops the sublayer containing the target, resolves it with self:wikilink_target, and stores the display text as self.n.display.
-- Returns nothing on success. If the target is invalid, returns the result of failing the route; if wikilink_target flags the link as `other`, returns the result of popping the layer.
function handle_default_display_text(self, unembedded_link)
local raw_display = self:pop_sublayer()
-- Generate the target using a clone of raw_display, in case it gets trashed.
local wikilink = self:wikilink_target(
Wikitext:new({unpack(raw_display)}, true),
unembedded_link
)
if not wikilink then
return self:fail_route()
elseif wikilink.other then
return self:pop()
end
self.n.display = Wikitext:new(raw_display, true)
-- Style apostrophes are parsed before the trail is added.
self:substitute_apostrophes(self.n.display)
end
-- Handler for the first character after the pipe in a wikilink. A "]" is held
-- back until the next character is known; anything else starts the link text,
-- which is collected in a new sublayer.
function handle_after_pipe(self, this)
	if this ~= "]" then
		self:push_sublayer(handle_text)
		return self:consume()
	end
	self.n.handler = handle_rsqb_after_pipe
end
-- Handler for the character after "|]". A second "]" (i.e. "|]]") fails the
-- route; otherwise the held-back "]" becomes the first character of the link
-- text, and the current character is consumed by handle_text.
function handle_rsqb_after_pipe(self, this)
	if this ~= "]" then
		self:push_sublayer(handle_text)
		self:emit("]")
		return self:consume()
	end
	return self:fail_route()
end
-- Note: except for trails, sortkeys are parsed like display text, since Parsoid parses them before doing the category logic.
-- Handler for the text after the pipe in a wikilink (display text, or the sortkey for categories).
function handle_text(self, this)
if this == "\n" then
self:newline()
-- The start of the next line needs special handling (horizontal rules and headings).
self.n.override = handle_text_after_newline
elseif this == "&" then
self:emit(self:html_entity() or "&")
elseif this == "'" then
this = self:apostrophes()
-- Record (stickily) that apostrophe markup was found on this layer.
self.n.apos = self.n.apos or this and true or nil
self:emit(this or "'")
elseif this == "<" then
self:html_tag()
elseif this == "[" then
-- "[[" inside the text fails the link, unless it's a file link.
-- NOTE(review): the length check uses `self.n.len`, whereas handle_target uses `#self.n`, and `len` is otherwise only seen as a field of the parser itself (self.len) - confirm that layers have a `len` field, since comparing nil with a number would throw an error.
if self.n.len > 0 and self:emitted() == "[" then
if self.n.other == "file" then
-- TODO
else
return self:fail_route()
end
end
-- A lone "[" means that an extra "]" may turn up before the closing "]]".
self.n.extra_rsqb = true
self:emit("[")
elseif this == "]" then
self.n.handler = handle_end_after_text
if self.n.extra_rsqb then
-- Check for "]]]": if found, the first "]" belongs to the text, and the link is closed by the other two.
local end_of_text = self:get("do_wikilink_end_after_extra_rsqb")
if end_of_text ~= self.n.bad_route then
self:emit_tokens(end_of_text)
return self:consume()
end
end
elseif this == "{" then
-- TODO: table
elseif this == "|" and self.n.other == "file" then
-- TODO
elseif this == "\127" then
self:emit(self:strip_marker() or "?")
elseif this == "" then
-- The link was never closed.
return self:fail_route()
else
self:emit(this)
end
end
-- Override used right after a newline inside link text. Spaces and tabs are
-- skipped; the first other token clears the override and either starts a
-- horizontal rule ("-"), starts a heading ("="), or is re-consumed.
function handle_text_after_newline(self, this)
	local is_blank = this == " " or this == "\t"
	if not is_blank then
		self.n.override = nil
		if this == "=" then
			self:heading()
		elseif this == "-" then
			self:horizontal_rule()
		else
			return self:consume()
		end
	end
end
-- Handles the token after a "]" seen in link text.
-- A second "]" closes the link: the line is finalized, then a category link
-- stores its sortkey and pops, while any other link moves on to the trail.
-- Otherwise the "]" was literal: it is emitted and text handling resumes.
function handle_end_after_text(self, this)
	if this ~= "]" then
		self:emit("]")
		self.n.handler = handle_text
		return self:consume()
	end
	-- Style apostrophes are parsed before the trail is added.
	-- This is (bizarrely) even applied to sortkeys.
	self:finalize_line()
	if self.n.other ~= "category" then
		self.n.handler = handle_trail
		return
	end
	local sortkey = Wikitext:new(self:pop_sublayer())
	self.n.sortkey = sortkey
	return self:pop()
end
-- Counts consecutive "]" tokens in the layer's `i` field and pops once two
-- have been seen. Any other token fails the route.
function handle_end_after_extra_rsqb(self, this)
	if this ~= "]" then
		return self:fail_route()
	end
	self.n.i = self.n.i + 1
	if self.n.i == 2 then
		return self:pop()
	end
end
-- Collects the link trail: single letters matched by %a are emitted into the
-- display sublayer. The first other token ends the trail: the sublayer
-- becomes the display, the head steps back one position and the layer pops.
function handle_trail(self, this)
	if match(this, "^%a$") then
		self:emit(this)
		return
	end
	local display = Wikitext:new(self:pop_sublayer())
	self.n.display = display
	self:advance(-1)
	return self:pop()
end
-- Route entry point for a wikilink: installs handle_target as the handler,
-- moves the head to `head` when one is given, and opens a sublayer.
function Parser:do_wikilink(head)
	self:set("handler", handle_target)
	local start = head or self.head
	self.head = start
	self:push_sublayer()
end
-- Route used by handle_text when a "]" is seen after an extra "[" was
-- emitted: emits one "]", advances, and lets handle_end_after_extra_rsqb
-- count the closing brackets that follow, starting from zero.
function Parser:do_wikilink_end_after_extra_rsqb()
	self:set("handler", handle_end_after_extra_rsqb)
	local layer = self.n
	layer.i = 0
	self:emit("]")
	self:advance()
end
end
-- Second pass over wikilink target:
-- Get normalized prefixes: capitalization is ignored, and spacing characters + "_" become spaces.
-- Get any fragment.
-- Check for the colon trick.
-- Ignore style apostrophes.
do
-- Forward declarations, so the handlers defined below stay local to this
-- block and can refer to one another regardless of definition order.
local handle_target_decoding_2
local handle_prefix
local handle_target_2
local handle_target_escape_2
-- Declared under the name the handler is actually defined and used with, so
-- that its definition binds to this local instead of creating a global.
local handle_file_or_category
-- Normalizes one token of the target for the second pass.
-- Plain tokens are returned unchanged. Apostrophe nodes yield the sentinel
-- string "apostrophes". Any other table token is spliced into self.text at
-- the current head as the elements of its `char` list, and the first of
-- those elements is returned.
function handle_target_decoding_2(self, this)
	if type(this) ~= "table" then
		return this
	end
	if this.type == "apostrophes" then
		return "apostrophes"
	end
	-- Replace HTML entities and percent-encoding with the relevant characters.
	local decoded = this.char
	local start = self.head
	self.text[start] = decoded[1]
	for offset = 1, #decoded - 1 do
		insert(self.text, start + offset, decoded[offset + 1])
	end
	return decoded[1]
end
-- Reads one candidate prefix, up to the next ":".
-- Letters are lower-cased and runs of WIKILINK_SPACE characters collapse to
-- a single interior space. At the ":" the collected text is looked up, first
-- as a namespace and then as an interwiki; an empty prefix pops as-is.
-- End of input, style apostrophes, an unknown prefix, or a single-byte
-- character that is not alphanumeric all fail the route.
function handle_prefix(self, this)
	if this == "" then
		return self:fail_route()
	end
	local token = handle_target_decoding_2(self, this)
	if token == "apostrophes" then
		return self:fail_route()
	end
	if token == ":" then
		if #self.n == 0 then
			return self:pop()
		end
		local raw_prefix = concat(self.n)
		local namespace = load_data("Module:data/namespaces")[raw_prefix]
		if namespace then
			self.n.prefix_type = "namespace"
			-- Normalize namespace.
			if raw_prefix ~= namespace then
				self.n.normalized = Wikitext:new(explode(namespace))
			else
				self.n.normalized = self.n
			end
			self.n.str = namespace
			return self:pop()
		end
		local interwiki_type = load_data("Module:data/interwikis")[raw_prefix]
		if not interwiki_type then
			return self:fail_route()
		end
		self.n.prefix_type = interwiki_type
		self.n.normalized = self.n
		-- NOTE(review): the namespace lookup was falsy on this path, so `str`
		-- receives that falsy value; confirm whether raw_prefix was intended.
		self.n.str = namespace
		return self:pop()
	end
	-- Don't emit spaces at the start or end.
	if d.WIKILINK_SPACE[token] then
		if self.n.can_emit_space then
			self.n.do_emit_space = true
		end
		return
	end
	if self.n.do_emit_space then
		self:emit(" ")
		self.n.do_emit_space = nil
	end
	if #token ~= 1 then
		self:emit(ulower(token))
	elseif match(token, "^%w$") then
		self:emit(lower(token))
	else
		return self:fail_route()
	end
	self.n.can_emit_space = true
end
-- Second-pass handler for the part of the target after any prefixes.
-- A backslash makes handle_target_escape_2 handle the next token. The first
-- "#" closes the title and opens a sublayer for the fragment. At the end of
-- input the open sublayer becomes the fragment if a title exists, else the
-- title. Style apostrophes are dropped; other tokens are emitted.
function handle_target_2(self, this)
	if this == "\\" then
		self.n.override = handle_target_escape_2
		return
	end
	if this == "" then
		local layer = Wikitext:new(self:pop_sublayer())
		local slot = self.n.title and "fragment" or "title"
		self.n[slot] = layer
		return self:pop()
	end
	local token = handle_target_decoding_2(self, this)
	if token == "apostrophes" then
		return
	end
	if not self.n.title and token == "#" then
		local title = Wikitext:new(self:pop_sublayer())
		self.n.title = title
		self:push_sublayer()
		return
	end
	-- TODO (title only): add a title length counter and fail if too long.
	-- TODO (title only): "%" - check for percent-encoding format.
	-- TODO (title only): "&" - check for HTML entity format.
	-- TODO (title only): "." and "/" - check for dot slash notation.
	-- TODO (title only): "~" - check for 3+ consecutive tildes.
	self:emit(token)
end
-- One-shot override installed after a backslash: clears itself, then emits
-- the current token without further processing.
function handle_target_escape_2(self, this)
	local layer = self.n
	layer.override = nil
	self:emit(this)
end
-- Second-pass handler for the title of a file or category link.
-- At the end of input the sublayer becomes the title and the layer pops.
-- Apostrophe nodes are expanded back into `num` literal apostrophes, other
-- table tokens are emitted as their raw string, and plain tokens as-is.
function handle_file_or_category(self, this)
	if this == "" then
		local layer = Wikitext:new(self:pop_sublayer())
		self.n.title = layer
		return self:pop()
	end
	if type(this) ~= "table" then
		self:emit(this)
	elseif this.type == "apostrophes" then
		for _ = 1, this.num do
			self:emit("'")
		end
	else
		self:emit(rawstring(this))
	end
end
-- Route that reads a single prefix: installs handle_prefix on the current
-- layer.
function Parser:do_prefix()
	local layer = self.n
	layer.handler = handle_prefix
end
-- Second-pass route over a wikilink target. Repeatedly reads prefixes with
-- the "do_prefix" route, collecting their normalized forms, then hands the
-- rest of the target to handle_target_2, or to handle_file_or_category for
-- file and category links.
-- `unembedded_link`, when truthy, stops a "category" prefix from being
-- treated as a category link.
function Parser:do_wikilink_2(unembedded_link)
-- Only `colons` is initialised (to 0); the other three locals start as nil.
local colons, prefix, prefixes, prev_prefix_type = 0
while true do
prefix = self:get("do_prefix")
if prefix == self.n.bad_route then
-- No further prefix could be read.
break
elseif not prefixes then
-- Nothing has been collected yet.
if prefix.len == 0 then
-- Empty prefix: fail if `colons` is already 1, else set the colon trick.
if colons == 1 then
return self:fail_route()
end
self.n.colon_trick = true
elseif prefix.prefix_type == "current" then
self.n.colon_trick = true
else
prefixes = Prefix:new{}
insert(prefixes, prefix.normalized)
prev_prefix_type = prefix.prefix_type
end
colons = 1
elseif #prefixes == 1 and prefix.len == 0 then
-- Empty prefix after exactly one collected prefix: only allowed once,
-- and only when that prefix had type "local" or "external".
if (
colons == 2 or
colons == 1 and not (
prev_prefix_type == "local" or
prev_prefix_type == "external"
)
) then
return self:fail_route()
end
colons = 2
elseif prefix.len > 0 then
insert(prefixes, prefix.normalized)
colons = 1
prev_prefix_type = prefix.prefix_type
end
self:advance()
-- Category prefix in an unembedded link always links to the category.
if not self.n.colon_trick and (
prefix.str == "file" or
prefix.str == "category" and not unembedded_link
) then
-- File or category link: the rest of the target is read by a dedicated
-- handler, and the collected prefixes are not stored.
self.n.handler = handle_file_or_category
self.n.other = prefix.str
self:push_sublayer()
return
elseif prefix.prefix_type == "namespace" then
-- Stop reading prefixes once a namespace has been read.
break
end
end
-- `prefixes` is still nil here if none were collected.
self.n.prefix = prefixes
self.n.handler = handle_target_2
self:push_sublayer()
end
-- Runs the second pass ("do_wikilink_2") over `target` in a separate parser
-- and copies the result onto the current layer. Returns the parsed node, or
-- nil if that route failed.
-- If apostrophes were recorded on this layer, `target` is first passed to
-- handle_odd_number_italics_and_bold and the record is cleared.
function Parser:wikilink_target(target, unembedded_link)
	if self.n.apos then
		self:handle_odd_number_italics_and_bold(target)
		self.n.apos = nil
	end
	local subparser = Parser:new(target)
	local result = subparser:get("do_wikilink_2", unembedded_link)
	if result == subparser.bad_route then
		return nil
	end
	self.n.title = result.title
	local other = result.other
	if other then
		self.n.other = other
	else
		self.n.colon_trick = result.colon_trick
		self.n.prefix = result.prefix
		self.n.fragment = result.fragment
	end
	return result
end
end
-- Tries to parse a wikilink. On failure the two opening brackets are emitted
-- as literal text and the head advances; on success a Category or Wikilink
-- node is emitted, depending on the result's `other` field.
function Parser:wikilink(head)
	local link = self:get("do_wikilink", head)
	if link == self.n.bad_route then
		for _ = 1, 2 do
			self:emit("[")
		end
		self:advance()
		return
	end
	if link.other == "category" then
		self:emit(Category:new(link))
	else
		self:emit(Wikilink:new(link))
	end
end
do
	-- Traversal loop for link templates: reads tokens until one of the
	-- routines returns a layer, then steps the head back by one and returns
	-- that layer. Carriage returns and "<" get dedicated handling, and NUL
	-- characters are skipped.
	local function traverse_link_template(self)
		local layer
		while not layer do
			local token = self:read()
			if token == "\r" then
				layer = self:carriage_return("\r")
			elseif token == "<" then
				layer = self:comment()
			elseif token ~= "\0" then
				layer = self:consume(token)
			end
			self:advance()
		end
		self:advance(-1)
		return layer
	end

	-- Route that parses text as an unembedded wikilink starting at `head`.
	-- The first argument is ignored.
	function Parser:do_link_template(_, head)
		self.head = head
		self.unembedded_link = true
		self.traverse = traverse_link_template
		return self:do_wikilink()
	end
end
-- Parses `str` as a multipart input with two parts over the same exploded
-- text: a Wikilink part using the "do_link_template" route, followed by a
-- Wikitext part using the "do_default" route.
function export.parse_link_template(str)
	local chars = explode(str)
	local link_part = {
		text = chars,
		node = {Wikilink},
		route = {"do_link_template"}
	}
	local text_part = {
		text = chars,
		node = {Wikitext},
		route = {"do_default"}
	}
	return Parser:multipart(link_part, text_part)
end
end
------------------------------------------------------------------------------------
--
-- Parser
--
------------------------------------------------------------------------------------
do
-- Handlers.
-- Forward-declared as locals so that the handler functions defined below
-- can refer to one another regardless of definition order.
local handle_plaintext
local handle_plaintext_whitespace
local handle_plaintext_after_newline
local handle_multipart
-- Default handler for ordinary wikitext. Each recognised character is
-- dispatched to the matching parser routine; anything else is emitted as-is.
-- At the end of input the line is finalized, the parser is flagged as ended
-- and the layer pops.
function handle_plaintext(self, this)
	if this == "" then
		self:finalize_line()
		self["end"] = true
		return self:pop()
	end
	if this == " " or this == "\t" then
		self:emit(this)
		self.n.override = handle_plaintext_whitespace
		return
	end
	if this == "\n" then
		self:newline()
		self.n.override = handle_plaintext_after_newline
		return
	end
	if this == "&" then
		self:emit(self:html_entity() or "&")
		return
	end
	if this == "'" then
		local style = self:apostrophes()
		-- Record that at least one apostrophe run was parsed in this layer.
		self.n.apos = self.n.apos or style and true or nil
		self:emit(style or "'")
		return
	end
	if this == "/" and self.multi then
		-- In multipart mode the next token decides whether this is "//".
		self.n.override = handle_multipart
		return
	end
	if this == ":" then
		self:free_external_link()
		return
	end
	if this == "<" then
		self:html_tag()
		return
	end
	if this == "I" or this == "P" or this == "R" then
		self:magic_link(this)
		return
	end
	if this == "[" then
		self:bracketed_external_link()
		return
	end
	if this == "\127" then
		self:emit(self:strip_marker() or "?")
		return
	end
	self:emit(this)
end
-- Override installed after a space or tab has been emitted: further spaces
-- and tabs are dropped, and the first other token clears the override and
-- is re-consumed.
function handle_plaintext_whitespace(self, this)
	if this == " " or this == "\t" then
		return
	end
	self.n.override = nil
	return self:consume(this)
end
-- One-shot override for the first token of a new line. List and indentation
-- characters emit the corresponding marker, "-" and "=" start a horizontal
-- rule or heading, and anything else is re-consumed by the normal handler.
function handle_plaintext_after_newline(self, this)
	self.n.override = nil
	if this == "#" then
		self:emit(self.OrderedListMarker)
		return
	end
	if this == "*" then
		self:emit(self.UnorderedListMarker)
		return
	end
	if this == "-" then
		self:horizontal_rule()
		return
	end
	if this == ":" then
		self:emit(self.IndentationMarker)
		return
	end
	if this == ";" then
		self:emit(self.DescriptionListMarker)
		self.n.dl = true
		return
	end
	if this == "=" then
		self:heading()
		return
	end
	return self:consume(this)
end
-- One-shot override installed after a "/" in multipart mode. A second "/"
-- ends the current part: the line is finalized and the layer pops.
-- Otherwise the held-back "/" is emitted and the token is re-consumed.
function handle_multipart(self, this)
	self.n.override = nil
	if this ~= "/" then
		self:emit("/")
		return self:consume(this)
	end
	self:finalize_line()
	return self:pop()
end
do
	-- Default traversal loop: reads tokens until one of the routines returns
	-- a layer, then steps the head back by one and returns that layer.
	-- Carriage returns, "<" and "_" get dedicated handling, and NUL
	-- characters are skipped.
	local function traverse_default(self)
		local layer
		while not layer do
			local token = self:read()
			if token == "\r" then
				layer = self:carriage_return("\r")
			elseif token == "<" then
				layer = self:comment()
			elseif token == "_" then
				layer = self:magic_word()
			elseif token ~= "\0" then
				layer = self:consume(token)
			end
			self:advance()
		end
		self:advance(-1)
		return layer
	end

	-- Default route: installs the default traversal and the plaintext
	-- handler. When `multipart` is truthy it is stored as self.multi and the
	-- head is set to `head`.
	function Parser:do_default(multipart, head)
		self.traverse = traverse_default
		if multipart then
			self.head = head
			self.multi = multipart
		end
		self:set("handler", handle_plaintext)
	end
end
-- Parses `str` as wikitext with the "do_default" route.
-- With a truthy `multipart`, returns whatever Parser:multipart returns;
-- otherwise returns only the second value returned by Parser:parse.
function export.parse(str, multipart)
	local data = {
		text = explode(str),
		node = {Wikitext},
		route = {"do_default"}
	}
	if not multipart then
		local _, tree = Parser:parse(data)
		return tree
	end
	return Parser:multipart(data)
end
end
do
-- Forward declarations: handle_nowiki refers to handle_multipart, which is
-- defined after it.
local handle_nowiki
local handle_multipart
-- Handler for nowiki text, where only a few characters are special.
-- Carriage returns go to carriage_return, "&" may start an HTML entity, and
-- in multipart mode "/" may start a part separator. NUL characters are
-- dropped and everything else is emitted literally. At the end of input the
-- line is finalized, the parser is flagged as ended and the layer pops.
function handle_nowiki(self, this)
	if this == "\r" then
		return self:carriage_return("\r")
	elseif this == "&" then
		self:emit(self:html_entity() or "&")
	elseif this == "/" and self.multi then
		self.n.override = handle_multipart
	elseif this == "" then
		self:finalize_line()
		self["end"] = true
		return self:pop()
	-- Compare against the NUL escape "\0". The previous literal "/0" is two
	-- characters long, so it could never match a single token and NULs were
	-- emitted instead of being skipped.
	elseif this ~= "\0" then
		self:emit(this)
	end
end
-- One-shot override installed after a "/" in multipart nowiki text. A second
-- "/" ends the current part by popping the layer; otherwise the held-back
-- "/" is emitted and the token is re-consumed.
function handle_multipart(self, this)
	self.n.override = nil
	if this ~= "/" then
		self:emit("/")
		return self:consume(this)
	end
	return self:pop()
end
-- Nowiki route: installs handle_nowiki as the handler. When `multipart` is
-- truthy it is stored as self.multi and the head is set to `head`.
function Parser:do_nowiki(multipart, head)
	if multipart then
		self.head = head
		self.multi = multipart
	end
	self:set("handler", handle_nowiki)
end
-- Parses `str` with the "do_nowiki" route and returns only the second value
-- returned by Parser:parse.
function export.parse_nowiki(str)
	local _, tree = Parser:parse({
		text = explode(str),
		node = {Wikitext},
		route = {"do_nowiki"}
	})
	return tree
end
end
return export