Module:Unicode data/patterns
Appearance
- The following documentation is located at Module:Unicode data/patterns/documentation. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
Generates the contents of a character set for Lua patterns or regular expressions, to be put inside set notation: [...] or [^...].
{{#invoke:Unicode data/patterns|make_pattern|module=Module:Unicode data submodule|value=value to look for in singles and ranges}}
- Pattern for Latin script as defined by Unicode
{{#invoke:Unicode data/patterns|make_pattern|module=scripts|value=Latn}}
: ⅎⁱⁿℲªºꟓA-Za-zÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿₐ-ₜK-ÅⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-Ꟑ-ꟑꟕ-ꟲ-ꟿꬰ-ꭚꭜ-ꭤꭦ-ꭩff-stA-Za-z𐞀-𐞅𐞇-𐞰𐞲-𐞺𝼀-𝼞𝼥-𝼪
- Pattern for "Common" script as defined by Unicode
{{#invoke:Unicode data/patterns|make_pattern|module=scripts|value=Zyyy}}
: ·،㋿〆ーꧏ؛؟฿ʹ×÷;🟰᳓ـ꤮𝒻᳡𝒢꭛𝕆΅゠ᳺ᠅჻ -@[-`{-©«-¹»-¿ʹ-˟˥-˩ˬ-˿।-॥࿕-࿘᛫-᛭᜵-᜶᠂-᠃ᳩ-ᳬᳮ-ᳳᳵ-᳷ ---⁰⁴-⁾₀-₎₠-⃀℀-℥℧-℩ℬ-ℱℳ-⅍⅏-⅟↉-↋←-⑀-⑊①-⟿⤀-⭳⭶-⮕⮗-⯿⸀-⹝⿰-〄〈-〠〰-〷〼-〿゛-゜・-ー㆐-㆟㇀-㈠-㉟㉿-㋏㍘-㏿䷀-䷿꜀-꜡ꞈ-꞊꠰-꠹꭪-꭫﴾-﴿︐-︙︰-﹒﹔-﹦﹨-﹫!-@[-`{-・゙-゚¢-₩│-○-�𐄀-𐄂𐄇-𐄳𐄷-𐄿𐆐-𐆜𐇐-𐇼𐋡-𐋻---𜽐-𜿃𝀀-𝃵𝄀-𝄦𝄩-𝅦𝅪-𝆃-𝆄𝆌-𝆩𝆮-𝇪𝋀-𝋓𝋠-𝋳𝌀-𝍖𝍠-𝍸𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕊-𝕐𝕒-𝚥𝚨-𝟋𝟎-𝟿𞱱-𞲴𞴁-𞴽🀀-🀫🀰-🂓🂠-🂮🂱-🂿🃁-🃏🃑-🃵🄀-🆭🇦-🇿🈁-🈂🈐-🈻🉀-🉈🉐-🉑🉠-🉥🌀-🛗🛜-🛬🛰-🛼🜀-🝶🝻-🟙🟠-🟫🠀-🠋🠐-🡇🡐-🡙🡠-🢇🢐-🢭🢰--🤀-🩓🩠-🩭🩰-🩼🪀--🫎--🫰-🫸🬀-🮒🮔-🯹-
- Pattern for titlecase letters as defined by Unicode
{{#invoke:Unicode data/patterns|make_pattern|module=category|value=Lt}}
: DžLjNjDzᾼῌῼᾈ-ᾏᾘ-ᾟᾨ-ᾯ
-- Table of public functions; it is returned at the end of the module.
local export = {}
-- Array constructor from Module:array; used below for lists built with
-- :insert and joined with :concat.
local Array = require "Module:array"
-- Formats a code point as a hexadecimal HTML numeric character reference,
-- zero-padded to at least four digits (for example 0x41 -> "&#x0041;").
local function numeric_character_reference(code_point)
	local reference = string.format("&#x%04X;", code_point)
	return reference
end
-- Groups every entry of `data_module` by its value.
--
-- Returns a table mapping each value to an Array of { low, high } pairs.
-- Entries taken from `data_module.singles` become pairs with low == high;
-- entries taken from `data_module.ranges` are read as { low, high, value }.
-- The returned table creates and stores an empty Array for any key that is
-- looked up while absent.
function export.all_ranges_per_value(data_module)
	local auto_array = {}
	function auto_array.__index(tbl, missing_key)
		local fresh = Array()
		tbl[missing_key] = fresh
		return fresh
	end
	local grouped = setmetatable({}, auto_array)

	for cp, val in pairs(data_module.singles) do
		local bucket = grouped[val]
		bucket:insert { cp, cp }
	end

	for _, entry in ipairs(data_module.ranges) do
		local first, last, val = unpack(entry)
		local bucket = grouped[val]
		bucket:insert { first, last }
	end

	return grouped
end
-- Collects the entries of `data_module` whose value equals `value_to_find`
-- (compared with ==).
--
-- Returns an Array of { low, high } pairs: matching keys of
-- `data_module.singles` first (as pairs with low == high, in `pairs` order),
-- followed by matching entries of `data_module.ranges` in list order.
function export.ranges_per_value(data_module, value_to_find)
	local matches = Array()

	-- Appends { low, high } only when the entry carries the wanted value.
	local function collect(low, high, val)
		if val == value_to_find then
			matches:insert { low, high }
		end
	end

	for cp, val in pairs(data_module.singles) do
		collect(cp, cp, val)
	end

	for _, entry in ipairs(data_module.ranges) do
		collect(unpack(entry))
	end

	return matches
end
-- Sorts `ranges` in place by ascending first element of each range.
local function sort_ranges(ranges)
	local function starts_earlier(left, right)
		return left[1] < right[1]
	end
	table.sort(ranges, starts_earlier)
end
-- Builds the text that goes inside [...] or [^...] in a Lua pattern or
-- regular expression.
--
-- `ranges` is a list of { low, high } code point pairs. A pair with
-- low == high is emitted as one character; any other pair is emitted as
-- "low-high". When `char_ref` is truthy each code point is written as a
-- numeric character reference, otherwise as the character itself.
-- NOTE(review): characters are emitted without any escaping, so code points
-- that are special inside a set are written as-is.
local function make_pattern(ranges, char_ref)
	-- Renders a single code point in the requested form.
	local function encode(code_point)
		if char_ref then
			return numeric_character_reference(code_point)
		end
		return mw.ustring.char(code_point)
	end

	local pieces = Array()
	for _, range in ipairs(ranges) do
		local low, high = range[1], range[2]
		pieces:insert(encode(low))
		if low ~= high then
			pieces:insert "-"
			pieces:insert(encode(high))
		end
	end
	return pieces:concat()
end
-- Removes the code points U+0000-U+001F from `ranges`, in place.
-- Treats all characters U+0000-U+001F as invalid in wikitext, but only some are.
--
-- `ranges` is a list of { low, high } code point pairs.
--   * A range lying entirely within U+0000-U+001F is removed from the list.
--   * A range that starts within U+0000-U+001F but ends above it is trimmed
--     so that it starts at U+0020 (the pair table itself is modified).
-- Every range is examined, so the list does not need to be sorted and any
-- number of ranges may contain such characters.
local function sanitize_ranges(ranges)
	-- Walk backwards so that table.remove never shifts an entry that has not
	-- been examined yet.
	for i = #ranges, 1, -1 do
		local range = ranges[i]
		if 0 <= range[1] and range[1] <= 0x1F then
			if 0 <= range[2] and range[2] <= 0x1F then
				table.remove(ranges, i)
			else
				range[1] = 0x20
			end
		end
	end
end
-- Template entry point: returns the set contents (literal characters) for
-- every code point whose value in the given data module equals |value=.
--
-- frame.args.module: name of a submodule of Module:Unicode data; it is
--                    appended to "Module:Unicode data/" and loaded with require.
-- frame.args.value:  value to look for in the module's singles and ranges.
-- Raises an error when either parameter is missing.
function export.make_pattern(frame)
	local module_name = frame.args.module
	if not module_name then
		error("Provide name of submodule of Module:Unicode data in |module= parameter.")
	end
	local value = frame.args.value
	if not value then
		error("Provide value to search for in |value= parameter.")
	end
	local data_module = require("Module:Unicode data/" .. module_name)
	local ranges = export.ranges_per_value(data_module, value)
	-- Singles are gathered with pairs(), whose order is unspecified. Sorting
	-- makes the output deterministic and gives sanitize_ranges ordered input,
	-- matching what show_all_patterns does.
	sort_ranges(ranges)
	sanitize_ranges(ranges)
	return make_pattern(ranges, false)
end
-- Template entry point: returns a wikitext bullet list with one item per
-- value found in the given data module. Each item shows the value followed
-- by its set contents, written as numeric character references inside
-- <code> tags.
--
-- frame.args.module: name of a submodule of Module:Unicode data; it is
--                    appended to "Module:Unicode data/" and loaded with require.
-- Raises an error when |module= is missing.
function export.show_all_patterns(frame)
	local module_name = frame.args.module
	if not module_name then
		error("Provide name of submodule of Module:Unicode data in |module=.")
	end

	local data_module = require("Module:Unicode data/" .. module_name)
	local by_value = export.all_ranges_per_value(data_module)
	for _, value_ranges in pairs(by_value) do
		sort_ranges(value_ranges)
	end

	local lines = Array()
	local sorted_pairs = require("Module:table").sortedPairs
	for value, value_ranges in sorted_pairs(by_value) do
		local pattern = make_pattern(value_ranges, true)
		lines:insert("\n* " .. value .. ": ")
		lines:insert("<code>")
		lines:insert(pattern)
		lines:insert("</code>")
	end
	return lines:concat()
end
-- Expose the public functions collected on `export`.
return export