Jump to content

Module:Unicode data/patterns

From Wiktionary, the free dictionary

Generates the contents of a character set for use in Lua patterns or regular expressions; the output is meant to be placed inside set notation: [...] or [^...].

{{#invoke:Unicode data/patterns|make_pattern|module=Module:Unicode data submodule|value=value to look for in singles and ranges}}

Pattern for Latin script as defined by Unicode
{{#invoke:Unicode data/patterns|make_pattern|module=scripts|value=Latn}}: ⅎⁱⁿℲªºꟓA-Za-zÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿₐ-ₜK-ÅⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꟍꟐ-ꟑꟕ-Ƛꟲ-ꟿꬰ-ꭚꭜ-ꭤꭦ-ꭩff-stA-Za-z𐞀-𐞅𐞇-𐞰𐞲-𐞺𝼀-𝼞𝼥-𝼪
Pattern for "Common" script as defined by Unicode
{{#invoke:Unicode data/patterns|make_pattern|module=scripts|value=Zyyy}}: ·،㋿〆ーꧏ؛۝؟฿ʹ㇯×÷࣢;🟰󠀁᳓ـ꤮𝒻᳡𝒢꭛𝕆΅゠ᳺ᠅჻؅ -@[-`{-©«-¹»-¿ʹ-˟˥-˩ˬ-˿।-॥࿕-࿘᛫-᛭᜵-᜶᠂-᠃ᳩ-ᳬᳮ-ᳳᳵ-᳷ -​‎-⁤⁦-⁰⁴-⁾₀-₎₠-⃀℀-℥℧-℩ℬ-ℱℳ-⅍⅏-⅟↉-↋←-␩⑀-⑊①-⟿⤀-⭳⭶-⮕⮗-⯿⸀-⹝⿰-〄〈-〠〰-〷〼-〿゛-゜・-ー㆐-㆟㇀-㇥㈠-㉟㉿-㋏㍘-㏿䷀-䷿꜀-꜡ꞈ-꞊꠰-꠹꭪-꭫﴾-﴿︐-︙︰-﹒﹔-﹦﹨-﹫!-@[-`{-・゙-゚¢-₩│-○-�𐄀-𐄂𐄇-𐄳𐄷-𐄿𐆐-𐆜𐇐-𐇼𐋡-𐋻𛲠-𛲣𜰀-𜳹𜴀-𜺳𜽐-𜿃𝀀-𝃵𝄀-𝄦𝄩-𝅦𝅪-𝅺𝆃-𝆄𝆌-𝆩𝆮-𝇪𝋀-𝋓𝋠-𝋳𝌀-𝍖𝍠-𝍸𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕊-𝕐𝕒-𝚥𝚨-𝟋𝟎-𝟿𞱱-𞲴𞴁-𞴽🀀-🀫🀰-🂓🂠-🂮🂱-🂿🃁-🃏🃑-🃵🄀-🆭🇦-🇿🈁-🈂🈐-🈻🉀-🉈🉐-🉑🉠-🉥🌀-🛗🛜-🛬🛰-🛼🜀-🝶🝻-🟙🟠-🟫🠀-🠋🠐-🡇🡐-🡙🡠-🢇🢐-🢭🢰-🢻🣀-🣁🤀-🩓🩠-🩭🩰-🩼🪀-🪉🪏-🫆🫎-🫜🫟-🫩🫰-🫸🬀-🮒🮔-🯹󠀠-󠁿
Pattern for titlecase letters as defined by Unicode
{{#invoke:Unicode data/patterns|make_pattern|module=category|value=Lt}}: DžLjNjDzᾼῌῼᾈ-ᾏᾘ-ᾟᾨ-ᾯ

-- Table of this module's public functions; it is returned at the end of the file.
local export = {}
-- List helper from Module:array; its objects are used below through
-- the :insert and :concat methods.
local Array = require "Module:array"

-- Formats a code point as a hexadecimal numeric character reference,
-- zero-padded to at least four uppercase hex digits (0x41 -> "&#x0041;").
local function numeric_character_reference(code_point)
	local reference = string.format("&#x%04X;", code_point)
	return reference
end

-- Groups every entry of a data module by the value it is tagged with.
--   data_module: table with a `singles` map (code point -> value) and a
--                `ranges` list of { low, high, value } triples.
-- Returns a table mapping each value to an Array of { low, high } pairs;
-- a single code point is stored as a range whose two ends are equal.
-- Indexing the result with a value that has no list yet creates, stores
-- and returns an empty Array for it.
function export.all_ranges_per_value(data_module)
	local function new_bucket(buckets, key)
		local bucket = Array()
		buckets[key] = bucket
		return bucket
	end

	local value_to_ranges = setmetatable({}, { __index = new_bucket })

	for code_point, value in pairs(data_module.singles) do
		value_to_ranges[value]:insert({ code_point, code_point })
	end

	for _, entry in ipairs(data_module.ranges) do
		value_to_ranges[entry[3]]:insert({ entry[1], entry[2] })
	end

	return value_to_ranges
end

-- Collects the entries of a data module that carry one particular value.
--   data_module:   table with a `singles` map (code point -> value) and a
--                  `ranges` list of { low, high, value } triples.
--   value_to_find: the value to compare against with ==.
-- Returns an Array of { low, high } pairs: matching singles first (in the
-- order pairs() yields them), then matching ranges in list order.
function export.ranges_per_value(data_module, value_to_find)
	local matches = Array()

	local function keep(low, high)
		matches:insert({ low, high })
	end

	for code_point, value in pairs(data_module.singles) do
		if value == value_to_find then
			keep(code_point, code_point)
		end
	end

	for _, entry in ipairs(data_module.ranges) do
		if entry[3] == value_to_find then
			keep(entry[1], entry[2])
		end
	end

	return matches
end

-- Sorts a list of { low, high } pairs in place, ascending by low end.
local function sort_ranges(ranges)
	local function starts_before(first, second)
		return first[1] < second[1]
	end

	table.sort(ranges, starts_before)
end

-- Builds the text that goes between [...] or [^...] in a Lua pattern or
-- regular expression.
--   ranges:   list of { low, high } code point pairs.
--   char_ref: when truthy, each code point is written as a numeric
--             character reference; otherwise as the character itself.
-- A pair whose two ends are equal yields one character; any other pair
-- yields "low-high". No escaping is applied to the characters.
local function make_pattern(ranges, char_ref)
	local function render(code_point)
		if char_ref then
			return numeric_character_reference(code_point)
		end
		return mw.ustring.char(code_point)
	end

	local pieces = Array()

	for _, range in ipairs(ranges) do
		local low, high = range[1], range[2]
		pieces:insert(render(low))
		if low ~= high then
			pieces:insert("-")
			pieces:insert(render(high))
		end
	end

	return pieces:concat()
end

-- Strips the code points U+0000-U+001F from a list of { low, high } pairs,
-- modifying the list in place.
-- Treats all characters U+0000-U+001F as invalid in wikitext, but only some are.
--   * A pair lying entirely inside U+0000-U+001F is removed.
--   * A pair that starts inside it but ends above it has its low end
--     raised to U+0020.
-- Every entry is examined, so the list does not need to be sorted and may
-- contain any number of offending pairs.
local function sanitize_ranges(ranges)
	local last_control = 0x1F

	-- Walk backwards so that table.remove does not shift entries that
	-- have not been visited yet.
	for i = #ranges, 1, -1 do
		local range = ranges[i]
		if range[1] <= last_control then
			if range[2] <= last_control then
				table.remove(ranges, i)
			else
				range[1] = last_control + 1
			end
		end
	end
end

-- Template entry point:
-- {{#invoke:Unicode data/patterns|make_pattern|module=...|value=...}}
--   frame.args.module: name of a submodule of Module:Unicode data.
--   frame.args.value:  value to look for in that submodule's singles and
--                      ranges.
-- Returns the character-set contents (literal characters, not character
-- references) covering every code point tagged with the value, with
-- U+0000-U+001F stripped out.
-- Raises an error when |module= is missing or empty, or |value= is missing.
function export.make_pattern(frame)
	local module_name = frame.args.module
	-- An empty |module= would otherwise fail later inside require with a
	-- less helpful message.
	if not module_name or module_name == "" then
		error("Provide name of submodule of Module:Unicode data in |module= parameter.")
	end

	local value = frame.args.value
	if not value then
		error("Provide value to search for in |value= parameter.")
	end

	local ranges = export.ranges_per_value(require("Module:Unicode data/" .. module_name), value)

	-- Singles are collected with pairs(), whose order is unspecified. Sorting
	-- makes the output deterministic and gives sanitize_ranges the sorted
	-- input it is documented to expect.
	sort_ranges(ranges)
	sanitize_ranges(ranges)

	return make_pattern(ranges, false)
end

-- Template entry point that lists a pattern for every value found in a
-- submodule of Module:Unicode data.
--   frame.args.module: name of the submodule.
-- Returns wikitext with one bullet per value, in the order given by
-- Module:table's sortedPairs; each pattern is written with numeric
-- character references and wrapped in <code>...</code>.
-- Raises an error when |module= is missing.
function export.show_all_patterns(frame)
	local module_name = frame.args.module
	if not module_name then
		error("Provide name of submodule of Module:Unicode data in |module=.")
	end

	local data_module = require("Module:Unicode data/" .. module_name)
	local value_to_ranges = export.all_ranges_per_value(data_module)

	for _, ranges in pairs(value_to_ranges) do
		sort_ranges(ranges)
	end

	local sorted_pairs = require("Module:table").sortedPairs
	local lines = Array()
	for value, ranges in sorted_pairs(value_to_ranges) do
		local heading = "\n* " .. value .. ": "
		lines:insert(heading .. "<code>" .. make_pattern(ranges, true) .. "</code>")
	end

	return lines:concat()
end

-- Hand the table of public functions to whoever loads this module.
return export