Jump to content

Module:category tree/poscatboiler/data/scripts/blocks

From Wiktionary, the free dictionary

Creates a list of the Unicode blocks in which a script's characters are found. This list is displayed on category pages, such as the category for the Latin script.


local m_str_utils = require("Module:string utilities")

local concat = table.concat
local cp = m_str_utils.codepoint
local floor = math.floor
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local insert = table.insert
local sort = table.sort
local u = m_str_utils.char

local export = {}

local fun = require "Module:fun"
local Array = require "Module:array"

--- Ordering predicate for script codes, suitable for table.sort.
-- Codes that are exactly four bytes long come before all other codes; within
-- each of those two groups the codes are compared with the `<` operator.
-- @param code1 first script code (string)
-- @param code2 second script code (string)
-- @return true if code1 should be placed before code2
local function compare_script_codes(code1, code2)
	local first_is_four = #code1 == 4
	local second_is_four = #code2 == 4
	
	-- Exactly one of the two is a four-letter code: that one goes first.
	if first_is_four ~= second_is_four then
		return first_is_four
	end
	
	-- Both belong to the same group: plain string comparison.
	return code1 < code2
end

--- Sorts a list of script codes in place using compare_script_codes.
-- @param script_codes array of script codes; it is reordered in place
-- @return the same table that was passed in, now sorted
local function sort_scripts(script_codes)
	table.sort(script_codes, compare_script_codes)
	return script_codes
end

local block_data = require "Module:Unicode data/blocks"

-- Store each block's own index within block_data in its fourth slot, so that
-- a block table can later be mapped back to its position in the array.
for position, block in ipairs(block_data) do
	block[4] = position
end

--- Finds the entry of block_data whose range contains a codepoint.
-- Uses a binary search rather than a linear scan, so that high codepoints do
-- not require walking the whole table.
-- @param codepoint number to look up
-- @return the block table whose [1]..[2] range contains the codepoint
-- Raises an error if no entry contains the codepoint.
local function binary_lookup_block(codepoint)
	local low = 1
	local high = block_data.length or #block_data
	
	while low <= high do
		local middle = floor((low + high) / 2)
		local candidate = block_data[middle]
		
		if codepoint < candidate[1] then
			-- Target lies below this block: continue in the lower half.
			high = middle - 1
		elseif codepoint > candidate[2] then
			-- Target lies above this block: continue in the upper half.
			low = middle + 1
		else
			return candidate
		end
	end
	
	error(("No block found for codepoint U+%04X."):format(codepoint))
end

--- Looks up the Unicode block that contains a character.
-- @param char value handed to the string-utilities `codepoint` function to
--   obtain the codepoint to look up
-- @return the matching block table from the Unicode block data: [1] and [2]
--   are the codepoint bounds used for the lookup, [3] is the name used when
--   the block is formatted, and [4] is the block's index in the data array
-- Raises an error (from binary_lookup_block) if no block contains the
-- codepoint.
function export.lookup_block(char)
	local codepoint = cp(char)
	-- binary_lookup_block either returns a block or raises an error itself,
	-- so no additional nil check is needed on its result.
	return binary_lookup_block(codepoint)
end

--- Splits a character pattern into individual characters and ranges.
-- Every "X-Y" sequence (any character, a hyphen, any character) is taken out
-- of the pattern and recorded as a range; each character left over afterwards
-- is recorded as a single.
-- @param pattern string of characters and hyphenated character ranges
-- @return singles array of single characters
-- @return ranges array of { lower, higher } pairs
function export.get_singles_and_ranges(pattern)
	local singles, ranges = {}, {}
	
	-- Record the range and delete it from the pattern.
	local function collect_range(lower, higher)
		ranges[#ranges + 1] = { lower, higher }
		return ""
	end
	local leftover = gsub(pattern, "(.)%-(.)", collect_range)
	
	for character in gmatch(leftover, ".") do
		singles[#singles + 1] = character
	end
	
	return singles, ranges
end

--- Collects the Unicode blocks touched by a character pattern.
-- Single characters contribute the block they belong to; a range contributes
-- every block from the one holding its lower end through the one holding its
-- upper end.
-- @param pattern string accepted by export.get_singles_and_ranges
-- @return list of distinct block tables, produced by Array.keysToList with a
--   comparator on each block's index ([4])
function export.get_block_arrays(pattern)
	local singles, ranges = export.get_singles_and_ranges(pattern)
	
	-- Used as a set: the block tables themselves are the keys.
	local seen = {}
	
	for i = 1, #singles do
		seen[export.lookup_block(singles[i])] = true
	end
	
	for i = 1, #ranges do
		local bounds = ranges[i]
		local first_block = export.lookup_block(bounds[1])
		local last_block = export.lookup_block(bounds[2])
		for index = first_block[4], last_block[4] do
			seen[block_data[index]] = true
		end
	end
	
	local function by_position(block1, block2)
		return block1[4] < block2[4]
	end
	
	return Array.keysToList(seen, by_position)
end

--- Formats one block as a wikilink to its appendix page plus its range.
-- @param block_array block table: [1] and [2] are printed as the hexadecimal
--   bounds of the range, [3] is used as both link target suffix and label
-- @return wikitext such as "[[Appendix:Unicode/Name|Name]] (U+XXXX&ndash;U+YYYY)"
local function format_block_info(block_array)
	local first, last, name = block_array[1], block_array[2], block_array[3]
	return string.format(
		"[[Appendix:Unicode/%s|%s]] (U+%04X&ndash;U+%04X)",
		name, name, first, last)
end

--- Renders a list of blocks as wikitext bullet items, one per line.
-- The input array is sorted in place by each block's first codepoint before
-- it is rendered.
-- @param block_arrays array of block tables (see format_block_info)
-- @param prefix optional line to place before the bullet items
-- @return the lines joined with newlines
function export.print_blocks(block_arrays, prefix)
	local function by_first_codepoint(block1, block2)
		return block1[1] < block2[1]
	end
	
	local function as_list_item(block_array)
		return "* " .. format_block_info(block_array)
	end
	
	sort(block_arrays, by_first_codepoint)
	
	local lines = fun.map(as_list_item, block_arrays)
	if prefix then
		insert(lines, 1, prefix)
	end
	
	return concat(lines, "\n")
end

--- Builds wikitext describing the Unicode blocks used by the script codes
-- that share a canonical name.
-- Script codes are read from "Module:scripts/data"; a code is included when
-- the first element of its data equals script_name and it has a `characters`
-- pattern. Codes with the same pattern are grouped together.
-- @param script_name canonical script name (string)
-- @return nil if no matching code has a `characters` pattern; a single
--   sentence if there is exactly one code and exactly one block; otherwise a
--   collapsible wikitable with one section per group of codes
-- Raises an error if script_name is not a string.
function export.print_blocks_by_canonical_name(script_name)
	if type(script_name) ~= "string" then
		error("script_name should be a string, not " .. type(script_name) .. ".")
	end
	
	-- Maps a character pattern to an Array of the script codes that use it.
	-- Reading a missing (non-nil) key creates and stores an empty Array.
	local scripts_by_pattern = {}
	setmetatable(
		scripts_by_pattern,
		{
			__index = function(self, key)
				if key == nil then
					return
				end
				local val = Array()
				self[key] = val
				return val
			end
		})
	
	-- Number of script codes that matched.
	local count = 0
	for code, data in pairs(mw.loadData("Module:scripts/data")) do
		if data[1] == script_name and data.characters then
			count = count + 1
			scripts_by_pattern[data.characters]:insert(code)
		end
	end
	
	if not next(scripts_by_pattern) then
		return nil
	end
	
	-- Keyed by the sorted Array of script codes; the value is the list of
	-- blocks for the pattern those codes share.
	local block_arrays_by_scripts = {}
	local block_count = 0
	
	-- Construct arrays of blocks and count the blocks.
	for pattern, scripts in pairs(scripts_by_pattern) do
		local array = export.get_block_arrays(pattern)
		block_arrays_by_scripts[sort_scripts(scripts)] = array
		block_count = block_count + #array
	end
	
	require("Module:debug").track{
		"scriptcatboiler/blocks/" .. count,
		"scriptcatboiler/blocks/" .. block_count
	}
	
	-- Simple case: one script code whose characters all lie in one block.
	if count == 1 and block_count == 1 then
		local scripts, block_arrays = next(block_arrays_by_scripts)
		if scripts[2] or block_arrays[2] then
			error("More than one script or more than one block. Something is wrong.")
		end
		
		return ("The characters of <code>%s</code> are found in the Unicode block %s")
			:format(scripts[1], format_block_info(block_arrays[1]))
	end
	
	-- Renders one group: a "; Block(s) in <codes>" heading line followed by
	-- the bullet list of that group's blocks.
	local function format_section(block_arrays, scripts)
		local formatted_codes = fun.map(
			function (script_code)
				return "<code>" .. script_code .. "</code>"
			end,
			scripts)
		return export.print_blocks(
			block_arrays,
			"; Block" .. (block_arrays[2] and "s" or "") .. " in "
				.. concat(formatted_codes, ", "))
	end
	
	-- Groups are visited in the order of their first script code.
	local sections = fun.mapIter(
		format_section,
		require("Module:table").sortedPairs(
			block_arrays_by_scripts,
			function(script_array1, script_array2)
				return compare_script_codes(script_array1[1], script_array2[1])
			end))
	
	return '{| class="mw-collapsible mw-collapsed wikitable" style="width: 30em;"\n|+ style="font-weight: normal;" | ' .. "'''Unicode block"
		.. (block_count > 1 and "s" or "") .. " for characters in "
		.. (count > 1 and "these scripts" or "this script") .. "'''\n|\n"
		.. concat(sections, "\n")
		.. '\n|}'
end

-- For testing: entry point that takes the script name from the first
-- argument of the frame.
function export.print_blocks_by_canonical_name_template(frame)
	local script_name = frame.args[1]
	return export.print_blocks_by_canonical_name(script_name)
end

return export