-- Module:Unicode data/make
--
-- From Wiktionary, the free dictionary.


local m_str_utils = require("Module:string utilities")

local byte = string.byte
local insert = table.insert
local set_nested = require("Module:table").setNested
local split = m_str_utils.split
local tonumber = tonumber
local tostring = tostring
local u = m_str_utils.char

local process_string = tostring
local process_decimal = tonumber

local function process_hexadecimal(v)
	return tonumber(v, 16)
end

local function process_null()
	return nil
end

local function get_return_val_keys(funcs, ...)
	if ... then
		local vals = {...}
		return vals, #vals
	end
	local vals, n = {}, #funcs
	for i = 1, n do
		insert(vals, i)
	end
	return vals, n
end

local function iterate_UnicodeData(...)
	local UnicodeData = require("Module:Unicode data/raw/UnicodeData.txt")
	
	local funcs = {
		function(v)
			if type(v) == "string" then
				return process_hexadecimal(v)
			end
			v[1], v[2] = process_hexadecimal(v[1]), process_hexadecimal(v[2])
			return v
		end,
		
		process_string,
		process_string,
		process_decimal,
		process_string,
		
		function(v)
			if v == "" then
				return
			end
			local type, start = v:match("^<(.-)> *()")
			v = split(start and v:sub(start) or v, " +")
			v.type = type
			return v
		end,
		
		process_decimal,
		process_decimal,
		
		function(v)
			if v == "" then
				return
			end
			local n, d = v:match("^(%-?%d+)/(%-?%d+)$")
			if n then
				return tonumber(n) / tonumber(d)
			end
			return tonumber(v)
		end,
		
		function(v)
			if v == "Y" then
				return true
			elseif v == "N" then
				return false
			end
		end,
		
		process_string,
		process_null,
		process_hexadecimal,
		process_hexadecimal,
		process_hexadecimal
	}
	
	local start, vals, n, line = 1, get_return_val_keys(funcs, ...)
	
	local function ordered_unpack(line, i)
		i = i or 1
		local k = vals[i]
		local ret = funcs[k](line[k])
		if i == n then
			return ret
		end
		return ret, ordered_unpack(line, i + 1)
	end
	
	local function iter(prev) -- TODO: iterate ranges
		line, start = UnicodeData:match("([^\n]+)()", start)
		if not line then
			return
		end
		line = split(line, ";")
		if prev then
			line[1] = {prev[1], line[1]}
		elseif line[2]:sub(-8, -1) == ", First>" then
			return iter(line)
		end
		return ordered_unpack(line)
	end
	
	return iter
end

local export = {}

local function compress(t, trailing)
	for k, v in pairs(t) do
		if type(v) == "table" then
			v = compress(v, true)
			t[k] = v
		end
	end
	if not trailing then
		return t
	end
	local check_v = t[128]
	for i = 129, 191 do
		if t[i] ~= check_v then
			return t
		end
	end
	return check_v
end

function export.categories()
	local output = {}
	for codepoint, category in iterate_UnicodeData(1, 3) do
		if category and type(codepoint) ~= "table" then
			local ch = u(codepoint)
			set_nested(output, category, byte(ch, 1, -1))
		end
	end
	return compress(output)
end

function export.combining_classes()
	local output = {}
	for codepoint, class in iterate_UnicodeData(1, 4) do
		if class and class ~= 0 then
			local ch = u(codepoint)
			set_nested(output, class, byte(ch, 1, -1))
		end
	end
	return compress(output)
end

return export