Jump to content

Module:User:Erutuon/scripts

From Wiktionary, the free dictionary
local export = {}

local floor = math.floor
local lookup = mw.loadData("Module:Unicode data/scripts")
local get_codepoint = mw.ustring.codepoint

-- These pairs of scripts share many characters.
-- Keys are script codes as detected from codepoints; values are related script
-- codes. find_best_script consults this table when a detected script is not
-- among the language's scripts: if the related script is among them, the
-- detected script's character count is moved to the related script.
local shares_some_characters_with = {
	Latn = "Latinx",
	Grek = "polytonic",
	Cyrl = "Cyrs",
}

-- Maps a compound script code to the list of its constituent script codes.
-- In find_best_script, when a language lists a compound script, counts for
-- its constituent scripts can be moved to the compound script (for example,
-- in Japanese: Kana -> Jpan).
local compound_scripts = {
	Jpan = { "Hani", "Hira", "Kana" },
	Kore = { "Hang", "Hani" },
}

-- What scripts wouldn't these override?
-- Maps an overriding script code (key) to the script code it overrides
-- (value). When find_best_script is given a language code and meets one of
-- these keys while adjusting the counts, any count recorded for the value
-- script is added to the key script.
local overrides = {
	Jpan = "Latn",
	Hani = "Latn",
	Kore = "Latn",
	-- ...
}

-- Keyed by a detected script code; each value lists script codes that are
-- treated as having the same characters as the key. When the detected script
-- is not among a language's scripts, find_best_script moves its count to the
-- first of the language's scripts that appears in the list (for example, in
-- Navajo: Latn -> nv-Latn).
local scripts_with_identical_characters = {
	Latn = { "Latf", "Latn", "Zyyy", "nv-Latn", "pjt-Latn" },
	Arab = {
		"Arab", "fa-Arab", "kk-Arab", "ks-Arab", "ku-Arab", "ms-Arab",
		"mzn-Arab", "ota-Arab", "pa-Arab", "ps-Arab", "sd-Arab", "tt-Arab",
		"ug-Arab", "ur-Arab",
	},
	Hani = { "Hani", "Hans", "Hant" },
}

-- Reports whether `val` is one of the sequential entries of `array`.
-- A nil `array` is treated as containing nothing, so the result is false.
local function in_array(array, val)
	if array ~= nil then
		for _, candidate in ipairs(array) do
			if candidate == val then
				return true
			end
		end
	end

	return false
end

-- Walks array1 in order and returns its first element that also occurs in
-- array2. Returns nothing when the two arrays have no element in common.
local function get_intersection(array1, array2)
	for _, candidate in ipairs(array1) do
		local is_shared = in_array(array2, candidate)
		if is_shared then
			return candidate
		end
	end
end

-- Ordering function for table.sort: a range sorts before another when its
-- first element (its lower bound) is smaller.
local function compare_range_arrays(range1, range2)
	local lower1, lower2 = range1[1], range2[1]
	return lower1 < lower2
end

--[[
	Binary search over a list of codepoint ranges: more efficient for the
	longer lists of codepoint ranges than for the shorter ones.

	ranges: array of { lower bound, upper bound, ... } tables, or nil. The
		search assumes that the ranges are sorted by lower bound and do not
		overlap. The number of ranges is taken from `ranges.length` when that
		field is present; otherwise it is computed with the `size` function
		of [Module:table].
	value: the number to locate.

	Returns the range table whose inclusive bounds contain `value`, or nil if
	`ranges` is nil or empty, or if no range contains `value`.
]]
local function binary_search(ranges, value)
	if not ranges then
		return nil
	end

	local first = 1
	-- Can't use # because table is loaded by mw.loadData.
	local last = ranges.length or require("Module:table").size(ranges)

	-- An empty list never enters the loop and falls through to `return nil`.
	while first <= last do
		local middle = floor((first + last) / 2)
		local range = ranges[middle]

		if value < range[1] then
			-- The value lies below this range: continue in the lower half.
			last = middle - 1
		elseif value <= range[2] then
			-- range[1] <= value <= range[2]: this is the matching range.
			return range
		else
			-- The value lies above this range: continue in the upper half.
			first = middle + 1
		end
	end

	return nil
end

-- Linear scan over a list of ranges, each of the form
-- { lower bound, upper bound, result }. Returns the result of the first range
-- whose inclusive bounds contain `number`. The scan stops with nil as soon as
-- it reaches a range that starts above `number`, so it assumes that the
-- ranges are in ascending order.
local function look_up_in_order(number, ranges)
	for _, range in ipairs(ranges) do
		local lower = range[1]
		if lower > number then
			return nil
		end
		if range[2] >= number then
			return range[3]
		end
	end
end

-- Ranges that have already matched a codepoint, kept in ascending order of
-- lower bound, so that another character in the same range is found without
-- a new search.
local ranges_cache = {}

local individual_lookup = lookup.individual

--[=[
	Finds the script code (if any) that is appropriate for a codepoint, using
	the data module [[Module:Unicode data/scripts]]. The data module was
	generated from the patterns in [[Module:scripts/data]] using
	[[Module:User:Erutuon/script recognition]].

	The sources below are tried in order and the first hit is returned:
	1. the table of individual characters;
	2. the cache of ranges that matched on earlier calls;
	3. the list of blocks, looked up by the index of the 4096-character
	   block that the codepoint belongs to (codepoint / 0x1000, rounded
	   down);
	4. the ranges defined for that 4096-character block; a range found here
	   is added to the cache.
	If none of them matches, returns "None".
]=]
local function codepoint_to_script(codepoint)
	local script = individual_lookup[codepoint]
	if script then
		return script
	end

	script = look_up_in_order(codepoint, ranges_cache)
	if script then
		return script
	end

	local block_index = floor(codepoint / 0x1000)

	script = look_up_in_order(block_index, lookup.blocks)
	if script then
		return script
	end

	local range = binary_search(lookup[block_index], codepoint)
	if not range then
		return "None"
	end

	-- Remember the range, then restore the ascending order that
	-- look_up_in_order relies on.
	ranges_cache[#ranges_cache + 1] = range
	table.sort(ranges_cache, compare_range_arrays)
	return range[3]
end

--[[
	Counts the characters of a string by script.

	str: the text to examine; it is traversed codepoint by codepoint with
		mw.ustring.gcodepoint.

	Returns a plain table (no metatable) that maps each script code returned
	by codepoint_to_script, including "None", to the number of codepoints of
	`str` that were assigned to it. Scripts that do not occur have no entry.
]]
function export.get_script_counts(str)
	local script_counts = {}

	for codepoint in mw.ustring.gcodepoint(str) do
		local script = codepoint_to_script(codepoint)
		-- A script seen for the first time starts from zero.
		script_counts[script] = (script_counts[script] or 0) + 1
	end

	return script_counts
end

-- Returns the entry for a language code from the language data modules. The
-- data module is chosen by the length of the code: two-character codes, then
-- three-character codes (one module per initial character), and a module of
-- exceptional codes for everything else. The function is wrapped with the
-- memoize function of [[Module:fun]].
local get_lang_data = require("Module:fun").memoize(function (lang_code)
	local code_length = #lang_code
	local data_module

	if code_length == 2 then
		data_module = "Module:languages/data/2"
	elseif code_length == 3 then
		data_module = "Module:languages/data/3/" .. lang_code:sub(1, 1)
	else
		data_module = "Module:languages/data/exceptional"
	end

	return mw.loadData(data_module)[lang_code]
end)

-- Moves the whole count of one script to another: the count stored under
-- from_script_code is added to the count under to_script_code (taken as 0 if
-- there is none yet), and the count under from_script_code is reset to 0.
local function transfer_count(script_counts, from_script_code, to_script_code)
	local moved = script_counts[from_script_code]
	local already_there = script_counts[to_script_code] or 0

	script_counts[to_script_code] = already_there + moved
	script_counts[from_script_code] = 0
end

--[[
	Chooses the script code that accounts for the most characters in `str`.

	str: the text whose characters are counted with export.get_script_counts.
	lang_code: optional language code. When given, the counts are first
		adjusted using the language's list of scripts (the `scripts` field of
		its data; a language without that field is treated as having no
		scripts):
		* a detected script that the language does not list has its count
		  moved to a related script that the language does list, chosen
		  through shares_some_characters_with, compound_scripts or
		  scripts_with_identical_characters;
		* once a compound script has a count, the counts of its constituent
		  scripts are merged into it;
		* a script listed in `overrides` absorbs the count of the script
		  that it overrides.

	Returns the script code with the greatest count, or nil when no character
	belongs to a script other than "None". If several scripts share the
	greatest count, which one is returned depends on the traversal order of
	pairs.

	Raises an error if `lang_code` is given but no data is found for it.
]]
function export.find_best_script(str, lang_code)
	local script_counts = export.get_script_counts(str)
	
	-- Show string and list of scripts.
	-- mw.log(str, table.concat(require "Module:fun".mapIter(function(value, key) return key end, pairs(script_counts)), ", "))
	
	-- Might save a little processing time.
	script_counts.None = nil
	
	if lang_code then
		local data = get_lang_data(lang_code) or error("Language code " .. lang_code .. " not recognized.")
		-- Language data without a script list must not break the ipairs
		-- calls below; treat it as an empty list.
		local scripts = data.scripts or {}
		
		-- If the language lists several compound scripts, the last one wins.
		local compound_script
		for _, script in ipairs(scripts) do
			if compound_scripts[script] then
				compound_script = script
			end
		end
		
		-- Take a snapshot of the detected scripts before moving counts
		-- around: transfer_count may create an entry for the receiving
		-- script, and adding a key to a table while it is being traversed
		-- with pairs has undefined results.
		local detected_scripts = {}
		for script in pairs(script_counts) do
			detected_scripts[#detected_scripts + 1] = script
		end
		
		local overriding
		
		for _, detected_script in ipairs(detected_scripts) do
			-- The script under which these characters end up being counted.
			local script = detected_script
			
			if not in_array(scripts, detected_script) then
				local similar_script = shares_some_characters_with[detected_script]
				local other_script
				
				-- in Ancient Greek: Grek -> polytonic
				if similar_script and in_array(scripts, similar_script) then
					other_script = similar_script
				
				-- in Japanese: Kana -> Jpan
				elseif compound_script and in_array(compound_scripts[compound_script], detected_script) then
					other_script = compound_script
				
				-- in Navajo: Latn -> nv-Latn
				elseif scripts_with_identical_characters[detected_script] then
					other_script = get_intersection(scripts, scripts_with_identical_characters[detected_script])
				end
				
				-- Transfer character count of original script to new script.
				if other_script then
					transfer_count(script_counts, detected_script, other_script)
					script = other_script
				end
			end
			
			if overrides[script] then
				overriding = script
			end
		end
	
		-- Merge what is left under constituent scripts (those that the
		-- language lists itself) into the compound script. The compound
		-- script already has an entry here, so no key is added during this
		-- traversal.
		if compound_script and script_counts[compound_script] then
			local constituent_scripts = compound_scripts[compound_script]
			for script, count in pairs(script_counts) do
				if count > 0 and in_array(constituent_scripts, script) then
					transfer_count(script_counts, script, compound_script)
				end
			end
		end
		
		if overriding then
			local overridden = overrides[overriding]
			
			if script_counts[overridden] then
				transfer_count(script_counts, overridden, overriding)
			end
		end
	end
	
	local greatest_count = 0
	local best_script
	
	-- Scripts whose count is 0 (emptied by a transfer) can never win, because
	-- the comparison is strict.
	for script, count in pairs(script_counts) do
		if count > greatest_count and script ~= "None" then
			greatest_count = count
			best_script = script
		end
	end
	
	return best_script
end

return export