Module:User:Erutuon/scripts
Appearance
- The following documentation is located at Module:User:Erutuon/scripts/documentation. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
-- Table of functions made available to users of this module; returned at the
-- end of the file.
local export = {}
-- Local alias for math.floor.
local floor = math.floor
-- Script lookup data. This module reads its `individual` and `blocks` fields
-- and indexes it by numeric block index (codepoint divided by 0x1000).
local lookup = mw.loadData("Module:Unicode data/scripts")
-- NOTE(review): get_codepoint is not referenced in the code visible here;
-- confirm that nothing else needs it before removing.
local get_codepoint = mw.ustring.codepoint
-- These pairs of scripts share many characters.
-- When a counted script (key) is not one of the language's scripts but the
-- paired script (value) is, find_best_script moves the count to the paired
-- script.
local shares_some_characters_with = {
Latn = "Latinx",
Grek = "polytonic",
Cyrl = "Cyrs",
}
-- Maps a compound script code to the list of constituent script codes whose
-- character counts find_best_script merges into the compound script.
local compound_scripts = {
Jpan = { "Hani", "Hira", "Kana" },
Kore = { "Hang", "Hani" },
}
-- What scripts wouldn't these override?
-- When a key script ends up among the counted scripts, find_best_script adds
-- the count of the value script to the key script's count.
local overrides = {
Jpan = "Latn",
Hani = "Latn",
Kore = "Latn",
-- ...
}
-- Maps a script code to a list of script codes treated as using the same
-- characters. When the key script is counted but is not one of the language's
-- scripts, the count goes to the first of the language's scripts that appears
-- in the list.
local scripts_with_identical_characters = {
Latn = { "Latf", "Latn", "Zyyy", "nv-Latn", "pjt-Latn" },
Arab = {
"Arab", "fa-Arab", "kk-Arab", "ks-Arab", "ku-Arab", "ms-Arab",
"mzn-Arab", "ota-Arab", "pa-Arab", "ps-Arab", "sd-Arab", "tt-Arab",
"ug-Arab", "ur-Arab",
},
Hani = { "Hani", "Hans", "Hant" },
}
-- Reports whether `val` occurs among the sequential elements of `array`.
-- A nil `array` is treated as empty, so the answer is false.
local function in_array(array, val)
	if array ~= nil then
		for _, element in ipairs(array) do
			if element == val then
				return true
			end
		end
	end
	return false
end
-- Walks array1 in order and returns its first element that also occurs in
-- array2. Returns nothing when the two arrays have no element in common.
local function get_intersection(array1, array2)
	for _, candidate in ipairs(array1) do
		local is_shared = in_array(array2, candidate)
		if is_shared then
			return candidate
		end
	end
end
-- Ordering function for table.sort: a range sorts before another when its
-- first element (the lower bound) is smaller.
local function compare_range_arrays(range1, range2)
	local lower1, lower2 = range1[1], range2[1]
	return lower1 < lower2
end
--[[
Binary search: more efficient for the longer lists of codepoint ranges than
for the shorter ones.

Parameters:
  ranges  List of ranges, each an array whose first two elements are the
          lower and upper bound (both inclusive). The search assumes the
          list is sorted by lower bound and that the ranges do not overlap.
          May carry a `length` field giving the number of ranges; otherwise
          the size is obtained from Module:table. May be nil.
  value   The number to look for.

Returns the range (the array itself, not its index) that contains `value`,
or nil if `ranges` is nil or empty or no range contains `value`.
]]
local function binary_search(ranges, value)
	if not ranges then
		return nil
	end
	local iStart = 1
	-- Can't use # because table is loaded by mw.loadData.
	local iEnd = ranges.length or require("Module:table").size(ranges)
	if iEnd == 0 then
		return nil
	end
	while iStart <= iEnd do
		-- Look at the range in the middle of the remaining interval.
		local iMid = floor((iStart + iEnd) / 2)
		local range = ranges[iMid]
		if range[1] > value then
			-- The value lies below this range: continue in the lower half.
			iEnd = iMid - 1
		elseif value <= range[2] then
			-- The value lies inside this range. Assumes there are no
			-- duplicates.
			return range
		else
			-- The value lies above this range: continue in the upper half.
			iStart = iMid + 1
		end
	end
	return nil
end
-- Linear scan through `ranges`, a list of { lower, upper, value } arrays that
-- is expected to be sorted by lower bound. Returns the third element of the
-- first range containing `number`. The scan stops with nil as soon as a range
-- starts above `number`; nothing is returned if the list is exhausted.
local function look_up_in_order(number, ranges)
	for _, entry in ipairs(ranges) do
		if number < entry[1] then
			return nil
		end
		if number <= entry[2] then
			return entry[3]
		end
	end
end
-- Ranges that have already matched a codepoint are kept here, sorted by lower
-- bound, so that later characters in the same range are found without
-- another search of the data module.
local ranges_cache = {}
local individual_lookup = lookup.individual
--[=[
	Takes a codepoint and finds the script code (if any) that is appropriate for
	it, using the data module [[Module:Unicode data/scripts]]. The data module
	was generated from the patterns in [[Module:scripts/data]] using
	[[Module:User:Erutuon/script recognition]].

	The lookups are tried in this order:
	  1. the list of individual characters;
	  2. the cache of ranges matched earlier;
	  3. the list of whole 4096-character blocks;
	  4. the ranges defined for the 4096-character block that the codepoint
	     belongs to (binary search).
	Returns the script code from the first lookup that succeeds, else returns
	"None".
]=]
local function codepoint_to_script(codepoint)
	local direct_match = individual_lookup[codepoint]
	if direct_match then
		return direct_match
	end

	local cached_script = look_up_in_order(codepoint, ranges_cache)
	if cached_script then
		return cached_script
	end

	local block_index = floor(codepoint / 0x1000)
	local block_script = look_up_in_order(block_index, lookup.blocks)
	if block_script then
		return block_script
	end

	local matching_range = binary_search(lookup[block_index], codepoint)
	if not matching_range then
		return "None"
	end

	-- Remember the range and keep the cache sorted, because
	-- look_up_in_order stops at the first range that starts too high.
	table.insert(ranges_cache, matching_range)
	table.sort(ranges_cache, compare_range_arrays)
	return matching_range[3]
end
-- Counts the codepoints of `str` by script. Returns a plain table that maps
-- each script code produced by codepoint_to_script (including "None" for
-- unmatched codepoints) to the number of codepoints assigned to it. Scripts
-- that do not occur in `str` have no entry.
function export.get_script_counts(str)
	local script_counts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = codepoint_to_script(codepoint)
		-- A script seen for the first time starts from zero.
		local seen_so_far = script_counts[script] or 0
		script_counts[script] = seen_so_far + 1
	end
	return script_counts
end
-- Fetches the data table of a language code from the language data module
-- that matches the length of the code: three-letter codes are split into
-- modules by first letter, two-letter codes share one module, and everything
-- else is looked up in the "exceptional" module. Yields nil when the module
-- has no entry for the code. The function is wrapped with memoize from
-- Module:fun (presumably caching results per code; verify in that module).
local get_lang_data = require("Module:fun").memoize(function (lang_code)
	local code_length = #lang_code
	local data_module
	if code_length == 3 then
		data_module = "Module:languages/data/3/" .. lang_code:sub(1, 1)
	elseif code_length == 2 then
		data_module = "Module:languages/data/2"
	else
		data_module = "Module:languages/data/exceptional"
	end
	return mw.loadData(data_module)[lang_code]
end)
-- Moves the character count of one script onto another: the count stored
-- under from_script_code is added to the count under to_script_code (taken
-- as 0 if absent), and the count under from_script_code is reset to 0.
local function transfer_count(script_counts, from_script_code, to_script_code)
	local moved = script_counts[from_script_code]
	local combined = (script_counts[to_script_code] or 0) + moved
	-- Both values are computed before either field is written.
	script_counts[to_script_code] = combined
	script_counts[from_script_code] = 0
end
--[[
Finds the script code that accounts for the most characters in a string.

Parameters:
  str        The text to examine.
  lang_code  Optional language code. When given, the character counts are
             first adjusted using the language's list of scripts:
               * a counted script that is not one of the language's scripts
                 hands its count to a related script that is (a script that
                 shares characters with it, the language's compound script,
                 or a script with identical characters);
               * if the compound script has a count, the counts of its
                 constituent scripts are merged into it;
               * a script listed in `overrides` absorbs the count of the
                 script that it overrides.

Returns the script code with the greatest count, or nil if no script has a
count above zero. Characters counted as "None" are ignored. Ties are resolved
by the iteration order of pairs, which is unspecified.

Raises an error if lang_code is given but no language data is found for it.
]]
function export.find_best_script(str, lang_code)
	local script_counts = export.get_script_counts(str)
	-- Show string and list of scripts.
	-- mw.log(str, table.concat(require "Module:fun".mapIter(function(value, key) return key end, pairs(script_counts)), ", "))
	-- Might save a little processing time.
	script_counts.None = nil

	if lang_code then
		local data = get_lang_data(lang_code) or error("Language code " .. lang_code .. " not recognized.")
		-- A language without a list of scripts is treated as having none,
		-- rather than letting ipairs fail on nil.
		local scripts = data.scripts or {}

		-- If several of the language's scripts are compound scripts, the
		-- last one in the list is used.
		local compound_script
		for _, script in ipairs(scripts) do
			if compound_scripts[script] then
				compound_script = script
			end
		end

		-- Take a snapshot of the counted scripts before changing anything:
		-- transfer_count can create new keys in script_counts, and adding
		-- keys to a table while pairs is traversing it has undefined
		-- behavior (entries may be skipped or visited twice).
		local counted_scripts = {}
		for script in pairs(script_counts) do
			counted_scripts[#counted_scripts + 1] = script
		end

		local overriding
		for _, counted_script in ipairs(counted_scripts) do
			local script = counted_script
			if not in_array(scripts, script) then
				local similar_script = shares_some_characters_with[script]
				local other_script
				if similar_script and in_array(scripts, similar_script) then
					-- in Ancient Greek: Grek -> polytonic
					other_script = similar_script
				elseif compound_script and in_array(compound_scripts[compound_script], script) then
					-- in Japanese: Kana -> Jpan
					other_script = compound_script
				elseif scripts_with_identical_characters[script] then
					-- in Navajo: Latn -> nv-Latn
					other_script = get_intersection(scripts, scripts_with_identical_characters[script])
				end
				-- Transfer character count of original script to new script.
				if other_script then
					transfer_count(script_counts, script, other_script)
					script = other_script
				end
			end
			if overrides[script] then
				overriding = script
			end
		end

		-- Only existing keys are assigned to in this loop, which is safe
		-- during a pairs traversal.
		if compound_script and script_counts[compound_script] then
			local constituent_scripts = compound_scripts[compound_script]
			for script, count in pairs(script_counts) do
				if count > 0 and in_array(constituent_scripts, script) then
					transfer_count(script_counts, script, compound_script)
				end
			end
		end

		if overriding then
			local overridden = overrides[overriding]
			if script_counts[overridden] then
				transfer_count(script_counts, overridden, overriding)
			end
		end
	end

	-- Pick the script with the strictly greatest count.
	local greatest_count = 0
	local best_script
	for script, count in pairs(script_counts) do
		if count > greatest_count and script ~= "None" then
			greatest_count = count
			best_script = script
		end
	end
	return best_script
end
-- Hand the table of exported functions to whatever loads this module.
return export