Module:Hani-sortkey/sandbox
Appearance
- The following documentation is located at Module:Hani-sortkey/sandbox/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)
This module was used to develop the ideographic-description-sequence-detecting function in Module:Hani-sortkey.
- ⿰亻革 (⿰人00革00)
- ⿰亻革家 (⿰人00革00宀07)
- ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心 / ⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心 (⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00)
- ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿱苦⿲⿰⿹耳舌鼻⿱⿱平⿰惡意⿱眼⿰淨染⿰⿱女子身 (⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00⿱艸05⿲⿰⿹耳00舌00鼻00⿱⿱干02⿰心08心09⿱目06⿰水08木05⿰女03身00)
- ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心麵 / ⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心面 (⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00麥09)
Hypothetical defective examples:
- ⿰石 (⿰石00)
- ⿰石⿳女子 (⿰石00⿳女00子00)
- ⿳女子 (⿳女00子00)
-- Table of functions made available to callers; returned at the end of the file.
local export = {}
-- Namespace text of the current page; log() compares it against "Module".
local namespace = mw.title.getCurrentTitle().nsText
-- Short alias for the mw.ustring substring function used throughout this file.
local substring = mw.ustring.sub
-- Forwards its arguments to mw.log, but only when the current page's
-- namespace text is "Module"; on any other page it does nothing.
local function log(...)
	if namespace ~= "Module" then
		return
	end
	mw.log(...)
end
--[[
	Maps each ideographic description character (IDC) to the number of
	components (single characters or nested ideographic description
	sequences) that have to follow it.
]]
local IDchars = {}
do
	local byOperandCount = {
		[2] = { "⿰", "⿱", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻" },
		[3] = { "⿲", "⿳" },
	}
	for operandCount, characters in pairs(byOperandCount) do
		for _, IDchar in ipairs(characters) do
			IDchars[IDchar] = operandCount
		end
	end
	-- Not enabled; in future perhaps
	-- (https://www.unicode.org/L2/L2018/18012-irgn2273-four-new-idcs.pdf):
	--   U+2FFC = 2, U+2FFD = 2, U+2FFE = 1, U+2FFF = 1
end
--[[
	Finds where the ideographic description sequence (IDS) that starts with
	the ideographic description character (IDC) at index i ends.

	text   : the string being scanned
	IDchar : the IDC found at index i of text
	i      : index (in characters) of that IDC

	Returns the index of the last character of the IDS, or nil when any
	argument is missing, IDchar is not a known IDC, or the text ends before
	every expected component has been found. Recurses whenever a component
	is itself introduced by another IDC.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end

	-- Number of components expected after the current IDC.
	local components = IDchars[IDchar]
	if not components then
		return nil
	end

	local j = i
	for _ = 1, components do
		j = j + 1
		local char = substring(text, j, j)
		if char == "" then
			-- The text ended before this component was found.
			return nil
		elseif IDchars[char] then
			-- The component is a nested IDS; skip to its last character.
			j = findEndOfIDS(text, char, j)
			if not j then
				-- An incomplete nested IDS makes the whole sequence invalid.
				-- Stopping here also keeps a nil index from reaching the
				-- arithmetic on the next pass.
				return nil
			end
		end
	end

	-- Every expected component was found.
	return j
end
-- Not referenced by any code visible in this file.
local module_cache = {}

--[[
	Code points at which consecutive sections of the data module begin.
	Section n covers the code points from sectionStart[n] up to, but not
	including, sectionStart[n + 1]; math.huge closes the last section.
]]
local sectionStart = {
	0x3400,
	0xFA0E,
	0x20000,
	0x30000,
	math.huge
}

--[[
	Looks up the sortkey data for a single character.

	char         : a string (its first code point is used) or a code point
	               number
	returnModule : not used by this function

	Returns the five-byte record read from the matching section of
	Module:Hani-sortkey/data/sandbox, or nil when the code point lies below
	the first section (below U+3400). Raises an error when char is neither
	a string nor a number.
]]
function export.getData(char, returnModule)
	if type(char) == "string" then
		char = mw.ustring.codepoint(char)
	elseif type(char) ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end

	-- Nothing is stored for code points before the first section.
	if char < sectionStart[1] then
		return nil
	end

	local data = mw.loadData("Module:Hani-sortkey/data/sandbox")
	for k = 2, #sectionStart do
		if char < sectionStart[k] then
			-- Each code point owns five consecutive bytes of its section.
			local start = 5 * (char - sectionStart[k - 1]) + 1
			return data[k - 1]:sub(start, start + 4)
		end
	end
end
-- Lazily loaded contents of Module:Hani-sortkey/data/unsupported.
local unsupported_data

--[[
	Builds a sortkey for text.

	text : the string to build a sortkey for
	lang : not used by this function
	sc   : optional script code; when given and not one of Hani, Hans, Hant,
	       Jpan or Kore, the uppercased text is returned as is

	Characters are handled one at a time. An ideographic description
	character (IDC) that starts a complete ideographic description sequence
	(IDS) listed in Module:Hani-sortkey/data/unsupported is replaced, along
	with the rest of the IDS, by the listed sortkey. A complete IDS with no
	listed sortkey and an incomplete IDS are both tracked and logged, and
	only the IDC itself is copied to the output. Every other character is
	replaced by export.getData's result, or kept unchanged when that result
	is nil or false.
]]
function export.makeSortKey(text, lang, sc)
	local scripts = {
		Hani = true,
		Hans = true,
		Hant = true,
		Jpan = true,
		Kore = true
	}
	if sc and not scripts[sc] then
		return mw.ustring.upper(text)
	end

	local sort = {}
	-- The text never changes inside the loop, so measure it only once.
	local length = mw.ustring.len(text)
	local i = 1
	while i <= length do
		local character = substring(text, i, i)

		if IDchars[character] then
			local j = findEndOfIDS(text, character, i)
			local IDS, data
			if j then
				IDS = substring(text, i, j)
				unsupported_data = unsupported_data or mw.loadData("Module:Hani-sortkey/data/unsupported")
				data = unsupported_data[IDS]
			end

			if data then
				-- Known IDS: emit its sortkey and skip to its last character.
				table.insert(sort, data)
				i = j
			else
				if IDS then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. IDS .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. substring(text, i) .. "'")
				end
				-- Keep the IDC; what follows it is processed normally.
				table.insert(sort, character)
			end
		else
			table.insert(sort, export.getData(character) or character)
		end

		i = i + 1
	end

	return table.concat(sort)
end
return export