-- Module:amf-utilities
--
-- From Wiktionary, the free dictionary
--


local export = {}

-- Internal encoding: each special letter or digraph of the orthography is
-- mapped to a single ASCII capital, so the encoded text uses only [a-zA-Z].
export.encode = {
	-- digraphs
	["cʼ"] = "C",
	["sh"] = "S",
	["tʼ"] = "T",
	-- single letters
	["ã"] = "A",
	["ɓ"] = "B",
	["ɗ"] = "D",
	["ɛ"] = "E",
	["ɠ"] = "G",
	["ĩ"] = "I",
	["ɲ"] = "N",
	["ɔ"] = "O",
	["ʔ"] = "Q",
}
-- Reverse mapping of export.encode: internal ASCII capital -> orthographic
-- letter or digraph.
export.decode = {
	["A"] = "ã",
	["B"] = "ɓ",
	["C"] = "cʼ",
	["D"] = "ɗ",
	["E"] = "ɛ",
	["G"] = "ɠ",
	["I"] = "ĩ",
	["N"] = "ɲ",
	["O"] = "ɔ",
	["S"] = "sh",
	["T"] = "tʼ",
	["Q"] = "ʔ",
}
-- Replacement text for each internal ASCII capital when building a sort key
-- (looked up by export.makeSortKey).
export.sortkey = {
	["A"] = "a",
	["B"] = "b",
	["C"] = "c",
	["D"] = "d",
	["E"] = "e",
	["G"] = "g",
	["I"] = "i",
	["N"] = "n",
	["O"] = "o",
	["S"] = "sh",
	["T"] = "t",
	["Q"] = "ʔ",
}

-- Splits a word into syllables written in the internal encoding.
--
-- The result is a table whose array part holds the syllables (tone mark
-- removed) and which carries three extra fields:
--   accent  : index of the syllable that bore the tone mark, 0 if none
--   cap     : true if lowercasing changed the word
--   falling : true if the tone mark is the combining circumflex (U+0302)
-- Example: "Wucʼê" gives { "wu", "Ce", accent=2, cap=true, falling=true }
--
-- Raises an error if the word contains more than one tone mark, or a ʼ
-- that is not preceded by t or c.
--
-- NOTE(review): after NFD the letters ã and ĩ are presumably split into a
-- base vowel plus a combining tilde, so the two-byte lookup below likely
-- never produces "A" or "I" -- confirm.
function export.syllabify(word)
	-- combining acute (U+0301) or combining circumflex (U+0302), as UTF-8
	local tone_pattern = "\204[\129\130]"

	-- decompose first so tone marks become separate combining characters
	local decomposed = mw.ustring.toNFD(word)
	local text = decomposed:ulower()
	local has_capital = text ~= decomposed

	local tone = text:match(tone_pattern)
	local _, tone_total = text:gsub(tone_pattern, "")
	if tone_total > 1 then
		error("More than one diacritic found.")
	end

	-- digraphs first, then the remaining two-byte letters
	text = text:gsub("[tc]ʼ", export.encode)
	if text:match("ʼ") then
		error("Uncoupled ʼ found.")
	end
	text = text:gsub("sh", "S")
	text = text:gsub("[\194-\223][\128-\191]", export.encode)

	-- put a "." before every consonant+vowel pair
	text = text:gsub("[bBcCdDgGhjklmnNpqrsStwxyzQ][aeiouEO]", ".%0")
	-- separate a vowel from a following tone-marked vowel, e.g. tiá -> ti.á
	text = text:gsub("([aeiouEO])([aeiouEO]\204[\129\130])", "%1.%2")
	-- drop a leading separator and collapse runs of separators
	text = text:gsub("^%.", "")
	text = text:gsub("%.%.+", ".")

	local syllables = mw.text.split(text, ".", true)

	-- strip the tone mark and remember which syllable carried it
	local accented = 0
	for index = 1, #syllables do
		local stripped, hits = syllables[index]:gsub(tone_pattern, "")
		syllables[index] = stripped
		if hits == 1 then
			accented = index
			break
		end
	end

	syllables.accent = accented
	syllables.cap = has_capital
	syllables.falling = tone == "\204\130"
	return syllables
end

-- Rebuilds a word from a table in the shape returned by export.syllabify
-- (array of syllables plus the accent, cap and falling fields).
-- The input table is left unmodified.
function export.combine(syllables)
	local accent_index = syllables.accent
	local capitalize = syllables.cap

	-- combining circumflex for a falling tone, combining acute otherwise
	local tone_mark = "\204\129"
	if syllables.falling then
		tone_mark = "\204\130"
	end

	-- collect into a fresh buffer so the caller's table stays untouched
	local pieces = {}
	for index, syllable in ipairs(syllables) do
		if index == accent_index then
			-- tone mark goes right after the first vowel of the syllable
			syllable = syllable:gsub("[aeiouEO]", "%0" .. tone_mark, 1)
		end
		pieces[index] = syllable
	end

	local word = table.concat(pieces)
	word = word:gsub("[BCDEGNOSTQ]", export.decode)
	if capitalize then
		-- uppercase the first UTF-8 character only
		word = word:gsub("^[\1-\127\194-\255][\128-\191]*", string.uupper, 1)
	end
	return mw.ustring.toNFC(word)
end

-- Generates the sort key used for categorization.
--   wucʼê --> wuce2'
--   (2: accent on second syllable)
--   (apostrophe at the end: falling tone)
--
-- Parameters:
--   text : the text to build a sort key for; split on single spaces and
--          processed word by word
--   lang : language code; anything other than "amf" returns text unchanged
--   sc   : script code; anything other than "Latn" returns text unchanged
-- Returns the sort key string. A word that export.syllabify rejects is kept
-- as-is, and the tracking call is made for it.
function export.makeSortKey(text, lang, sc)
	if lang ~= "amf" or sc ~= "Latn" then
		require("Module:debug").track("amf-utilities/sort")
		return text
	end
	-- must be local: a bare assignment would create a global variable
	local words = mw.text.split(text, " ", true)
	for i, word in ipairs(words) do
		-- syllabify raises on malformed words, so call it protected
		local success, syllables = pcall(export.syllabify, word)
		if success then
			-- outer parentheses discard gsub's second result (the match count)
			local key = (table.concat(syllables):gsub("[BCDEGNOSTQ]", export.sortkey))
			words[i] = key
				.. syllables.accent
				.. (syllables.falling and "'" or "")
		else
			require("Module:debug").track("amf-utilities/sort")
		end
	end
	return table.concat(words, " ")
end

return export