Module:cs-pronunciation

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module generates Czech pronunciation transcriptions for {{cs-IPA}}.

Testcases

See Module:cs-pronunciation/testcases.


local export = {}

local m_str_utils = require("Module:string utilities")
local m_syllables = require("Module:syllables")
local pron_utilities_module = "Module:pron utilities"

local lang = require("Module:languages").getByCode("cs")
local sc = require("Module:scripts").getByCode("Latn")

function export.tag_text(text, face)
	return require("Module:script utilities").tag_text(text, lang, sc, face)
end

function export.link(term, face)
	return require("Module:links").full_link(
		{ term = term, lang = lang, sc = sc }, face
		)
end

local lower = m_str_utils.lower
local pattern_escape = m_str_utils.pattern_escape
local replacement_escape = m_str_utils.replacement_escape
local rmatch = m_str_utils.match
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rsplit = m_str_utils.split
local toNFC = mw.ustring.toNFC
local U = m_str_utils.char

local long = "ː"
local nonsyllabic = U(0x32F)	-- inverted breve below
local syllabic = U(0x0329)
local syllabic_below = U(0x030D)
local raised = U(0x031D)		-- uptack below
local ringabove = U(0x030A)		-- ring above
local caron = U(0x030C)			-- combining caron
local tie = U(0x0361)			-- combining double inverted breve
local AC = U(0x0301)			-- combining acute accent
local primary_stress = "ˈ"
local secondary_stress = "ˌ"

local single_char_subs = {
	["á"] = "a" .. long,
	["c"] = "t" .. tie .. "s",
	["č"] = "t" .. tie .. "ʃ",
	["ď"] = "ɟ",
	["e"] = "ɛ",
	["é"] = "ɛ" .. long,
	["ě"] = "jɛ",
	["g"] = "ɡ",
	["h"] = "ɦ",
	["i"] = "ɪ",
	["í"] = "i" .. long,
	["ň"] = "ɲ",
	["ó"] = "o" .. long,
	["q"] = "k",
	["ř"] = "r" .. raised,
	["š"] = "ʃ",
	["t"] = "t",
	["ť"] = "c",
	["ú"] = "u" .. long,
	["ů"] = "u" .. long,
	["w"] = "v",
	["x"] = "ks",
	["y"] = "ɪ",
	["ý"] = "i" .. long,
	["ž"] = "ʒ",
	["\""] = primary_stress,
	["%"] = secondary_stress,
	["?"] = "ʔ",
}

--[[	This allows multiple-character sounds to be replaced
		with single characters to make them easier to process.	]]

local multiple_to_single = {
	["t" .. tie .. "s"			] = "ʦ",
	["t" .. tie .. "ʃ"			] = "ʧ",
	["r" .. raised .. ringabove	] = "ṙ",
	["d" .. tie .. "z"			] = "ʣ",
	["d" .. tie .. "ʒ"			] = "ʤ",
	["r" .. raised				] = "ř",
}

--[[	"voiceless" and "voiced" are obstruents only;
		sonorants are not involved in voicing assimilation.	]]

-- ʦ, ʧ, "ṙ" replace t͡s, t͡ʃ, r̝̊
local voiceless	= { "p", "t", "c", "k", "f", "s", "ʃ", "x", "ʦ", "ʧ", "ṙ", "ʔ" }
-- "ʣ", ʤ, ř replace d͡z, d͡ʒ, r̝
local voiced	= { "b", "d", "ɟ", "ɡ", "v", "z", "ʒ", "ɦ", "ʣ", "ʤ", "ř", }
local sonorants = { "m", "n", "ɲ", "r", "l", "j", }
local consonant = "[" .. table.concat(sonorants) .. "ŋ"
	.. table.concat(voiceless) .. table.concat(voiced) .. "]"
assimil_consonants = {}
assimil_consonants.voiceless = voiceless
assimil_consonants.voiced = voiced

local features = {}
local indices = {}
for index, consonant in pairs(voiceless) do
	if not features[consonant] then
		features[consonant] = {}
	end
	features[consonant]["voicing"] = "voiceless"
	indices[consonant] = index
end

for index, consonant in pairs(voiced) do
	if not features[consonant] then
		features[consonant] = {}
	end
	features[consonant]["voicing"] = "voiced"
	indices[consonant] = index
end

local short_vowel = "[aɛɪou]"
local long_vowel = "[aɛiou]" .. long
local diphthong ="[aɛo]u" .. nonsyllabic
local syllabic_consonant = "[mnrl]" .. syllabic

local written_vowel = "[aeiouyáéíóúýěů]"
local written_acute_vowel = "[áéíóúý]"
local written_acute_to_plain_vowel = {
	["á"] = "a",
	["é"] = "e",
	["í"] = "i",
	["ó"] = "o",
	["ú"] = "u",
	["ý"] = "y",
}

-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end

-- apply rsub() repeatedly until no change
local function rsub_repeatedly(term, foo, bar)
	while true do
		local new_term = rsub(term, foo, bar)
		if new_term == term then
			return term
		end
		term = new_term
	end
end

local function compose(text)
	return toNFC(text)
end

-- Canonicalize multiple spaces and remove leading and trailing spaces.
local function canon_spaces(text)
	text = rsub(text, "%s+", " ")
	text = rsub(text, "^ ", "")
	text = rsub(text, " $", "")
	return text
end

-- all but v and r̝
local causing_assimilation =
	rsub(
		"[" .. table.concat(voiceless) .. table.concat(voiced) .. "ʔ]",
		"[vř]",
		""
	)

local assimilable = "[" .. table.concat(voiceless):gsub("ʔ", "") .. table.concat(voiced) .. "]"

local function regressively_assimilate(IPA)
	IPA = rsub(
		IPA,
		"(" .. assimilable .. "+)(" .. causing_assimilation .. ")",
		function (assimilated, assimilator)
			local voicing = features[assimilator] and features[assimilator].voicing
				or error('The consonant "' .. consonant
					.. '" is not recognized by the function "regressively_assimilate".')
			return rsub(
				assimilated,
				".",
				function (consonant)
					return assimil_consonants[voicing][indices[consonant]]
				end)
				.. assimilator
			end)
	
	IPA = rsub(IPA, "smus", "zmus")
	
	return IPA	
end

local function devoice_finally(IPA)
	local obstruent = "[" .. table.concat(voiced) .. table.concat(voiceless) .. "]"
	
	IPA = rsub(
		IPA,
		"(" .. obstruent .. "+)#",
		function (final_obstruents)
			return rsub(
				final_obstruents,
				".",
				function (obstruent)
					return voiceless[indices[obstruent]]
				end)
				.. "#"
		end)
	
	return IPA
end

local function devoice_fricative_r(IPA)
	-- all but r̝̊, which is added by this function
	local voiceless = rsub("[" .. table.concat(voiceless) .. "]", "ṙ", "")
	
	-- ř represents r̝, "ṙ" represents r̝̊
	IPA = rsub(IPA, "(" .. voiceless .. ")" .. "ř", "%1ṙ")
	IPA = rsub(IPA, "ř" .. "(" .. voiceless .. ")", "ṙ%1")
	
	return IPA
end

local function syllabicize_sonorants(IPA)
	 -- all except ɲ and j
	local sonorant = rsub("[" .. table.concat(sonorants) .. "]", "[ɲj]", "")
	local obstruent = "[" .. table.concat(voiced) .. table.concat(voiceless) .. "]"
	
	-- between a consonant and an obstruent
	IPA = rsub(
		IPA,
		"(" .. consonant .. "+" .. sonorant .. ")(" .. consonant .. ")",
		"%1" .. syllabic .. "%2"
		)
	
	-- at the end of a word after an obstruent
	IPA = rsub(IPA, "(" .. obstruent .. sonorant .. ")#", "%1" .. syllabic)
	
	return IPA
end

local function assimilate_nasal(IPA)
	local velar = "[ɡk]"
	
	IPA = rsub(IPA, "n(" .. velar .. ")", "ŋ%1")
	
	return IPA
end

local function add_stress(IPA)
	local syllable_count = m_syllables.getVowels(IPA, lang)
	
	if not (rfind(IPA, " ") or rfind(IPA, primary_stress)) then
		IPA = primary_stress .. IPA
	end
	
	return IPA
end

local function syllabify(IPA)
	local syllables = {}
	
	local working_string = IPA
	
	local noninitial_cluster = rmatch(working_string, ".(" .. consonant .. consonant .. ").")
	local has_cluster = noninitial_cluster and not rfind(noninitial_cluster, "(.)%1")
	
	if not ( has_cluster or rfind(working_string, " ") ) then
		while #working_string > 0 do
			local syllable = rmatch(working_string, "^" .. consonant .. "*" .. diphthong)
				or rmatch(working_string, "^" .. consonant .. "*" .. long_vowel)
				or rmatch(working_string, "^" .. consonant .. "*" .. short_vowel)
				or rmatch(working_string, "^" .. consonant .. "*" .. syllabic_consonant)
			if syllable then
				table.insert(syllables, syllable)
				working_string = rsub(working_string, syllable, "", 1)
			elseif rfind(working_string, "^" .. consonant .. "+$")
				or rfind(working_string, primary_stress)
				then
			
				syllables[#syllables] = syllables[#syllables] .. working_string
				working_string = ""
			else
			error('The function "syllabify" could not find a syllable '
				.. 'in the IPA transcription "' .. working_string .. '".')
			end
		end
	end
	
	if #syllables > 0 then
		IPA = table.concat(syllables, ".")
	end
	
	return IPA
end

local function apply_rules(IPA)
	-- Handle consonantal prepositions: v, z.
	IPA = rsub(
		IPA,
		"(#[vz])# #(.)",
		function (preposition, initial_sound)
			if rfind(initial_sound, short_vowel) then
				return preposition .. "ʔ" .. initial_sound
			else
				return preposition .. initial_sound
			end
		end)
	
	for sound, character in pairs(multiple_to_single) do
		IPA = rsub(IPA, sound, character)
	end
	
	IPA = regressively_assimilate(IPA)
	IPA = devoice_finally(IPA)
	IPA = devoice_fricative_r(IPA)
	IPA = syllabicize_sonorants(IPA)
	IPA = assimilate_nasal(IPA)
	IPA = add_stress(IPA)
	
	for sound, character in pairs(multiple_to_single) do
		IPA = rsub(IPA, character, sound)
	end
	
	--[[	This replaces double (geminate) with single consonants,
			and changes a stop plus affricate to affricate:
			for instance, [tt͡s] to [t͡s].								]]
	IPA = rsub(IPA, "(" .. consonant .. ")%1", "%1")
	
	-- Remove # at word boundaries.
	IPA = rsub(IPA, "#", "")

	return IPA
end

function export.toIPA(text)
	local orig_respelling = text
	
	text = lower(text)

	-- convert commas and en/en dashes to IPA foot boundaries
	text = rsub_repeatedly(text, "%s*[,–—]%s*", " | ")
	-- question mark or exclamation point in the middle of a sentence -> IPA foot boundary
	text = rsub_repeatedly(text, "([^%s])%s*[!?]%s+([^%s])", "%1 | %2")
	text = rsub(text, "[!?]$", "") -- eliminate remaining punctuation

	text = canon_spaces(text)

	-- put # at word beginning and end and double ## at text/foot boundary beginning/end
	text = rsub(text, " | ", "# | #")
	text = "##" .. rsub(text, " ", "# #") .. "##"

	text = rsub(text, "^%-", "")
	text = rsub(text, "%-$", "")
	text = rsub(text, "%-", " ")
	text = rsub(text, "nn", "n") -- similar operation is applied to IPA above

	-- Handle palatalization before ě, i and í.
	text = rsub(text, "([tdn])ě", "%1" .. caron .. "e")
	text = rsub(text, "([tdn])([ií])", "%1" .. caron .. "%2")
	text = rsub(text, "mě", "mn" .. caron .. "e")
	text = compose(text) -- recompose combining caron

	-- Handle initial ex- pronounced /egz/.
	text = rsub(text, "#exh", "#egzh")
	text = rsub(text, "#ex(" .. written_vowel .. ")", "#egz%1")

	-- Initial i- and y- + vowel are pronounced like /j/. Other sequences of i/y/í/ý + vowel need an interpolated /j/.
	text = rsub(text, "#[iy](" .. written_vowel .. ")", "#j%1")
	text = rsub(text, "([iyíý])(" .. written_vowel .. ")", "%1j%2")

	text = rsub(text, "ch", "X") -- temporary substitution

	-- convert to approximate phonetic notation; FIXME: this is being done way too early
	text = rsub(text, "([oa])u", "%1u" .. nonsyllabic)
	text = rsub(text, "eu", "ɛu" .. nonsyllabic)
	text = rsub(text, "eu", "ɛu" .. nonsyllabic)
	text = rsub(text, ".", single_char_subs)

	text = rsub(text, "X", "x")

	text = apply_rules(text)
	
	return text
end

local function convert_respelling_to_original(to, pagename, whole_word)
	local from = rsub(to, "[yý]", "i"):gsub("z", "s"):gsub("%?", "")
	from = rsub(from, written_acute_vowel, written_acute_to_plain_vowel)
	local escaped_from = pattern_escape(from)
	if whole_word then
		escaped_from = "%f[%a]" .. escaped_from .. "%f[%A]"
	end
	if rfind(pagename, escaped_from) then
		return from
	end
	-- Check for partial replacement.
	escaped_from = pattern_escape(to)
	-- Replace specially-handled characters with a class matching the character and possible replacements. Order of the
	-- following substitutions is important to avoid a later substitution interfering with an earlier one.
	escaped_from = rsub(escaped_from, "[áéíóú]", function(v) return "[" .. v .. written_acute_to_plain_vowel[v] .. "]" end)
	escaped_from = "(" .. escaped_from:gsub("y", "[iy]"):gsub("ý", "[iíyý]"):gsub("z", "[sz]"):gsub("%%%?", "[?]?") .. ")"
	if whole_word then
		escaped_from = "%f[%a]" .. escaped_from .. "%f[%A]"
	end
	local match = rmatch(pagename, escaped_from)
	if match then
		if match == to then
			error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"):
				format(to, pagename))
		end
		return match
	end
	error(("Single substitution spec '%s' couldn't be matched to pagename '%s'"):format(to, pagename))
end
	

-- Given raw respelling, canonicalize it. This currently applies substitutions of the form e.g. [ny] or [th:t,nýz].
local function canonicalize(text, pagename)
	if text == "+" then
		text = pagename
	elseif rfind(text, "^%[.*%]$") then
		local subs = rsplit(rmatch(text, "^%[(.*)%]$"), ",")
		text = pagename
		local function err(msg)
			error(msg .. ": " .. text)
		end
		for _, sub in ipairs(subs) do
			local from, escaped_from, to, escaped_to, whole_word
			if rfind(sub, "^~") then
				-- whole-word match
				sub = rmatch(sub, "^~(.*)$")
				whole_word = true
			end
			if sub:find(":") then
				from, to = rmatch(sub, "^(.-):(.*)$")
			else
				to = sub
				from = convert_respelling_to_original(to, pagename, whole_word)
			end
			escaped_from = pattern_escape(from)
			if whole_word then
				escaped_from = "%f[%a]" .. escaped_from .. "%f[%A]"
			end
			escaped_to = replacement_escape(to)
			local subbed_text, nsubs = rsubn(text, escaped_from, escaped_to)
			if nsubs == 0 then
				err(("Substitution spec %s -> %s didn't match processed pagename"):format(from, to))
			elseif nsubs > 1 then
				err(("Substitution spec %s -> %s matched multiple substrings in processed pagename, add more context"):format(from, to))
			else
				text = subbed_text
			end
		end
	end

	return text
end

local function respelling_to_IPA(data)
	local respelling = canonicalize(data.respelling, data.pagename)
	local IPA = export.toIPA(respelling)
	return "[" .. IPA .. "]"
end

function export.show(frame)
	local parent_args = frame:getParent().args
	return require(pron_utilities_module).format_prons {
		lang = lang,
		respelling_to_IPA = respelling_to_IPA,
		raw_args = parent_args,
		track_module = "cs-pronunciation",
		template_default = "příklad",
	}
end

return export