Module:hu-pron

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Pronunciation module for Hungarian. See {{hu-IPA}}.

Testcases


-- Table of functions exported by this module (returned at the end of the file).
local export = {}
-- Short local aliases for the mw.ustring pattern functions used throughout.
local gsub = mw.ustring.gsub
local match = mw.ustring.match

-- Fallback substitutions for consonant clusters that have no exact entry in
-- replace_cons. export.IPA walks the stages in order 1..4 and, within a stage,
-- calls gsub(cluster, key, value) for every entry, so the keys are ustring
-- patterns (e.g. '[dt]ty'), not plain strings.
-- Entries inside one stage are visited with pairs(), whose order is
-- unspecified; entries of the same stage should therefore not rely on one
-- another having run first.
-- 'ṡ' and the ligature letters (ʦ, ʧ, ʣ, ʤ) are single-character placeholders
-- that back_replace expands after all other processing.
local replace_set = {
	-- stage 1
	[1] = {
		['c'] = 'ʦ',
		['cssz'] = 'ʧṡ', ['szsz'] = 'ṡṡ', ['tssz'] = 'ʦʦ', ['zssz'] = 'sṡ',
		['ggy'] = 'ɟɟ', ['nny'] = 'ɲɲ', ['[dt]ty'] = 'cc',
	},
	-- stage 2 (may already see placeholders produced by stage 1, e.g. 'ʦ')
	[2] = {
		['dzs'] = 'ʤ', ['ddzs'] = 'ʤʤ', ['ssz'] = 'ṡṡ', ['szs'] = 'ss',
		['tsz'] = 'ʦʦ', ['dsz'] = 'ʦʦ', ['tts'] = 'ʧʧ',
		['gysz'] = 'cṡ', ['ttysz'] = 'ʦʦ', ['ttʦ'] = 'ʦʦ',
		['ngy'] = 'ɲɟ', ['ʦszs'] = 'ʤʒ',
		['gyj'] = 'ɟɟ', ['nyj'] = 'ɲɲ', ['tyj'] = 'cc',
		['llj'] = 'jj', ['ttj'] = 'cc',
		['lr'] = 'rr',
	},
	-- stage 3
	[3] = {
		['ʦs'] = 'ʧ', ['dz'] = 'ʣ',
		['gy'] = 'ɟ', ['ly'] = 'j', ['ny'] = 'ɲ',
		['ty'] = 'c', ['lj'] = 'jj', ['nj'] = 'ɲɲ',
		['tj'] = 'cc', ['dj'] = 'ɟɟ', ['tʦ'] = 'ʦʦ',
		['dʦ'] = 'ʦʦ', ['ts'] = 'ʧʧ', ['ds'] = 'ʧʧ',
		['gys'] = 'cʃ', ['gycs'] = 'cʧ',
		['qu'] = 'kv', ['sz'] = 'ṡ', ['z#s'] = 'ʃʃ', ['zs'] = 'ʒ',
	},
	-- stage 4: remaining single letters
	[4] = {
		['s'] = 'ʃ', ['ʦʧ'] = 'ʧʧ',
		['w'] = 'v', ['x'] = 'kṡ',
	},
}

-- Exact-match conversions for whole consonant clusters. export.IPA looks up
-- each maximal run of consonant letters (and '#') here first; only when the
-- run has no entry does it fall back to the staged patterns in replace_set.
-- Keys are plain strings compared for equality, not patterns.
-- 'ṡ', ʦ, ʧ, ʣ and ʤ are single-character placeholders expanded later by
-- back_replace.
local replace_cons = {
	['c'] = 'ʦ', ['cs'] = 'ʧ', ['ccs'] = 'ʧʧ', ['cszs'] = 'ʤʒ', ['cssz'] = 'ʧṡ',
	['dc'] = 'ʦʦ', ['dj'] = 'ɟɟ', ['ds'] = 'ʧʧ', ['dsz'] = 'ʦʦ', ['dty'] = 'cc',
	['dz'] = 'ʣ', ['ddz'] = 'ʣʣ',
	['dzs'] = 'ʤ', ['ddzs'] = 'ʤʤ', ['dzssz'] = 'ʧs',
	['gy'] = 'ɟ', ['ggy'] = 'ɟɟ', ['gycs'] = 'cʧ', ['gyj'] = 'ɟɟ', ['gys'] = 'cʃ', ['gysz'] = 'cṡ',
	['lj'] = 'jj', ['llj'] = 'jj', ['lr'] = 'rr', ['llr'] = 'rr', ['ly'] = 'j',
	['ngy'] = 'ɲɟ', ['nj'] = 'ɲɲ', ['nny'] = 'ɲɲ', ['ny'] = 'ɲ', ['nyj'] = 'ɲɲ',
	-- NOTE(review): 'q' is not in the consonant class used by export.IPA to
	-- find clusters, so this entry looks unreachable from there - confirm.
	['q'] = 'k',
	['s'] = 'ʃ', ['ssz'] = 'ṡṡ', ['sz'] = 'ṡ', ['szs'] = 'ʃʃ', ['szsz'] = 'ṡṡ',
	['tc'] = 'ʦʦ', ['tj'] = 'cc', ['ts'] = 'ʧʧ', ['tssz'] = 'ʦʦ', ['tsz'] = 'ʦʦ',
	['ttc'] = 'ʦʦ', ['ttj'] = 'cc', ['tts'] = 'ʧʧ',
	['tty'] = 'cc', ['ty'] = 'c', ['tyj'] = 'cc',
	['w'] = 'v',
	['x'] = 'kṡ',
	['zs'] = 'ʒ', ['zzs'] = 'ʒʒ', ['z#s'] = 'ʃʃ', ['zssz'] = 'ʃs',
}

-- Per-character vowel conversion, applied by export.IPA with
-- gsub(text, '.', replace_vowels) after the consonant clusters have been
-- converted; characters without an entry are left unchanged.
local replace_vowels = {
	-- any 'y' still present at that point is treated as the vowel i
	['y'] = 'i',
	-- 'ȧ' and 'ë' give [a] and [e]; presumably respelling-only letters - confirm
	['a'] = 'ɒ', ['á'] = 'aː', ['ȧ'] = 'a',
	['e'] = 'ɛ', ['é'] = 'eː', ['ë'] = 'e',
	['i'] = 'i', ['í'] = 'iː',
	['o'] = 'o', ['ó'] = 'oː',
	['ö'] = 'ø', ['ő'] = 'øː',
	['u'] = 'u', ['ú'] = 'uː',
	['ü'] = 'y', ['ű'] = 'yː',
}

-- Final per-character clean-up, applied by export.IPA with
-- gsub(text, '.', back_replace) once assimilation and gemination are done.
-- It expands the single-character placeholders used internally into their
-- output form and drops the '#' separator.
local back_replace = {
	-- affricate ligatures become tie-bar sequences
	['ʦ'] = 't͡s', ['ʣ'] = 'd͡z',
	['ʧ'] = 't͡ʃ', ['ʤ'] = 'd͡ʒ',
	-- 'ṡ' is the internal stand-in for [s] (plain 's' is converted to 'ʃ')
	['ṡ'] = 's',
	-- ASCII 'g' becomes the IPA letter 'ɡ'; internal 'χ' is output as 'x'
	['g'] = 'ɡ', ['χ'] = 'x',
	['#'] = '',
}

-- Nasal place assimilation: maps a consonant to the nasal that an 'n'
-- (or 'nn') immediately before it, optionally across '#', is replaced with
-- in export.IPA. Keys are in the internal post-conversion alphabet
-- (e.g. 'c' here is the palatal stop, 'g' is not yet the IPA letter 'ɡ').
local nasal_assim = {
	['k'] = 'ŋ', ['g'] = 'ŋ',
	['c'] = 'ɲ', ['ɟ'] = 'ɲ', ['ɲ'] = 'ɲ',
	['f'] = 'ɱ', ['v'] = 'ɱ',
	['p'] = 'm', ['b'] = 'm', ['m'] = 'm',
}

-- Voiced/voiceless obstruent pairs in the internal alphabet ('ṡ' = [s]).
-- export.IPA uses these maps character-by-character on a run of obstruents
-- that precedes an obstruent of the opposite voicing, and also to decide
-- whether a '#' separator between two consonants has to be kept.
local voicing_assim = {
	-- voiced -> voiceless counterpart
	['devoicing'] = {
		['b'] = 'p', ['v'] = 'f',
		['d'] = 't', ['z'] = 'ṡ', ['ʣ'] = 'ʦ',
		['ʒ'] = 'ʃ', ['ʤ'] = 'ʧ',
		['ɟ'] = 'c', ['ʝ'] = 'ç',
		['g'] = 'k',
	},
	-- voiceless -> voiced counterpart
	['voicing'] = {
		['p'] = 'b', ['f'] = 'v',
		['t'] = 'd', ['ṡ'] = 'z', ['ʦ'] = 'ʣ',
		['ʃ'] = 'ʒ', ['ʧ'] = 'ʤ',
		['c'] = 'ɟ',
		['k'] = 'g',
	},
}

-- Set of sonorant consonants, consulted by the h-allophony step of
-- export.IPA to pick 'ɦ' for an 'h' that follows one of them. That step runs
-- before the conversion to IPA symbols, so the keys are orthographic
-- (hence the digraph 'ny').
local sonorant = {
	['m'] = true, ['n'] = true, ['ny'] = true,
	['l'] = true, ['j'] = true, ['r'] = true,
}

-- Words that do not carry their own stress mark in a phrase.
-- Each entry is {transcribed form, behaviour}. The forms are matched against
-- the text after it has been fully converted, so they are written in IPA.
-- The behaviour string selects which substitution export.IPA performs.
local unstressed_words = {
	-- definite articles liaise with the following word
	{'ɒ', 'liaise_next'}, -- def. article 'a'
	{'ɒz', 'liaise_next'}, -- def. article 'az'
	-- these particles liaise with the preceding word (when followed by a space, to avoid other words starting with these letters being involved)
	{'iʃ', 'liaise_prev'}, -- 'is'
	{'ʃɛ', 'liaise_prev'}, -- 'se'
	{'ʃɛm', 'liaise_prev'}, -- 'sem'
	{'hɒ', 'liaise_prev'}, -- 'ha'
	-- conjunctions/relative pronouns lose their stress mark (no need to liaise in either direction)
	{'vɒɟ', 'liaise_none'}, -- 'vagy'
	{'dɛ', 'liaise_none'}, -- 'de'
	{'hoɟ', 'liaise_none'}, -- 'hogy' (although it may be stressed too in certain senses)
	{'ɒmi', 'liaise_none'}, -- 'ami'
	{'ɒki', 'liaise_none'}, -- 'aki'
	{'ɒhol', 'liaise_none'}, -- 'ahol'
	{'mint', 'liaise_none'}, -- 'mint'
	{'eːʃ', 'liaise_none'}, -- 'és'
	-- these particles, which take focus, remove the stress mark on the following word
	{'nɛm', 'remove_next'}, -- 'nem'
	{'nɛ', 'remove_next'}, -- 'ne'
}

--- Convert Hungarian spelling (or a respelling) into bracketed IPA.
--
-- @param frame Either a plain string to transcribe, or a Scribunto frame.
--   With a frame, the positional arguments of the parent frame are
--   transcribed; a non-empty `phon` argument replaces them, and the title of
--   the current page is used when there is no first positional argument.
-- @return For a string argument: the transcription of that string without
--   the surrounding square brackets. For a frame: the expansion of the
--   {{IPA}} template called with "hu" followed by every bracketed
--   transcription.
function export.IPA(frame)
	local called_with_string = type(frame) == 'string'
	local args = called_with_string and { frame } or frame:getParent().args
	local result = {}

	if args['phon'] and args['phon'] ~= '' then
		args = { args['phon'] }
	end
	args = (not args[1]) and { mw.title.getCurrentTitle().text } or args

	-- Pattern fragments shared by all iterations (loop-invariant).
	local non_i, i_vowels, vowels = '([aeouëöüáéóőúű])', '([iíé])', '([aeiouëöüáéíóőúű])'
	local cons_pat = '([lmnskgpbtdrfvɲɟχczʦʧʣʤṡʃʒjhŋɱɦχçʝ])'
	local opt_cons_pat = '([lmnskgpbtdrfvɲɟχczʦʧʣʤṡʃʒjhŋɱɦχçʝ]?)'

	for _, text in ipairs(args) do
		text = mw.ustring.lower(text)

		-- when the input contains exactly one space, that space is turned
		-- into the '#' separator
		if mw.ustring.len(gsub(text, '[^ ]', '')) == 1 then
			text = gsub(text, ' ', '#')
		end

		-- j-allophony
		text = gsub(text, '([fkp])j$', '%1ç')
		text = gsub(text, '([bvgrm])j(#?)(.?)', function(prev, sep, succ)
			-- 'ʝ' unless the next character is a vowel
			local glide = (succ == '' or not match(succ, vowels)) and 'ʝ' or 'j'
			return prev .. glide .. sep .. succ
		end)

		-- h-allophony (handled word by word so that '$' means end of word)
		local post_conv = {}
		for word in mw.text.gsplit(text, " ", true) do
			word = gsub(word, 'ch$', 'χχ')
			word = gsub(word, 'ch#', 'χχ#')
			word = gsub(word, '(.)c(h[eoö]z)$', '%1c#%2')
			word = gsub(word, vowels .. 'hh' .. vowels, '%1χχ%2')
			word = gsub(word, vowels .. 'cch' .. vowels, '%1χχ%2')
			word = gsub(word, 'ch', 'h')

			word = gsub(word, '(.?)(.?)h(.?)', function(penul, prev, succ)
				-- with only one character before 'h' the pattern puts it in
				-- the first capture; move it to `prev`
				if prev == '' and penul ~= '' then
					prev, penul = penul, ''
				end
				if succ == '' or match(succ, '[bcdfghjklmnprstvwxyz]') then
					return penul .. prev .. 'χ' .. succ
				-- Parentheses spell out the grouping Lua's precedence already
				-- gave the original expression (`and` binds tighter than `or`).
				-- NOTE(review): the sonorant test is therefore not conditioned
				-- on `succ` being a vowel - confirm that this is intended.
				elseif (match(succ, vowels) and match(prev, vowels) and prev ~= succ)
					or sonorant[prev] or sonorant[penul .. prev] then
					return penul .. prev .. 'ɦ' .. succ
				else
					return penul .. prev .. 'h' .. succ
				end
			end)

			table.insert(post_conv, word)
		end
		text = table.concat(post_conv, " ")

		-- adding hiatus 'j'
		text = gsub(text, non_i .. i_vowels, '%1j%2')
		text = gsub(text, i_vowels .. non_i, '%1j%2')

		-- converting to IPA symbols: exact cluster lookup first, staged
		-- pattern substitutions as the fallback
		text = gsub(text, '([bcdfghjklmnprstvwxyz#]+)', function(cons_clus)
			if replace_cons[cons_clus] then
				return replace_cons[cons_clus]
			end
			for i = 1, 4 do
				for source, replace in pairs(replace_set[i]) do
					cons_clus = gsub(cons_clus, source, replace)
				end
			end
			return cons_clus
		end)

		text = gsub(text, 'qu', 'kv')
		text = gsub(text, '.', replace_vowels)

		-- adding stress marks to words (skipped when the text starts with '-')
		text = match(text, '^[^-]') and ('ˈ' .. gsub(text, ' ', ' ˈ')) or text

		-- word boundaries
		text = gsub(text, '[,-]', '')

		-- nasal assimilation
		text = gsub(text, 'n(n?)(#?)([kgcɟɲfvpbm])', function(repet, sep, cons)
			return nasal_assim[cons] .. (repet ~= "" and nasal_assim[cons] or "") .. sep .. cons
		end)

		text = gsub(text, 'm(#?)([fv])', 'ɱ%1%2')

		-- drop the '#' separator unless it stands between the two members of
		-- a voiced/voiceless pair
		text = gsub(text, '(.)#(.)', function(prev, succ)
			local unrelated = voicing_assim['devoicing'][prev] ~= succ
				and voicing_assim['voicing'][prev] ~= succ
			return unrelated and prev .. succ or prev .. '#' .. succ
		end)

		-- voicing and devoicing assimilations
		text = gsub(text, '([bvdzʣʒʤɟʝg]+)(#?[pftṡʦʃʧckh])', function(prev_cons, next_cons)
			return gsub(prev_cons, '.', voicing_assim['devoicing']) .. next_cons
		end)

		text = gsub(text, '([pftṡʦʃʧck]+)(#?[bdzʣʒʤɟg])', function(prev_cons, next_cons)
			return gsub(prev_cons, '.', voicing_assim['voicing']) .. next_cons
		end)

		-- geminate notation
		text = gsub(text, cons_pat .. '%1%1', '%1ː')
		text = gsub(text, cons_pat .. '(#?)%1', '%1ː%2')

		-- degemination when preceded or followed by a consonant
		text = gsub(text, opt_cons_pat .. cons_pat .. 'ː' .. opt_cons_pat, function(prev_cons, gem_cons, next_cons)
			local has_neighbour = (prev_cons .. next_cons) ~= ""
			return prev_cons .. gem_cons .. (has_neighbour and '' or 'ː') .. next_cons
		end)

		-- back-replacing special characters
		text = gsub(text, '.', back_replace)

		-- dealing with unstressed particles and their liaison behavior
		for _, word_info in ipairs(unstressed_words) do
			local word, liaison_type = word_info[1], word_info[2]

			if liaison_type == 'liaise_next' then
				-- drop the stress mark and the following space
				text = gsub(text, 'ˈ' .. word .. ' ', word)
			elseif liaison_type == 'liaise_prev' then
				-- NOTE(review): currently the same substitution as
				-- 'liaise_none' (only the stress mark is dropped) - confirm
				-- whether the preceding space should be removed as well.
				text = gsub(text, 'ˈ' .. word .. ' ', word .. ' ')
			elseif liaison_type == 'liaise_none' then
				text = gsub(text, 'ˈ' .. word .. ' ', word .. ' ')
			elseif liaison_type == 'remove_next' then
				text = gsub(text, word .. ' ˈ', word .. ' ')
			end
		end

		-- removing the primary stress mark if another such mark is manually supplied
		text = gsub(text, 'ˈˈ', 'ˈ')

		-- likewise if a secondary stress mark is manually supplied
		text = gsub(text, 'ˈˌ', 'ˌ')

		-- adding a space before primary and secondary stress marks in case this space is missing
		-- (returning nothing for position 1 leaves a leading mark untouched)
		text = gsub(text, '()([ˈˌ])', function(position, stress)
			if position ~= 1 then
				return ' ' .. stress
			end
		end)

		-- replacing any double spaces (created by the previous command) with single ones
		text = gsub(text, '  ', ' ')

		table.insert(result, '[' .. text .. ']')
	end

	if called_with_string then
		-- Return the bare transcription. This must read the transcription
		-- itself: the language code "hu" is only prepended for the template
		-- call below, so it must not be what gets stripped and returned.
		local transcription = result[1]
		return mw.ustring.sub(transcription, 2, mw.ustring.len(transcription) - 1)
	end

	table.insert(result, 1, "hu")
	return frame:expandTemplate{ title = "IPA", args = result }
end

return export