-- Jump to content
--
-- Module:User:Surjection/fi-UPA
--
-- From Wiktionary, the free dictionary


-- Table of functions and values this module exposes; returned at the end of the file.
local export = {}

-- Finnish pronunciation helper module; this file reads `sep_symbols` and
-- calls `syllabify` on it.
local m_fi_p = require("Module:fi-pronunciation")
-- Language code passed to Module:languages when formatting the output.
local langcode = "fi"

-- Short alias for the Unicode-aware gsub.
local gsub = mw.ustring.gsub

-- Builds a string from Unicode code points.
local U = mw.ustring.char
-- Mark inserted between the two vowels of a diphthong.
local diphthong = U(0x361)
-- Mark inserted between two adjacent vowels that are not treated as a diphthong.
local hiatus = U(0x1DFC)
-- Mark appended to a plosive that is doubled across a boundary in the narrow transcription.
local unreleased = U(0x2FE)
-- Mark appended to a vowel standing between nasal consonants in the narrow transcription.
local nasalized = U(0x330)

-- Spelling sequences whose transcription differs from the spelling itself.
-- Keys are matched against the start of the remaining input (longest key wins);
-- values are what gets emitted for them.
local letters_phonemes = {
	-- single letters
	["å"] = "o",
	["q"] = "k",
	["y"] = "ü",

	-- letters and digraphs emitted as two symbols
	["x"] = "ks",
	["qu"] = "kv",
	["zz"] = "ts",
	["nk"] = "ŋk",
	["ng"] = "ŋŋ",

	-- special markers
	["*"] = "ˣ",
	["."] = "₍",
	["’"] = "₍",
}

local lookahead = 3 -- how many unstressed syllables at most in a single unit, thus max consecutive unstressed syllables

-- Length mark: replaces the second of two identical vowels.
local long = "̄"
-- Vowel symbols as they appear in the transcription (after letter conversion).
local vowels = "aeiouüäö"
-- Character class matching one transcription vowel.
local vowel = "[" .. vowels .. "]"
-- Consonant symbols as they appear in the transcription.
local consonants = "kptgbdfˀsnmŋlrhvšžrjɦχ"
-- Character class matching one transcription consonant.
local consonant = "[" .. consonants .. "]"
-- Three nested sets of diacritics: each one extends the previous set.
local diacriticsvv = long .. "̝̞̠̟̪́" .. unreleased
local diacriticsv = diacriticsvv .. diphthong .. nasalized
local diacritics = diacriticsv .. hiatus
-- Character class matching any one of the diacritics above.
local diacritic = "[" .. diacritics .. "]"

-- Extra letters accepted when classifying *spelled* (unconverted) syllables.
local spelled_consonants = "cvwxz"
local spelled_consonant = "[" .. consonants .. spelled_consonants .. "]"
local spelled_vowels = "y"
local spelled_vowel = "[" .. vowels .. spelled_vowels .. "]"

local tertiary = "ˌ" -- "tertiary stress", a weaker secondary stress (either rhythmic or in some compound words). is there a better way to represent this?
export.tertiary = tertiary

-- Character class for anything that marks a stress or part boundary at the
-- start of a syllable: space, stress marks, slash and hyphen.
local stress_indicator = "[ ˈˌ" .. tertiary .. "/-]"
-- Plosive consonants; used to decide where the unreleased mark is added.
local plosives = "kptbdg"

-- When true, stress marks are moved behind the syllable nucleus and rewritten
-- as "·" / ":" at the end of export.UPA_wordparts.
local use_UPA_stress = true
-- Character classes for stress marks. `stress_p` and `stress_pd` are built from
-- the same literals, as are `stress_s` and `stress_sd`.
-- NOTE(review): `stress_sd` is not referenced in the visible part of this file.
local stress_p = "[ˈˌ" .. tertiary .. "]"
local stress_s = "[ˌ" .. tertiary .. "]"
local stress_pd = "[ˈˌ" .. tertiary .. "]"
local stress_sd = "[ˌ" .. tertiary .. "]"

--	This adds letters_phonemes["e"] = "e", letters_phonemes["i"] = "i", etc.
-- Every character of this string is transcribed as itself.
local identity_letters = "aeiouäödhfjklmnprstuv"
for ch in mw.ustring.gmatch(identity_letters, ".") do
	letters_phonemes[ch] = ch
end

-- Patterns that find vowel pairs in the UPA transcription so that the correct
-- tie diacritic can be added. Each pattern captures the two vowels separately.

-- /_i/ diphthongs can appear in any syllable
local diphthongs_i = { "([aeouüäö])(i)" }

-- /_U/ diphthongs can appear in the initial syllable or later open syllables
-- (no consonantal coda)
local diphthongs_u = { "([aoei])(u)", "([eiäö])(ü)" }

-- rising diphthongs can only appear in the initial syllable
-- (of a word, compound word part, etc.)
local diphthongs_rising = { "(u)(o)", "(i)(e)", "(ü)(ö)" }

-- Fix-ups applied to every transcription (broad and narrow) after the
-- per-part conversion has been joined into one string.
-- @param p transcription string
-- @return the adjusted transcription string
local function apply_post_fixes(p)
	-- initial <gn> is /gn/
	-- Emit the plain "g" that `consonants` contains; the look-alike U+0261
	-- is not in that set, so the stress relocation at the end of
	-- export.UPA_wordparts would not treat it as a consonant.
	p = mw.ustring.gsub(p, "ˈŋn", "ˈgn")

	-- ŋ is short before consonant (by default)
	p = mw.ustring.gsub(p, "ŋŋ(" .. consonant .. ")", "ŋ%1")

	-- dissimilation of vowels by sandhi: when the same vowel (with the same
	-- diacritics) stands on both sides of a secondary/tertiary stress mark,
	-- insert "₍" after the first one
	p = mw.ustring.gsub(p, "(" .. vowel .. diacritic .. "*" .. long .. "?)(" .. stress_s .. ")%1", "%1₍%2%1")

	return p
end

-- Fix-ups applied only to the narrow transcription, after apply_post_fixes.
-- @param p transcription string
-- @return the adjusted transcription string
local function apply_post_fixes_narrow(p)
	-- long j, v after i, u diphthong
	p = mw.ustring.gsub(p, "(" .. diphthong .. "i)j(" .. vowel .. ")", "%1j(" .. long .. ")%2")
	-- /ʋ/ after /u/ usually realized as /w/ (see Suomi, Toivanen and Ylitalo 2008, p. )
	p = mw.ustring.gsub(p, "(" .. diphthong .. "u)v(" .. vowel .. ")", "%1w(w)%2")
	-- cleanup: drop a "." that directly follows a secondary/tertiary stress mark
	p = mw.ustring.gsub(p, "(" .. stress_s .. ")%.", "%1")

	-- tautosyllabic nasals nasalize vowels between them (see Suomi, Toivanen and Ylitalo 2008, p. 22)
	p = mw.ustring.gsub(p, "([mnŋ][mnŋ]?)(" .. vowel .. ")(" .. diacritic .. "*)([mnŋ])(.?)", function (n0, nv, nvd, n1, anchor)
		-- this cannot be simplified to "(.?)" => "([^" .. vowels .. "]?)", otherwise a vowel after would match
		if not mw.ustring.find(anchor, vowel) then
			return n0 .. nv .. nasalized .. nvd .. n1 .. anchor
		end
		-- returning nothing keeps the matched text unchanged
	end)

	-- sandhi: nm > mm, np > mp, nb > mb, nk > ŋk, ng > ŋg
	-- NOTE(review): "%s-" also matches zero whitespace, so the three "‿" rules
	-- appear to pre-empt the three plain rules below them -- confirm intent.
	p = mw.ustring.gsub(p, "n%s-(" .. stress_pd .. "?%s*)([gk])", "ŋ‿%1%2")
	p = mw.ustring.gsub(p, "n%s-(" .. stress_pd .. "?%s*)([mpb])", "m‿%1%2")
	p = mw.ustring.gsub(p, "[nm]%s-(" .. stress_pd .. "?%s*)([f])", "ᴍ͔‿%1%2")
	p = mw.ustring.gsub(p, "n(" .. stress_pd .. "?%s*)([gk])", "ŋ%1%2")
	p = mw.ustring.gsub(p, "n(" .. stress_pd .. "?%s*)([mpb])", "m%1%2")
	p = mw.ustring.gsub(p, "[nm](" .. stress_pd .. "?%s*)([f])", "ᴍ͔%1%2")

	-- handle potentially long consonants over secondary stresses:
	-- move the optional "(C)" copy to the other side of the stress mark
	p = mw.ustring.gsub(p, "(" .. stress_s .. ")(" .. consonant .. diacritic .. "*)%(%2%)", "(%2)%1%2")
	p = mw.ustring.gsub(p, "(" .. consonant .. diacritic .. "*)%(%1%)(" .. stress_s .. ")", "%2%1(%1)")
	-- ŋ + stress + g becomes ŋ + stress + ŋ. Accept the plain "g" used by the
	-- rest of this module as well as the look-alike U+0261 matched originally.
	p = mw.ustring.gsub(p, "(ŋ" .. diacritic .. "*)" .. tertiary .. "[gɡ]", "%1" .. tertiary .. "ŋ")

	-- [k] allophone before front vowels (see Suomi, Toivanen and Ylitalo 2008, p. 27)
	-- This module writes the front rounded/open vowels as ü, ä, ö (see
	-- `vowels`), so they must be listed here; y, æ, ø are kept so that any
	-- input matched before is still matched.
	p = mw.ustring.gsub(p, "k([eiüäöyæø])", "k̟%1")

	return p
end

-- Tests whether a spelled syllable is "light": an optional separator symbol,
-- an optional single consonant and exactly one vowel, optionally followed by
-- "*" (which may be wrapped in parentheses).
-- @param syllable spelled syllable (compared case-insensitively)
-- @return the result of mw.ustring.find: match positions if light, nil otherwise
function export.is_light_syllable(syllable)
	local light_pattern = "^[" .. m_fi_p.sep_symbols .. "]?"
		.. spelled_consonant .. "?"
		.. spelled_vowel
		.. "%(?%*?%)?$"
	local lowered = mw.ustring.lower(syllable)
	return mw.ustring.find(lowered, light_pattern)
end

-- Reports whether any syllable from position `start` up to `start + lookahead`
-- is not light. The final syllable of `hyph` is never inspected.
-- @param hyph  sequence of spelled syllables
-- @param start index of the first syllable to inspect
-- @return true if a non-light syllable is found in that window, false otherwise
function export.has_later_heavy_syllable(hyph, start)
	local last = #hyph - 1
	if start + lookahead < last then
		last = start + lookahead
	end

	local position = start
	while position <= last do
		if not export.is_light_syllable(hyph[position]) then
			return true
		end
		position = position + 1
	end

	return false
end

-- Inserts rhythmic stress marks into a spelled word.
-- applied *before* UPA conversion
-- @param word spelled word (may already contain stress marks / separators)
-- @return the word with a tertiary stress mark in front of every syllable that
--         received rhythmic stress here, and a table filled while scanning the
--         hyphens of the intermediate string: entry k (k >= 2) is set for the
--         (k-1)-th hyphen and is true only when that hyphen was one inserted
--         by this function.
local function add_secondary_stress(word)
	-- keep_sep_symbols = true
	local hyph = m_fi_p.syllabify(word, true)
	local last_index = #hyph
	local pieces = {}
	-- whether the previous syllable ended up stressed
	local prev_stress = false

	-- a syllable may start with a separator or stress symbol
	local symbol_pattern = "^[" .. m_fi_p.sep_symbols .. "₍ˈˌ" .. tertiary .. "]"
	local stress_pattern = "^" .. stress_indicator

	-- find stressed syllables and add secondary stress before each syllable
	for index, syllable in ipairs(hyph) do
		local stressed = false
		local has_symbol = mw.ustring.find(syllable, symbol_pattern)

		if has_symbol then
			-- check if symbol indicates stress; a leading symbol that does not
			-- is treated as if there were no symbol at all
			stressed = mw.ustring.find(syllable, stress_pattern)
			has_symbol = stressed
		end

		if not stressed then
			if index == 1 then
				stressed = true
			elseif not prev_stress and index < last_index then
				-- shift stress if current syllable light and a heavy syllable occurs later (except as the last syllable)
				stressed = index == last_index - 1
					or not export.is_light_syllable(syllable)
					or not export.has_later_heavy_syllable(hyph, index + 1)
			end
		end

		-- check if next syllable already stressed
		-- if is, do not stress this syllable
		if stressed and index < last_index then
			stressed = not mw.ustring.find(hyph[index + 1], stress_pattern)
		end

		if index > 1 and stressed and not has_symbol then
			-- "-$" is a temporary marker, rewritten below
			pieces[#pieces + 1] = "-$"
		end
		pieces[#pieces + 1] = syllable

		prev_stress = stressed
	end

	local res = table.concat(pieces)

	local noninitial = {}
	local index = 1
	res = mw.ustring.gsub(res, "-([$]?)",
		function (dollar)
			index = index + 1
			noninitial[index] = #dollar > 0
			-- inserted markers become the tertiary stress mark;
			-- hyphens that were already present stay hyphens
			return #dollar > 0 and tertiary or "-"
		end)

	return res, noninitial
end

-- Marks vowel sequences in a transcription as diphthongs (tie mark) or as
-- hiatus.
-- @param UPA            transcription string
-- @param strict_initial if true, "initial syllable" means the start of the
--                       string; otherwise it means the position after any
--                       stress/boundary indicator
-- @return the transcription with tie and hiatus marks inserted
local function handle_diphthongs(UPA, strict_initial)
	local ugsub, ufind = mw.ustring.gsub, mw.ustring.find
	local tie_pair = "%1" .. diphthong .. "%2"
	local tie_after_onset = "%1%2" .. diphthong .. "%3"

	-- /_i/ diphthongs: anywhere
	for _, pattern in ipairs(diphthongs_i) do
		UPA = ugsub(UPA, pattern, tie_pair)
	end

	-- prefix that pins a vowel pair to an initial syllable: the boundary
	-- followed by any number of non-vowels
	local onset
	if strict_initial then
		onset = "^([^" .. vowels .. "]*)"
	else
		onset = "(" .. stress_indicator .. "[^" .. vowels .. "]*)"
	end

	-- rising diphthongs: initial syllables only
	for _, pattern in ipairs(diphthongs_rising) do
		UPA = ugsub(UPA, onset .. pattern, tie_after_onset)
	end

	-- Decides between tie and hiatus for a non-initial /_U/ pair, based on
	-- the text that follows it.
	local function join_noninitial(v1, v2, after)
		local joiner = diphthong
		if ufind(after, "^" .. consonant .. diacritic .. "*" .. vowel) then
			-- consonant after diphthong followed by a vowel: the consonant
			-- belongs to the following syllable, so this syllable is open
			joiner = diphthong
		elseif ufind(after, "^" .. consonant) then
			-- consonant after diphthong that must be in this syllable:
			-- the syllable is closed
			joiner = hiatus
		end
		-- no consonant after diphthong => open
		return v1 .. joiner .. v2 .. after
	end

	for _, pattern in ipairs(diphthongs_u) do
		-- initial syllables
		UPA = ugsub(UPA, onset .. pattern, tie_after_onset)

		-- open non-initial syllables
		UPA = ugsub(UPA, pattern .. "(.+)", join_noninitial)
		-- A pair at the very end of the string has nothing after it, so its
		-- syllable is open. "$" only anchors when it is the last character
		-- of the pattern, so it must not be wrapped in a capture.
		UPA = ugsub(UPA, pattern .. "$", tie_pair)
	end

	-- any two vowels still adjacent at this point are in hiatus
	local vowel_unit = vowel .. "[" .. diacriticsvv .. "]*"
	UPA = ugsub(UPA, "(" .. vowel_unit .. ")(" .. vowel_unit .. ")", "%1" .. hiatus .. "%2")

	return UPA
end

-- Converts one spelled word part into transcription symbols.
-- @param term        spelled word part
-- @param is_narrow   if true, also apply the narrow-only steps (allophones of
--                    h, vowel length and, when has_initial is set, diphthongs)
-- @param has_initial only read when is_narrow is true: whether diphthongs are
--                    handled here, with the part start as the initial syllable
-- @return the converted string
local function UPA_word(term, is_narrow, has_initial)
	local rest = term
	local phonemes = {}

	while mw.ustring.len(rest) > 0 do
		-- Find the longest string of letters that matches a recognised sequence in the list
		local longestmatch = ""

		for letter in pairs(letters_phonemes) do
			local letter_len = mw.ustring.len(letter)
			if letter_len > mw.ustring.len(longestmatch) and mw.ustring.sub(rest, 1, letter_len) == letter then
				longestmatch = letter
			end
		end

		-- Convert the string to UPA
		if mw.ustring.len(longestmatch) > 0 then
			table.insert(phonemes, letters_phonemes[longestmatch])
			rest = mw.ustring.sub(rest, mw.ustring.len(longestmatch) + 1)
		else
			-- unknown character: copy it through unchanged
			table.insert(phonemes, mw.ustring.sub(rest, 1, 1))
			rest = mw.ustring.sub(rest, 2)
		end
	end

	local result = table.concat(phonemes)

	if is_narrow then
		-- articulation of h (Suomi, Toivanen & Ylitalo 2008, p. 28)
		result = mw.ustring.gsub(result, "(.?)h(.?)",
			function (before, after)
				-- `before` / `after` are arbitrary single characters, so they
				-- are looked up with plain (non-pattern) find: characters such
				-- as "(", "[" or "%" would otherwise be read as pattern syntax.
				local h
				if after ~= "" and after ~= "h" then
					if before ~= "" and vowels:find(before, 1, true) then
						if consonants:find(after, 1, true) then
							-- vihma, yhtiö
							if before == "i" or before == "ü" then
								h = "h́"
							-- mahti, kohme, tuhka
							elseif before == "a" or before == "o" or before == "u" then
								h = "χ"
							end
						-- maha
						elseif vowels:find(after, 1, true) then
							h = "ɦ"
						end
					end
				end

				if h then
					return before .. h .. after
				end
				-- returning nothing keeps the matched text unchanged
			end)

		-- double letter replacement and diphthongs must be handled earlier here
		result = mw.ustring.gsub(result, "(" .. vowel .. ")%1", "%1" .. long)
		if has_initial then
			result = handle_diphthongs(result, true)
		end
	end

	return result
end

-- Transcribes a whole term: splits it into parts at hyphens (and slashes),
-- converts each part, joins them with stress marks and applies the fix-ups.
-- @param term      spelled term; "-" and "/" separate parts, a leading or
--                  trailing "-" marks a suffix or prefix
-- @param is_narrow true for the narrow transcription, false for the broad one
-- @return the transcription string
function export.UPA_wordparts(term, is_narrow)
	term = mw.ustring.lower(term)
	local notinitial = {} -- true if the component is not an initial component
	local hyphenstress = "ˌ" -- secondary by default

	if mw.ustring.find(term, "%/") then
		hyphenstress = tertiary -- tertiary if we have slashes
	end

	if is_narrow then
		term, notinitial = add_secondary_stress(term)
	end

	-- strip the affix hyphens now; they are put back at the end
	local found
	term, found = mw.ustring.gsub(term, "^%-+", "")
	local is_suffix = found > 0
	term, found = mw.ustring.gsub(term, "%-+$", "")
	local is_prefix = found > 0

	-- make sure we keep slashes to figure out if secondary or tertiary
	term = mw.ustring.gsub(term, "%/", "-%1")
	local wordparts = mw.text.split(term, "-", true)

	for key, val in ipairs(wordparts) do
		local stress = key > 1 and hyphenstress or "ˈ"
		local part = val

		if mw.ustring.find(part, "^%/") then
			stress = "ˌ" -- always secondary
			part = part:sub(2) -- "/" is a single byte
		end

		wordparts[key] = stress .. UPA_word(part, is_narrow, not notinitial[key])
	end

	-- kept local: a global here would leak between calls and callers
	local UPA = table.concat(wordparts, "")

	if is_narrow then
		-- handle * in narrow transcription
		UPA = mw.ustring.gsub(UPA, "ˣ(%)?%s*" .. stress_p .. "?)((.?)" .. diacritic .. "*)",
			function (post, after, potential_consonant)
				-- post: optional ")", spaces and stress mark after the ˣ
				-- after: the next character plus its diacritics
				-- potential_consonant: that next character alone
				if potential_consonant == "" then
					-- nothing follows: glottal stop
					if mw.ustring.find(post, "^%)") then
						return "ˀ" .. post .. after
					else
						return post .. "(ˀ)" .. after
					end
				elseif consonants:find(potential_consonant, 1, true) then
					-- plain find: the character may be pattern syntax
					-- consonant follows: double it
					if #post > 0 then
						local amark = ""
						if plosives:find(mw.ustring.sub(after, 1, 1), 1, true) then
							amark = unreleased
						end
						return after .. amark .. post .. after
					else
						return post .. after .. after
					end
				else
					-- anything else follows: glottal stop
					return post .. "ˀ" .. after
				end
			end)
	else
		--	Replace double vowel letters with single letter plus length sign.
		UPA = mw.ustring.gsub(UPA, "(" .. vowel .. ")%1", "%1" .. long)
		UPA = handle_diphthongs(UPA, false)
	end

	UPA = apply_post_fixes(UPA)

	if is_narrow then
		UPA = apply_post_fixes_narrow(UPA)
	end

	if is_prefix then
		UPA = UPA .. "-"
	end
	if is_suffix then
		UPA = "-" .. UPA
	end

	if use_UPA_stress then
		-- move each stress mark behind the consonants and vowel nucleus that follow it
		UPA = mw.ustring.gsub(UPA, "([ˈˌ])([" .. consonants .. diacriticsv .. "]-)([" .. vowels .. diacriticsv .. "]+)", "%2%3%1")
		-- drop a hiatus mark left directly after a moved stress mark
		UPA = mw.ustring.gsub(UPA, "([ˈˌ])" .. hiatus, "%1")
		-- rewrite the stress marks as the symbols used in the output
		UPA = mw.ustring.gsub(UPA, "ˈ", "·")
		UPA = mw.ustring.gsub(UPA, "ˌ", ":")
	end

	return UPA
end

-- Entry point: produces the formatted broad and narrow transcriptions.
-- @param term either the spelled term, or a table whose getParent().args[1]
--             supplies it. A missing term means the current page title;
--             "*" means the page title followed by "*".
-- @return whatever format_UPA_full of Module:User:Surjection/UPA returns for
--         the two transcriptions
function export.UPA(term)
	if type(term) == "table" then
		term = term:getParent().args[1]
	end

	local title = mw.title.getCurrentTitle().text

	if not term then
		term = title
	elseif term == "*" then
		term = title .. "*"
	end

	-- locals, so that nothing leaks into the global environment
	local narrow = export.UPA_wordparts(term, true)
	local broad = export.UPA_wordparts(term, false)

	return require("Module:User:Surjection/UPA").format_UPA_full(
		require("Module:languages").getByCode(langcode),
		{{pron = broad, phonetic = false}, {pron = narrow, phonetic = true}})
end

return export