Jump to content

Module:de-IPA

From Wiktionary, the free dictionary

This module is experimental.
The details of its operation have not yet been fully decided upon. Do not deploy widely until the module is finished.

Testcases

[edit]

--[=====[
TODO:
* Function for pre-consonantal and final obstruent devoicing of d, g, b, s
* Function for syllable-final uvularisation of r (ɐ̯)
* Function to reduce geminates [DONE]
* List of environments which trigger the palatalisation of /x/ (liquids + non-low front vowels) [DONE]
* Function to determine if H is word initial (> /h/) or non-initial (> 0) [DONE]
* Function to put stress in general, function to check for prefixes and realign stress accordingly
* Function to convert ⟨e⟩ in unstressed syllables to ə > Function to reduce -ər to -r + "devoicing"
* Function to convert ⟨c⟩ before front vowels to /t͡s/ [DONE]
* Function to convert final ⟨-ehe⟩ as /eː/ (verbs only)
* Function to mark whether the word is Germanic or Romanic - makes a lot of exceptions
  predictable/automatable, e.g. /ɪ, ɔ, ʊ/ > /i, o, u/ for short vowels in closed syllables,
  penultimate or final stress
* Inseparable prefixes do not take stress > Stress on the 2nd syllable
** A complete list could be compiled and the process automated, instead of making the user enter the stress by hand
* Rules to determine when to make vowels short vs. long. These are usually predictable,
  but there are some exceptions; use a macron (e.g. ā ē ī) to force a long vowel,
  and a breve (e.g. ă ĕ ĭ) to force a short vowel. Below are the general rules:
  - vowels are long in an open syllable (no final consonant, e.g. bēten, hōlen)
  - vowels are also long before a single consonant (e.g. kām), as well as before
    a silent ⟨h⟩ (e.g. gēhen, zēhn)
  - vowels are short before a double (geminate) consonant (e.g. Wăsser, Mŭtter)
  - however, vowels before two unique consonants are not predictable (they can either be
    long, e.g. Mōnd, or short, e.g. Mŭnd)
  - note that a long ⟨i⟩ is usually written as ⟨ie⟩, except word-initially (e.g. Īgel)
    and the exception of short ⟨ie⟩ in vier and its derivatives (e.g. vierzehn)
  - vowels are usually long in a stressed final syllable before a single
    consonant (but with possible exceptions, e.g. '-eg')
  - unstressed syllables do not have long vowels
* Stress is usually on the first syllable, but there are some exceptions:
  - syllables with secondary stress are treated as if stressed
  - syllables directly following a known prefix (aus-, zu-, über-, ge-, etc.)
    should be treated as if stressed, whether they are actually stressed or not
  - when there's an explicit slash to separate compounds, all parts should be
    treated as if they were separate words for vowel-length purposes (e.g.
    '-tag' in 'Reichs/tag' should be long)
  - what about other unstressed syllables?
--]=====]

local export = {}

local u = require("Module:string/char")
local strutl = require("Module:string utilities")
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ucomp = mw.ustring.toNFC
local udecomp = mw.ustring.toNFD
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len

-- Thin wrapper around rsubn() for callers that only want the substituted
-- string: the substitution count that rsubn() also returns is dropped.
local function rsub(term, foo, bar)
	-- the surrounding parentheses truncate the result list to its first value
	return (rsubn(term, foo, bar))
end

-- Run the same rsub() substitution over and over until the string stops
-- changing, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- "if not empty": turn the empty string into nil and hand every other value
-- back unchanged
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end

-- Combining diacritics that may appear in a respelling, built from their
-- code points.
local AC = u(0x0301) -- acute accent
local GR = u(0x0300) -- grave accent
local MA = u(0x0304) -- macron
local BR = u(0x0306) -- breve
local DI = u(0x0308) -- diaeresis

-- The next few strings are not patterns by themselves: they are lists of
-- characters that get spliced into character classes ("[" .. x .. "]").
local stress_accent = AC .. GR -- accents that mark stress (see handle_stress() and the stress entries in `sequences`)
local length_accent = MA .. BR -- accents that force a long (macron) or short (breve) vowel
local all_accents = stress_accent .. length_accent
local front_vowel = "eiyäöü"
local back_vowel = "aou"
local vowel = front_vowel .. back_vowel .. all_accents -- vowel letters plus the accents above (the diaeresis is not included)
-- The strings below are complete, already bracketed patterns.
local vowel_stressed = "[" .. vowel .. "][" .. stress_accent .. "]" -- a vowel character immediately followed by a stress accent
local cons_c = "[^" .. vowel .. ".⁀ %-()]" -- one character that is neither a vowel/accent nor ".", "⁀", space, "-", "(" or ")"
local cons_or_boundary_c = "[^" .. vowel .. "rl. %-()]" -- includes ⁀  -- I have added /l/ & /r/ as a stopgap against Brücke -> /ˈprʏkə/, but this may need a new name.
local stress_marks = "ˈˌ" -- IPA stress marks, as produced from the stress accents by `sequences`

-- voiced stop -> voiceless counterpart; looked up by respell() when devoicing
local devoice_conv = { ["b"] = "p", ["d"] = "t", ["g"] = "k" }
-- plain vowel -> umlauted vowel; used by normalise() as the replacement table
-- when a vowel is followed by ⟨e⟩ or a combining diaeresis
local umlaut_conv = { ["a"] = "ä", ["o"] = "ö", ["u"] = "ü" }
-- Letter-to-phoneme table consulted by parse_table(). It is indexed by a
-- single character of the respelled text; characters without an entry are
-- skipped by parse_table(). An entry is one of:
--   * a string: the phoneme for that character;
--   * an array of strings: several phonemes, emitted in order;
--   * a keyed table whose string keys are multi-character sequences starting
--     with that character, mapped to a phoneme or an array of phonemes.
--     Vowel entries additionally carry [BR] (short) and [MA] (long) values,
--     and [false] is the fallback used when nothing else applies.
local sequences = {
	["a"] = {
		["ai"  ] = "aɪ̯";
		["au"  ] = "aʊ̯";
		["ay"  ] = "aɪ̯";
		[BR    ] = "a";
		[MA    ] = "aː";
		[false ] = "ɐ"; -- NOTE(review): fallback for ⟨a⟩ without a length mark; confirm /ɐ/ is intended
	};
	["ä"] = {
		["äu"  ] = "ɔʏ̯";
		[BR    ] = "ɛ";
		[MA    ] = "ɛː";
		[false ] = "ɛ";
	};
	["b"] = "b";
	["c"] = {
		["chs" ] = { "k", "s" }; -- FIXME: should we have this
		["ch"  ] = "ç"; -- front allophone (ich-laut)
		["ck"  ] = "k";
		[false ] = "t͡s"; -- also reached by original ⟨z⟩, which respell() rewrites to ⟨c⟩
	};
	["d"] = {
		["dsch"] = "d͡ʒ";
		["dt"  ] = "t";
		[false ] = "d";
	};
	["e"] = {
		["ei"  ] = "aɪ̯";
		["eu"  ] = "ɔʏ̯";
		["ey"  ] = "aɪ̯";
		[BR    ] = "ɛ";
		[MA    ] = "eː";
		[false ] = "ə";
	};
	["f"] = "f";
	["g"] = "ɡ";
	["h"] = "h";
	["i"] = {
		["ieh" ] = "iː";
		["ie"  ] = "iː";
		[BR    ] = "ɪ";
		[MA    ] = "iː";
		[false ] = "ɪ";
	};
	["j"] = "j";
	["k"] = {
		["khs" ] = { "k", "s" };
		["kh"  ] = "χ"; -- back allophone of /ç/ (ach-laut); respell() produces ⟨kh⟩ from ⟨ch⟩ after back vowels
		[false ] = "k";
	};
	["l"] = "l";
	["m"] = "m";
	["n"] = {
		["nk"  ] = { "ŋ", "k" };
		["ng"  ] = "ŋ";
		[false ] = "n";
	};
	["o"] = {
		[BR    ] = "ɔ";
		[MA    ] = "oː";
		[false ] = "ɔ";
	};
	["ö"] = {
		[BR    ] = "œ";
		[MA    ] = "œː"; -- sometimes /øː/
		[false ] = "œ";
	};
	["p"] = {
		["pf"  ] = "p͡f";
		["ph"  ] = "f";
		[false ] = "p";
	};
	["q"] = {
		["qu"  ] = { "k", "ʋ" }; -- only before another vowel
		[false ] = "k";
	};
	["r"] = "r"; -- phonetically [ʀ] syllable-initially; /ɐ/ syllable-finally
	["s"] = {
		["sch" ] = "ʃ";
		["sh"  ] = "ʃ";
		["sp"  ] = { "ʃ", "p" };
		["st"  ] = { "ʃ", "t" };
		[false ] = "s";
	};
	["t"] = {
		["tsch"] = "t͡ʃ";
		["tz"  ] = "t͡s";
		[false ] = "t";
	};
	["u"] = {
		[BR    ] = "ʊ";
		[MA    ] = "uː";
		[false ] = "ʊ";
	};
	["ü"] = {
		[BR    ] = "ʏ";
		[MA    ] = "yː";
		[false ] = "ʏ";
	};
	["v"] = "f";
	["w"] = "ʋ";
	["x"] = { "k", "s" }; -- XXX
	["y"] = {
		[BR    ] = "ʏ";
		[MA    ] = "yː";
		[false ] = "ʏ";
	};
	["z"] = "z"; -- respelled from ⟨s⟩: respell() writes ⟨z⟩ for ⟨s⟩ before a vowel
	-- combining stress accents map to the IPA stress marks
	["́"] = "ˈ"; -- FIXME
	["̀"] = "ˌ";
	-- [AC ] = "ˈ";
	-- [GR ] = "ˌ";
}


-- Normalise a respelling before it is converted: apply the substitution
-- syntax if present, make the text lowercase, add word-boundary markers,
-- decompose accented characters, and convert ae, oe, ue (or a vowel plus a
-- combining diaeresis) to the umlauted vowels ä, ö, ü.
--
-- `text` is the respelling. If it has the form "[from:to,from:to,...]" it is
-- treated as a list of substitutions to apply to the current page title: a
-- leading "~" on `from` is accepted and ignored, and a leading "^" restricts
-- the match to whole words.
--
-- Returns the normalised string, with "⁀" marking every word boundary.
-- Raises an error if a substitution is malformed or matches nothing.
local function normalise(text)
	-- handle the string substitution syntax [a:1,b:2]
	if rfind(text, "^%[.*%]$") then
		local subs = rsplit(rmatch(text, "^%[(.*)%]$"), ",")
		-- The substitutions describe edits to the page name, so start from it
		-- rather than from the bracketed spec itself (which would otherwise be
		-- rewritten in place and then pronounced, brackets and all).
		text = mw.title.getCurrentTitle().text
		for _, sub in ipairs(subs) do
			local fromto = rsplit(sub, ":")
			if #fromto ~= 2 then
				error("Bad substitution spec " .. sub .. " in {{de-IPA}}")
			end
			local from, to = fromto[1], fromto[2]
			if rfind(from, "^~") then
				-- formerly, ~ was required to match within a word
				from = rmatch(from, "^~(.*)$")
			end
			-- `to` is literal text: double any "%" so that gsub does not read
			-- it as a capture reference or escape
			local replacement = rsub(to, "%%", "%%%%")
			local newtext
			if rfind(from, "^%^") then
				-- whole-word match
				from = rmatch(from, "^%^(.*)$")
				newtext = rsub(text, "%f[%a]" .. strutl.pattern_escape(from) .. "%f[%A]", replacement)
			else
				newtext = rsub(text, strutl.pattern_escape(from), replacement)
			end
			if newtext == text then
				error("Substitution spec " .. sub .. " didn't match respelling '" .. text .. "'")
			end
			text = newtext
		end
	end

	-- make text lowercase
	text = ulower(text)

	-- simplify checking for word boundary markers by adding ⁀ at
	-- the beginning and end of all words then removing them at the end
	text = rsub(text, "%s*,%s*", "⁀⁀ | ⁀⁀") -- mark between commas and treat it as a pause
	text = rsub(text, "%s+", "⁀ ⁀") -- mark between spaces
	text = rsub(text, "[%-/]+", "⁀-⁀") -- mark between compound word boundaries including hyphens
	text = "⁀⁀" .. text .. "⁀⁀" -- mark at the start and end of the whole entry

	-- handle combining accents
	text = udecomp(text) -- decompose accented characters into their base and combining parts
	text = rsub(text, "([" .. all_accents .. "]*)([e" .. DI .. "])", "%2%1") -- avoid confusion of wrongly-ordered umlauts/e's and other accents
	text = rsub(text, "([aou])[e" .. DI .. "]", umlaut_conv) -- recompose umlauted vowels
	text = rsub(text, "([" .. length_accent .. "])([" .. stress_accent .. "])", "%2%1") -- put length accents after stress accents

	return text
end

-- Add default stress when the respelling carries none: if the text contains
-- no acute accent at all, an acute accent is placed after the first vowel
-- character that follows each ⁀ boundary (skipping any leading consonants).
-- Text that already contains an acute accent is returned unchanged.
--
-- `orig` and `pos` are accepted for interface parity with the other
-- conversion steps and are not used here.
-- Always returns exactly one value, the resulting string.
local function handle_stress(text, orig, pos)
	if rfind(text, AC) then -- FIXME later
		return text
	end
	-- rsub() rather than rsubn(): the substitution count must not leak out as
	-- a second return value on this branch only
	return rsub(text, "⁀(" .. cons_c .. "*[" .. vowel .. "])", "⁀%1" .. AC)
end

-- Respell the normalised text more phonetically to allow easier conversion
-- to IPA by parse_table(). The steps, in order:
--   * ⟨q⟩ not followed by vowel + vowel becomes ⟨k⟩;
--   * ⟨c⟩ becomes ⟨k⟩ except before front vowels, ⟨h⟩ and ⟨k⟩;
--   * ⟨ch⟩ after a back vowel becomes ⟨kh⟩ (ach-laut), original ⟨z⟩ becomes
--     ⟨c⟩, and ⟨s⟩ before a vowel becomes ⟨z⟩;
--   * ⟨b d g⟩ are devoiced before anything matching cons_or_boundary_c;
--   * predictable lengths are marked on stressed vowels with a macron/breve;
--   * ⟨h⟩ between vowels is bracketed so it survives as /h/;
--   * stress accents are moved in front of the letter they belong to.
--
-- `orig` and `pos` are accepted for interface parity and are not used.
-- Returns the respelled string.
local function respell(text, orig, pos)
	-- handle ⟨q⟩
	text = rsub(text, "q([" .. vowel .. "]?" .. cons_c .. ")", "k%1") -- convert ⟨q⟩ before a single or no vowel

	-- handle ⟨c⟩/⟨s⟩/⟨z⟩
	text = rsub(text, "c([^eiyäöühk])", "k%1") -- convert ⟨c⟩ (single letter) before non-front vowels to /k/
	-- convert ⟨ch⟩ after back vowels to /χ/. Accents may sit between the vowel
	-- letter and ⟨ch⟩ (e.g. a macron in "Būch"), so they are allowed in the
	-- match. The optional leading group catches the diphthongs ⟨eu⟩/⟨äu⟩,
	-- whose ⟨u⟩ is not a back vowel phonetically and must keep the ich-laut.
	text = rsub(text,
		"([eä]?[" .. all_accents .. "]*)([" .. back_vowel .. "][" .. all_accents .. "]*)ch",
		function(before, v)
			if rfind(before, "^[eä]") and usub(v, 1, 1) == "u" then
				return before .. v .. "ch"
			end
			return before .. v .. "kh"
		end)
	text = rsub(text, "z", "c") -- convert ⟨z⟩ to /t͡s/
	-- ⟨s⟩ is voiced as z before vowels; the frontier keeps the second ⟨s⟩ of a
	-- geminate ⟨ss⟩ untouched so that it stays voiceless and is still seen as
	-- a double consonant by the vowel-length rule below
	text = rsub(text, "%f[s]s([" .. vowel .. "])", "z%1")

	-- handle consonant devoicing
	text = rsub(text, "([bdg])(" .. cons_or_boundary_c .. ")", -- devoice syllable-final obstruents
		function(c1, c2)
			return devoice_conv[c1] .. c2
		end)

	-- handle predictable stressed vowel lengths; other cases must explicitly
	-- be marked by the user or else the module will return an error
	text = rsub(text, "(" .. vowel_stressed .. ")(" .. cons_c .. "[" .. vowel .. "])", "%1" .. MA .. "%2") -- long vowel before consonant + vowel
	text = rsub(text, "(" .. vowel_stressed .. ")⁀", "%1" .. MA .. "⁀") -- long vowel before a word boundary
	text = rsub(text, "(" .. vowel_stressed .. ")(" .. cons_c .. ")%2", "%1" .. BR .. "%2%2") -- short vowel before a double consonant

	-- handle pronounced ⟨h⟩ (FIXME)
	text = rsub(text, "([" .. vowel .. "])h([" .. vowel .. "])", "%1" .. MA .. "[h]%2") -- ⟨h⟩ is pronounced /h/ in between vowels

	-- shift stress accents before letter
	return rsub(text, "(%w[" .. length_accent .. "]*)([" .. stress_accent .. "])", "%2%1")
end

-- Find the longest string key of `choices` that spells out `text` starting
-- at character position `i`. Returns the mapped phoneme(s) and the key's
-- length in characters, or nil and 0 when no key matches. Picking the
-- longest key makes the result independent of pairs() iteration order when
-- keys overlap (e.g. "ieh"/"ie", "chs"/"ch").
local function longest_sequence(choices, text, i)
	local best_phone, best_len = nil, 0
	for seq, seq_phone in pairs(choices) do
		if type(seq) == "string" then
			local len = ulen(seq)
			if len > best_len and usub(text, i, i + len - 1) == seq then
				best_phone, best_len = seq_phone, len
			end
		end
	end
	return best_phone, best_len
end

-- convert letters to phonemes using the sequences table,
-- then return the phonemes as a concatenated string
--
-- `text` is the respelled string produced by respell(). Characters with no
-- entry in `sequences` (boundary markers, spaces, brackets, stray length
-- accents, ...) are skipped. Raises an error when a vowel directly preceded
-- by a stress accent has neither an explicit/predictable length nor a
-- matching multi-letter sequence.
local function parse_table(text)
	local phones, i, n = {}, 1, ulen(text)
	local stress_class = "[" .. stress_accent .. "]"
	local vowel_class = "[" .. vowel .. "]"
	-- Must live outside the loop: it is set when a stress accent is read and
	-- consumed by the next vowel, which is handled in a later iteration.
	local is_stressed = false

	while i <= n do
		local cid = usub(text, i, i)
		local value = sequences[cid]
		local phone
		local cidl = 1 -- number of characters consumed by this step

		if value == nil then
			-- skip over invalid values
		elseif rmatch(cid, stress_class) then -- check for stressed vowel
			is_stressed = true
			phone = value
		else -- process letters
			local cid_next = usub(text, i + 1, i + 1)

			if rmatch(cid, vowel_class) then
				if cid_next == "h" or cid_next == cid or cid_next == MA then -- long vowel if following an 'h' or a double letter
					phone = value[MA]
					cidl = 2
				elseif cid_next == BR then
					phone = value[BR]
					cidl = 2
				else
					local seq_phone, seq_len = longest_sequence(value, text, i)
					if seq_phone then
						phone, cidl = seq_phone, seq_len
					elseif is_stressed then -- return error if vowel is stressed
						error("Vowel length is ambiguous for the stressed vowel. Please specify vowel length.")
					else
						phone = value[false]
					end
				end
				is_stressed = false -- turn off stress until end or next stressed vowel
			elseif cid_next == cid then
				-- double consonants are treated as singular: drop the first
				-- letter and let the second one produce the phoneme. Checked
				-- before the value's type so that plain-string entries such
				-- as "m" or "l" are reduced too.
			elseif type(value) ~= "table" or value[1] then
				phone = value
			else -- otherwise go over table
				local seq_phone, seq_len = longest_sequence(value, text, i)
				if seq_phone then
					phone, cidl = seq_phone, seq_len
				else
					phone = value[false]
				end
			end
		end

		if type(phone) == "string" then
			phones[#phones + 1] = phone
		elseif type(phone) == "table" then
			for _, p in ipairs(phone) do
				phones[#phones + 1] = p
			end
		end

		i = i + cidl
	end

	-- concatenate the phonemes into a string
	return table.concat(phones)
end

-- final phonemic substitutions, applied to the IPA string from parse_table();
-- `orig` and `pos` are accepted but not used
local function phonemic(text, orig, pos)
	-- /n/ becomes /ŋ/ when a stress mark and then a velar stop follow
	local n_before_stressed_velar = "n([" .. stress_marks .. "][kɡ])"
	return rsub(text, n_before_stressed_velar, "ŋ%1")
end

-- Entry point: convert a respelling to IPA.
-- `text` is either the respelling itself or a table with an `args` field
-- (in which case the respelling, `orig` and `pos` are read from args[1],
-- args.orig and args.pos, with empty strings treated as missing). When no
-- respelling is given, the title of the current page is used.
function export.toIPA(text, orig, pos)
	if type(text) == "table" then
		local args = text.args
		text, orig, pos = ine(args[1]), ine(args.orig), ine(args.pos)
	end
	if not text then
		text = mw.title.getCurrentTitle().text
	end

	-- conversion pipeline; handle_stress() is currently left out:
	-- text = handle_stress(text, orig, pos)
	local respelled = respell(normalise(text), orig, pos)
	local ipa = phonemic(parse_table(respelled), orig, pos)

	-- remove hyphens and word-boundary markers
	return rsub(ipa, "[⁀%-]", "")
end

return export