Module:de-IPA

The following documentation is located at Module:de-IPA/documentation. ^[edit] Categories were auto-generated by Module:module categorization. ^[edit]
Useful links: subpage list • links • transclusions • testcases • sandbox
	This module is experimental.
	The details of its operation have not yet been fully decided upon. Do not deploy widely until the module is finished.
Testcases

Module:de-IPA/testcases
--[=====[
TODO:
* Function for pre-consonantal and final obstruent devoicing of d, g, b, s
* Function for syllable-final uvularisation of r (ɐ̯)
* Function to reduce geminates [DONE]
* List of environments which trigger the palatalisation of /x/ (liquids + non-low front vowels) [DONE]
* Function to determine if H is word initial (> /h/) or non-initial (> 0) [DONE]
* Function to put stress in general, function to check for prefixes and realign stress accordingly
* Function to convert ⟨e⟩ in unstressed syllables to ə > Function to reduce -ər to -r + "devoicing"
* Function to convert ⟨c⟩ before front vowels to /t͡s/ [DONE]
* Function to convert final ⟨-ehe⟩ as /eː/ (verbs only)
* Function to mark whether the word is Germanic or Romanic - makes a lot of exceptions
  predictable/automatable, e.g. /ɪ, ɔ, ʊ/ > /i, o, u/ for short vowels in closed syllables,
  penultimate or final stress
* Inseparable prefixes do not take stress > Stress on the 2nd syllable
** A complete list could be compiled and the process automated, instead of making the user enter the stress by hand
* Rules to determine when to make vowels short vs. long. These are usually predictable,
  but there are some exceptions; use a macron (e.g. ā ē ī) to force a long vowel,
  and a breve (e.g. ă ĕ ĭ) to force a short vowel. Below are the general rules:
  - vowels are long in an open syllable (no final consonant, e.g. bēten, hōlen)
  - vowels are also long before a single consonant (e.g. kām), as well as before
    a silent ⟨h⟩ (e.g. gēhen, zēhn)
  - vowels are short before a double (geminate) consonant (e.g. Wăsser, Mŭtter)
  - however, vowels before two unique consonants are not predictable (they can either be
    long, e.g. Mōnd, or short, e.g. Mŭnd)
  - note that a long ⟨i⟩ is usually written as ⟨ie⟩, except word-initially (e.g. Īgel)
    and the exception of short ⟨ie⟩ in vier and its derivatives (e.g. vierzehn)
  - vowels are usually long in a stressed final syllable before a single
    consonant (but with possible exceptions, e.g. '-eg')
  - unstressed syllables do not have long vowels
* Stress is usually on the first syllable, but there are some exceptions:
  - syllables with secondary stress are treated as if stressed
  - syllables directly following a known prefix (aus-, zu-, über-, ge-, etc.)
    should be treated as if stressed, whether they are actually stressed or not
  - when there's an explicit slash to separate compounds, all parts should be
    treated as if they were separate words for vowel-length purposes (e.g.
    '-tag' in 'Reichs/tag' should be long)
  - what about other unstressed syllables?
--]=====]

local export = {}

local u = require("Module:string/char")
local strutl = require("Module:string utilities")
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ucomp = mw.ustring.toNFC
local udecomp = mw.ustring.toNFD
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len

-- Wrapper around rsubn() that returns only the substituted string and
-- drops the substitution count.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the call to its first return value
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local before
	repeat
		before = term
		term = rsub(before, foo, bar)
	until term == before
	return term
end

-- "If not empty": map the empty string to nil and pass every other value
-- through unchanged.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end

-- Combining diacritics recognised in respellings. normalise() decomposes the
-- text with udecomp() before any of these are matched.
local AC = u(0x0301) -- acute accent
local GR = u(0x0300) -- grave accent
local MA = u(0x0304) -- macron
local BR = u(0x0306) -- breve
local DI = u(0x0308) -- diaeresis

-- The strings below are character-class *contents*: they are spliced between
-- "[" and "]" (or "[^" and "]") when patterns are built.
local stress_accent = AC .. GR
local length_accent = MA .. BR
local all_accents = stress_accent .. length_accent
local front_vowel = "eiyäöü"
local back_vowel = "aou"
-- vowel letters plus every accent, so a class built from this also matches
-- the combining marks that follow a vowel letter
local vowel = front_vowel .. back_vowel .. all_accents
-- a vowel character immediately followed by a stress accent
local vowel_stressed = "[" .. vowel .. "][" .. stress_accent .. "]"
-- one character that is not a vowel/accent, ".", "⁀", space, "-", "(" or ")"
local cons_c = "[^" .. vowel .. ".⁀ %-()]"
local cons_or_boundary_c = "[^" .. vowel .. "rl. %-()]" -- includes ⁀  -- I have added /l/ & /r/ as a stopgap against Brücke -> /ˈprʏkə/, but this may need a new name.
-- IPA primary and secondary stress marks as they appear in the output
local stress_marks = "ˈˌ"

-- voiced obstruent -> voiceless counterpart, used by respell()
local devoice_conv = { ["b"] = "p", ["d"] = "t", ["g"] = "k" }
-- base vowel -> umlauted vowel, used as a gsub replacement table by normalise()
local umlaut_conv = { ["a"] = "ä", ["o"] = "ö", ["u"] = "ü" }
-- Letter-to-phoneme table consulted by parse_table(), which looks up the
-- current character as the top-level key. An entry is one of:
--   * a string: the phone for that letter;
--   * an array of strings: several phones emitted in order (e.g. x -> k, s);
--   * a table keyed by
--       - multi-letter spellings that start with the letter (e.g. "sch"),
--       - BR / MA: the phone used when the vowel is marked short / long,
--       - false: the fallback phone when nothing else applies.
-- Characters without an entry are skipped by parse_table().
local sequences = {
	["a"] = {
		["ai"  ] = "aɪ̯";
		["au"  ] = "aʊ̯";
		["ay"  ] = "aɪ̯";
		[BR    ] = "a";
		[MA    ] = "aː";
		[false ] = "ɐ";
	};
	["ä"] = {
		["äu"  ] = "ɔʏ̯";
		[BR    ] = "ɛ";
		[MA    ] = "ɛː";
		[false ] = "ɛ";
	};
	["b"] = "b";
	["c"] = {
		["chs" ] = { "k", "s" }; -- FIXME: should we have this
		["ch"  ] = "ç"; -- front allophone (ich-laut)
		["ck"  ] = "k";
		[false ] = "t͡s";
	};
	["d"] = {
		["dsch"] = "d͡ʒ";
		["dt"  ] = "t";
		[false ] = "d";
	};
	["e"] = {
		["ei"  ] = "aɪ̯";
		["eu"  ] = "ɔʏ̯";
		["ey"  ] = "aɪ̯";
		[BR    ] = "ɛ";
		[MA    ] = "eː";
		[false ] = "ə";
	};
	["f"] = "f";
	["g"] = "ɡ";
	["h"] = "h";
	["i"] = {
		["ieh" ] = "iː";
		["ie"  ] = "iː";
		[BR    ] = "ɪ";
		[MA    ] = "iː";
		[false ] = "ɪ";
	};
	["j"] = "j";
	["k"] = {
		["khs" ] = { "k", "s" };
		["kh"  ] = "χ"; -- back allophone of /ç/ (ach-laut)
		[false ] = "k";
	};
	["l"] = "l";
	["m"] = "m";
	["n"] = {
		["nk"  ] = { "ŋ", "k" };
		["ng"  ] = "ŋ";
		[false ] = "n";
	};
	["o"] = {
		[BR    ] = "ɔ";
		[MA    ] = "oː";
		[false ] = "ɔ";
	};
	["ö"] = {
		[BR    ] = "œ";
		[MA    ] = "œː"; -- sometimes /øː/
		[false ] = "œ";
	};
	["p"] = {
		["pf"  ] = "p͡f";
		["ph"  ] = "f";
		[false ] = "p";
	};
	["q"] = {
		["qu"  ] = { "k", "ʋ" }; -- only before another vowel
		[false ] = "k";
	};
	["r"] = "r"; -- phonetically [ʀ] syllable-initially; /ɐ/ syllable-finally
	["s"] = {
		["sch" ] = "ʃ";
		["sh"  ] = "ʃ";
		["sp"  ] = { "ʃ", "p" };
		["st"  ] = { "ʃ", "t" };
		[false ] = "s";
	};
	["t"] = {
		["tsch"] = "t͡ʃ";
		["tz"  ] = "t͡s";
		[false ] = "t";
	};
	["u"] = {
		[BR    ] = "ʊ";
		[MA    ] = "uː";
		[false ] = "ʊ";
	};
	["ü"] = {
		[BR    ] = "ʏ";
		[MA    ] = "yː";
		[false ] = "ʏ";
	};
	["v"] = "f";
	["w"] = "ʋ";
	["x"] = { "k", "s" }; -- XXX
	["y"] = {
		[BR    ] = "ʏ";
		[MA    ] = "yː";
		[false ] = "ʏ";
	};
	["z"] = "z"; -- respellt from s
	-- the next two keys are literal combining accents (acute, grave) mapped
	-- to the IPA primary and secondary stress marks
	["́"] = "ˈ"; -- FIXME
	["̀"] = "ˌ";
	-- [AC ] = "ˈ";
	-- [GR ] = "ˌ";
}


-- Normalise a respelling before conversion: apply an optional substitution
-- spec, lowercase the text, insert ⁀ word-boundary markers, and bring
-- combining accents and umlauts (including ae, oe, ue -> ä, ö, ü) into a
-- canonical order.
local function normalise(text)
	-- NOTE: falling back to the page name for a missing or "+" respelling is
	-- not done here.

	-- string substitution syntax: [a:1,b:2]
	local spec_list = rmatch(text, "^%[(.*)%]$")
	if spec_list then
		for _, spec in ipairs(rsplit(spec_list, ",")) do
			local halves = rsplit(spec, ":")
			if #halves ~= 2 then
				error("Bad substitution spec " .. spec .. " in {{de-IPA}}")
			end
			local from, to = halves[1], halves[2]

			-- a leading ~ used to be required for a within-word match; it is
			-- still accepted and simply stripped
			from = rmatch(from, "^~(.*)$") or from

			-- a leading ^ requests a whole-word match
			local search
			local whole_word = rmatch(from, "^%^(.*)$")
			if whole_word then
				search = "%f[%a]" .. strutl.pattern_escape(whole_word) .. "%f[%A]"
			else
				search = strutl.pattern_escape(from)
			end

			local replaced = rsub(text, search, to)
			if replaced == text then
				error("Substitution spec " .. spec .. " didn't match respelling '" .. text .. "'")
			end
			text = replaced
		end
	end

	text = ulower(text)

	-- surround every word with ⁀ so later patterns can test for word
	-- boundaries; the markers are stripped again at the very end
	text = rsub(text, "%s*,%s*", "⁀⁀ | ⁀⁀") -- a comma counts as a pause
	text = rsub(text, "%s+", "⁀ ⁀") -- whitespace between words
	text = rsub(text, "[%-/]+", "⁀-⁀") -- hyphens and slashes separate compound parts
	text = "⁀⁀" .. text .. "⁀⁀" -- start and end of the whole entry

	-- combining accents: work on the decomposed form
	local umlaut_mark = "[e" .. DI .. "]"
	text = udecomp(text)
	-- move accents that precede an ⟨e⟩/diaeresis behind it so the umlaut
	-- marker sits directly after its base vowel
	text = rsub(text, "([" .. all_accents .. "]*)(" .. umlaut_mark .. ")", "%2%1")
	-- base vowel + ⟨e⟩/diaeresis -> umlauted vowel
	text = rsub(text, "([aou])" .. umlaut_mark, umlaut_conv)
	-- length accents go after stress accents
	return rsub(text, "([" .. length_accent .. "])([" .. stress_accent .. "])", "%2%1")
end

-- Add a default stress mark when the respelling has none.
--
-- If `text` contains no acute accent, an acute is inserted after the first
-- vowel character that follows each ⁀ boundary marker (skipping any leading
-- consonants). Text that already contains an acute is returned unchanged.
-- Only the acute is checked; a respelling carrying just a grave accent still
-- receives default stress (FIXME later).
--
-- `orig` and `pos` are accepted for signature parity with the other
-- processing steps and are currently unused.
--
-- Returns exactly one value, the processed string. (This previously returned
-- the raw result of rsubn(), leaking the substitution count as a second
-- return value.)
local function handle_stress(text, orig, pos)
	if rfind(text, AC) then
		return text
	end
	return rsub(text, "⁀(" .. cons_c .. "*[" .. vowel .. "])", "⁀%1" .. AC)
end

-- Respell the normalised text more phonetically so that parse_table() can
-- convert it letter by letter.
--
-- Steps, in order:
--   1. ⟨q⟩ not followed by two vowels becomes ⟨k⟩.
--   2. ⟨c⟩ before anything but a front vowel, ⟨h⟩ or ⟨k⟩ becomes ⟨k⟩.
--   3. ⟨ch⟩ after a back vowel becomes ⟨kh⟩ (ach-laut).
--   4. ⟨z⟩ becomes ⟨c⟩ (/t͡s/), except in ⟨tz⟩.
--   5. ⟨s⟩ before a vowel becomes ⟨z⟩ (voiced).
--   6. ⟨b d g⟩ are devoiced before a consonant or boundary.
--   7. Predictable lengths are added to stressed vowels.
--   8. Intervocalic ⟨h⟩ is protected as "[h]".
--   9. Stress accents are moved in front of their vowel letter.
--
-- `orig` and `pos` are accepted for signature parity and are currently unused.
-- Returns the respelled string.
local function respell(text, orig, pos)
	-- zero or more combining accents (stress and/or length)
	local accents = "[" .. all_accents .. "]*"

	-- handle ⟨q⟩
	text = rsub(text, "q([" .. vowel .. "]?" .. cons_c .. ")", "k%1") -- convert ⟨q⟩ before a single or no vowel

	-- handle ⟨c⟩/⟨s⟩/⟨z⟩
	text = rsub(text, "c([^eiyäöühk])", "k%1") -- convert ⟨c⟩ (single letter) before non-front vowels to /k/

	-- convert ⟨ch⟩ after back vowels to /χ/. Accents may stand between the
	-- vowel letter and ⟨ch⟩ (explicit stress/length marks come after the
	-- letter at this point), and the ⟨u⟩ of the front-gliding diphthongs
	-- ⟨eu⟩/⟨äu⟩ must not trigger the ach-laut.
	text = rsub(text, "([eä]?)(" .. accents .. "[" .. back_vowel .. "]" .. accents .. ")ch",
		function(onset, nucleus)
			if onset ~= "" and rfind(nucleus, "u") then
				return onset .. nucleus .. "ch" -- eu/äu + ch keeps the ich-laut
			end
			return onset .. nucleus .. "kh"
		end)

	-- convert ⟨z⟩ to /t͡s/ (spelt ⟨c⟩ internally), but leave ⟨tz⟩ intact so
	-- it reaches the "tz" entry of the sequences table instead of being read
	-- as /t/ followed by /t͡s/
	text = rsub(text, "(.?)z",
		function(prev)
			if prev == "t" then
				return "tz"
			end
			return prev .. "c"
		end)
	text = rsub(text, "s([" .. vowel .. "])", "z%1") -- ⟨s⟩ is voiced as z before vowels

	-- handle consonant devoicing
	text = rsub(text, "([bdg])(" .. cons_or_boundary_c .. ")", -- devoice syllable-final obstruents
		function(c1, c2)
			return devoice_conv[c1] .. c2
		end)

	-- handle predictable stressed vowel lengths; other cases must explicitly
	-- be marked by the user or else the module will return an error
	text = rsub(text, "(" .. vowel_stressed .. ")(" .. cons_c .. "[" .. vowel .. "])", "%1" .. MA .. "%2") -- long vowel before consonant + vowel
	text = rsub(text, "(" .. vowel_stressed .. ")⁀", "%1" .. MA .. "⁀") -- long vowel before a word boundary
	text = rsub(text, "(" .. vowel_stressed .. ")(" .. cons_c .. ")%2", "%1" .. BR .. "%2%2") -- short vowel before a double consonant

	-- handle pronounced ⟨h⟩ (FIXME): the brackets are not in the sequences
	-- table, so parse_table() skips them, but they keep the ⟨h⟩ from being
	-- read as a silent length marker on the preceding vowel
	text = rsub(text, "([" .. vowel .. "])h([" .. vowel .. "])", "%1" .. MA .. "[h]%2") -- ⟨h⟩ is pronounced /h/ in between vowels

	-- shift stress accents before letter
	return rsub(text, "(%w[" .. length_accent .. "]*)([" .. stress_accent .. "])", "%2%1")
end

-- Convert the respelled text to phonemes using the sequences table and return
-- them as one concatenated string.
--
-- The text is scanned one character at a time:
--   * characters with no entry in `sequences` (boundary markers, spaces,
--     brackets, stray accents) are skipped;
--   * a stress accent emits its stress mark and flags the next vowel as
--     stressed;
--   * a vowel followed by ⟨h⟩, by itself, or by a macron is long; followed
--     by a breve it is short; otherwise the longest matching multi-letter
--     sequence is used, then the fallback (`false`) phone;
--   * the first letter of a doubled consonant is dropped, so geminates are
--     pronounced as single consonants; other consonants use the longest
--     matching multi-letter sequence or their fallback phone.
--
-- Raises an error when a stressed vowel has no determinable length.
local function parse_table(text)
	local phones = {}
	local i, n = 1, ulen(text)
	local stress_class = "[" .. stress_accent .. "]"
	local vowel_class = "[" .. vowel .. "]"

	-- Must persist across loop iterations: it is set on the stress accent and
	-- consumed on the following vowel. (Declaring it inside the loop reset it
	-- every iteration and made the ambiguity error unreachable.)
	local is_stressed = false

	-- Return the phone and length of the longest string key of `entry` that
	-- matches the text at position i, or nil and 0 when none matches. pairs()
	-- order is unspecified, so taking the first hit could pick "ch" over
	-- "chs" or "ie" over "ieh".
	local function longest_sequence(entry)
		local best_phone, best_len = nil, 0
		for seq, seq_phone in pairs(entry) do
			if type(seq) == "string" then
				local len = ulen(seq)
				if len > best_len and usub(text, i, i + len - 1) == seq then
					best_phone, best_len = seq_phone, len
				end
			end
		end
		return best_phone, best_len
	end

	while i <= n do
		local cid = usub(text, i, i)
		local value = sequences[cid]
		local phone = nil
		local cidl = 1 -- number of characters consumed by this step

		if value == nil then
			-- not in the table: skip the character
		elseif rmatch(cid, stress_class) then
			-- stress accent: emit the stress mark, remember it for the vowel
			is_stressed = true
			phone = value
		else
			local cid_next = usub(text, i + 1, i + 1)

			if rmatch(cid, vowel_class) then
				if cid_next == "h" or cid_next == cid or cid_next == MA then -- long vowel if following an 'h' or a double letter
					phone = value[MA]
					cidl = 2
				elseif cid_next == BR then
					phone = value[BR]
					cidl = 2
				else
					local seq_phone, seq_len = longest_sequence(value)
					if seq_phone then
						phone = seq_phone
						cidl = seq_len
					elseif is_stressed then
						error("Vowel length is ambiguous for the stressed vowel. Please specify vowel length.")
					else
						phone = value[false]
					end
				end
				is_stressed = false -- turn off stress until the next stress accent
			elseif cid_next == cid then
				-- double consonants are treated as singular: emit nothing for
				-- the first letter and let the second one produce the phone
				-- (applies to scalar entries such as "m" or "l" as well)
				phone = nil
			elseif type(value) ~= "table" or value[1] then
				-- plain phone, or a fixed array of phones
				phone = value
			else
				local seq_phone, seq_len = longest_sequence(value)
				if seq_phone then
					phone = seq_phone
					cidl = seq_len
				else
					phone = value[false]
				end
			end
		end

		if type(phone) == "string" then
			table.insert(phones, phone)
		elseif type(phone) == "table" then
			for _, p in ipairs(phone) do
				table.insert(phones, p)
			end
		end

		i = i + cidl
	end

	-- concatenate the phonemes into a string
	return table.concat(phones)
end

-- Final phonemic substitutions applied to the IPA string:
-- /n/ directly followed by a stress mark and then /k/ or /ɡ/ becomes /ŋ/.
-- `orig` and `pos` are accepted but not used.
local function phonemic(text, orig, pos)
	local n_before_stressed_velar = "n([" .. stress_marks .. "][kɡ])"
	return rsub(text, n_before_stressed_velar, "ŋ%1")
end

-- Entry point: convert a German respelling to an IPA string.
--
-- May be called with a frame-like table (anything with an `args` field), in
-- which case the respelling, `orig` and `pos` are read from args[1],
-- args.orig and args.pos with empty strings treated as absent; or directly
-- with (text, orig, pos). When no respelling is given, the current page
-- title is used.
function export.toIPA(text, orig, pos)
	if type(text) == 'table' then
		local args = text.args
		text, orig, pos = ine(args[1]), ine(args.orig), ine(args.pos)
	end
	if not text then
		text = mw.title.getCurrentTitle().text
	end

	-- pipeline: normalise -> respell -> letters to phones -> phonemic fixes
	-- (handle_stress() is intentionally not part of the pipeline yet)
	local respelled = respell(normalise(text), orig, pos)
	local ipa = phonemic(parse_table(respelled), orig, pos)

	-- remove hyphens and word-boundary markers
	return rsub(ipa, "[⁀%-]", "")
end

return export