Module:de-IPA
Appearance
- The following documentation is located at Module:de-IPA/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
Testcases
[edit]--[=====[
TODO:
* Function for pre-consonantal and final obstruent devoicing of d, g, b, s
* Function for syllable-final uvularisation of r (ɐ̯)
* Function to reduce geminates [DONE]
* List of environments which trigger the palatalisation of /x/ (liquids + non-low front vowels) [DONE]
* Function to determine if H is word initial (> /h/) or non-initial (> 0) [DONE]
* Function to put stress in general, function to check for prefixes and realign stress accordingly
* Function to convert ⟨e⟩ in unstressed syllables to ə > Function to reduce -ər to -r + "devoicing"
* Function to convert ⟨c⟩ before front vowels to /t͡s/ [DONE]
* Function to convert final ⟨-ehe⟩ as /eː/ (verbs only)
* Function to mark whether the word is Germanic or Romanic - makes a lot of exceptions
predictable/automatable, e.g. /ɪ, ɔ, ʊ/ > /i, o, u/ for short vowels in closed syllables,
penultimate or final stress
* Inseparable prefixes do not take stress > Stress on the 2nd syllable
** A complete list could be compiled and the process automated, instead of making the user enter the stress by hand
* Rules to determine when to make vowels short vs. long. These are usually predictable,
but there are some exceptions; use a macron (e.g. ā ē ī) to force a long vowel,
and a breve (e.g. ă ĕ ĭ) to force a short vowel. Below are the general rules:
- vowels are long in an open syllable (no final consonant, e.g. bēten, hōlen)
- vowels are also long before a single consonant (e.g. kām), as well as before
a silent ⟨h⟩ (e.g. gēhen, zēhn)
- vowels are short before a double (geminate) consonant (e.g. Wăsser, Mŭtter)
- however, vowels before two unique consonants are not predictable (they can either be
long, e.g. Mōnd, or short, e.g. Mŭnd)
- note that a long ⟨i⟩ is usually written as ⟨ie⟩, except word-initially (e.g. Īgel)
and the exception of short ⟨ie⟩ in vier and its derivatives (e.g. vierzehn)
- vowels are usually long in a stressed final syllable before a single
consonant (but with possible exceptions, e.g. '-eg')
- unstressed syllables do not have long vowels
* Stress is usually on the first syllable, but there are some exceptions:
- syllables with secondary stress are treated as if stressed
- syllables directly following a known prefix (aus-, zu-, über-, ge-, etc.)
should be treated as if stressed, whether they are actually stressed or not
- when there's an explicit slash to separate compounds, all parts should be
treated as if they were separate words for vowel-length purposes (e.g.
'-tag' in 'Reichs/tag' should be long)
- what about other unstressed syllables?
--]=====]
local export = {}
local u = require("Module:string/char")
local strutl = require("Module:string utilities")
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ucomp = mw.ustring.toNFC
local udecomp = mw.ustring.toNFD
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len
-- Wrapper around rsubn() that yields only the substituted string; the
-- substitution count that rsubn() also returns is dropped.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the result list to its first value
	return (rsubn(term, foo, bar))
end
-- Keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that stable string.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- "If not empty": map the empty string to nil and pass every other value
-- through unchanged.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end
-- Combining diacritics used in respellings, built from their code points
-- with u() (Module:string/char; presumably code point -> character).
local AC = u(0x0301) -- acute accent
local GR = u(0x0300) -- grave accent
local MA = u(0x0304) -- macron
local BR = u(0x0306) -- breve
local DI = u(0x0308) -- diaeresis
-- The following are bare character lists (no brackets); they are spliced
-- into "[...]" character classes where patterns are built.
local stress_accent = AC .. GR
local length_accent = MA .. BR
local all_accents = stress_accent .. length_accent
local front_vowel = "eiyäöü"
local back_vowel = "aou"
-- note that the vowel list also contains the combining accents
local vowel = front_vowel .. back_vowel .. all_accents
-- pattern: one character from the vowel list directly followed by a stress accent
local vowel_stressed = "[" .. vowel .. "][" .. stress_accent .. "]"
-- pattern: one character that is not in the vowel list and is not ".", "⁀", space, "-", "(" or ")"
local cons_c = "[^" .. vowel .. ".⁀ %-()]"
local cons_or_boundary_c = "[^" .. vowel .. "rl. %-()]" -- includes ⁀ -- I have added /l/ & /r/ as a stopgap against Brücke -> /ˈprʏkə/, but this may need a new name.
-- IPA stress marks as emitted via the sequences table; used by phonemic()
local stress_marks = "ˈˌ"
-- voiced obstruent -> voiceless counterpart, used by respell() for devoicing
local devoice_conv = { ["b"] = "p", ["d"] = "t", ["g"] = "k" }
-- base vowel -> umlauted vowel, used by normalise() as a gsub replacement table
local umlaut_conv = { ["a"] = "ä", ["o"] = "ö", ["u"] = "ü" }
-- Lookup table from a respelled character to its phoneme(s), consumed by
-- parse_table(). The value stored under a character is one of:
--   * a string: the phoneme for that character;
--   * an array such as { "k", "s" }: several phonemes emitted in order;
--   * a keyed table, in which
--       - string keys are multi-letter sequences starting with that
--         character, mapped to their phoneme(s),
--       - [BR] / [MA] (vowels only) give the phoneme used when the vowel is
--         followed by a breve / macron (MA is also used before ⟨h⟩ or a
--         doubled vowel letter),
--       - [false] is the fallback when nothing else matches.
-- Characters with no entry are skipped by parse_table().
local sequences = {
["a"] = {
["ai" ] = "aɪ̯";
["au" ] = "aʊ̯";
["ay" ] = "aɪ̯";
[BR ] = "a";
[MA ] = "aː";
[false ] = "ɐ";
};
["ä"] = {
["äu" ] = "ɔʏ̯";
[BR ] = "ɛ";
[MA ] = "ɛː";
[false ] = "ɛ";
};
["b"] = "b";
["c"] = {
["chs" ] = { "k", "s" }; -- FIXME: should we have this
["ch" ] = "ç"; -- front allophone (ich-laut)
["ck" ] = "k";
[false ] = "t͡s";
};
["d"] = {
["dsch"] = "d͡ʒ";
["dt" ] = "t";
[false ] = "d";
};
["e"] = {
["ei" ] = "aɪ̯";
["eu" ] = "ɔʏ̯";
["ey" ] = "aɪ̯";
[BR ] = "ɛ";
[MA ] = "eː";
[false ] = "ə";
};
["f"] = "f";
["g"] = "ɡ";
["h"] = "h";
["i"] = {
["ieh" ] = "iː";
["ie" ] = "iː";
[BR ] = "ɪ";
[MA ] = "iː";
[false ] = "ɪ";
};
["j"] = "j";
["k"] = {
["khs" ] = { "k", "s" };
["kh" ] = "χ"; -- back allophone of /ç/ (ach-laut)
[false ] = "k";
};
["l"] = "l";
["m"] = "m";
["n"] = {
["nk" ] = { "ŋ", "k" };
["ng" ] = "ŋ";
[false ] = "n";
};
["o"] = {
[BR ] = "ɔ";
[MA ] = "oː";
[false ] = "ɔ";
};
["ö"] = {
[BR ] = "œ";
[MA ] = "œː"; -- sometimes /øː/
[false ] = "œ";
};
["p"] = {
["pf" ] = "p͡f";
["ph" ] = "f";
[false ] = "p";
};
["q"] = {
["qu" ] = { "k", "ʋ" }; -- only before another vowel
[false ] = "k";
};
["r"] = "r"; -- phonetically [ʀ] syllable-initially; /ɐ/ syllable-finally
["s"] = {
["sch" ] = "ʃ";
["sh" ] = "ʃ";
["sp" ] = { "ʃ", "p" };
["st" ] = { "ʃ", "t" };
[false ] = "s";
};
["t"] = {
["tsch"] = "t͡ʃ";
["tz" ] = "t͡s";
[false ] = "t";
};
["u"] = {
[BR ] = "ʊ";
[MA ] = "uː";
[false ] = "ʊ";
};
["ü"] = {
[BR ] = "ʏ";
[MA ] = "yː";
[false ] = "ʏ";
};
["v"] = "f";
["w"] = "ʋ";
["x"] = { "k", "s" }; -- XXX
["y"] = {
[BR ] = "ʏ";
[MA ] = "yː";
[false ] = "ʏ";
};
["z"] = "z"; -- respelled from ⟨s⟩ before a vowel by respell(); original ⟨z⟩ has become ⟨c⟩ by then
-- combining acute / grave accents map to the IPA primary / secondary stress marks
["́"] = "ˈ"; -- FIXME
["̀"] = "ˌ";
-- [AC ] = "ˈ";
-- [GR ] = "ˌ";
}
-- Normalise the respelling before conversion: apply any substitution spec,
-- lowercase the text, insert ⁀ word-boundary markers, and bring combining
-- accents and umlauts (including ae, oe, ue -> ä, ö, ü) into a canonical
-- order.
--
-- Parameters:
--   text     - the respelling, or a substitution spec of the form
--              [from:to,from2:to2,...]
--   pagename - optional; the string a substitution spec is applied to.
--              Defaults to the current page title.
-- Returns the normalised (NFD-based) string.
-- Raises an error if a substitution spec is malformed or does not match.
local function normalise(text, pagename)
	-- handle the string substitution syntax [a:1,b:2]
	if rfind(text, "^%[.*%]$") then
		local subs = rsplit(rmatch(text, "^%[(.*)%]$"), ",")
		-- The substitutions modify the page name. Previously they were run
		-- on the bracketed spec itself, so the spec text (including the
		-- "from" halves) ended up in the respelling.
		text = pagename or mw.title.getCurrentTitle().text
		for _, sub in ipairs(subs) do
			local fromto = rsplit(sub, ":")
			if #fromto ~= 2 then
				error("Bad substitution spec " .. sub .. " in {{de-IPA}}")
			end
			local from, to = fromto[1], fromto[2]
			-- "%" is magic in a gsub replacement string; keep it literal
			to = (to:gsub("%%", "%%%%"))
			if rfind(from, "^~") then
				-- formerly, ~ was required to match within a word
				from = rmatch(from, "^~(.*)$")
			end
			local newtext
			if rfind(from, "^%^") then
				-- whole-word match
				from = rmatch(from, "^%^(.*)$")
				newtext = rsub(text, "%f[%a]" .. strutl.pattern_escape(from) .. "%f[%A]", to)
			else
				newtext = rsub(text, strutl.pattern_escape(from), to)
			end
			if newtext == text then
				error("Substitution spec " .. sub .. " didn't match respelling '" .. text .. "'")
			end
			text = newtext
		end
	end

	-- make text lowercase
	text = ulower(text)

	-- simplify checking for word boundary markers by adding ⁀ at
	-- the beginning and end of all words then removing them at the end
	text = rsub(text, "%s*,%s*", "⁀⁀ | ⁀⁀") -- mark between commas and treat it as a pause
	text = rsub(text, "%s+", "⁀ ⁀") -- mark between spaces
	text = rsub(text, "[%-/]+", "⁀-⁀") -- mark between compound word boundaries including hyphens
	text = "⁀⁀" .. text .. "⁀⁀" -- mark at the start and end of the whole entry

	-- handle combining accents
	text = udecomp(text) -- decompose accented characters into their base and combining parts
	text = rsub(text, "([" .. all_accents .. "]*)([e" .. DI .. "])", "%2%1") -- avoid confusion of wrongly-ordered umlauts/e's and other accents
	text = rsub(text, "([aou])[e" .. DI .. "]", umlaut_conv) -- recompose umlauted vowels
	text = rsub(text, "([" .. length_accent .. "])([" .. stress_accent .. "])", "%2%1") -- put length accents after stress accents
	return text
end
-- Add a default stress when the respelling carries no acute accent: for
-- every word-boundary marker ⁀ that is followed by optional consonants and
-- then a character from the vowel list, an acute accent is inserted right
-- after that character. Text that already contains an acute accent is
-- returned unchanged.
--
-- `orig` and `pos` are accepted for signature parity with the other stages
-- and are not used here.
-- Returns exactly one value, the (possibly modified) text.
local function handle_stress(text, orig, pos)
	if rfind(text, AC) then -- FIXME later
		return text
	end
	-- rsub() rather than rsubn(): rsubn() would also hand the substitution
	-- count back to the caller as a stray second return value
	return rsub(text, "⁀(" .. cons_c .. "*[" .. vowel .. "])", "⁀%1" .. AC)
end
-- Rewrite the normalised text into a more phonetic respelling so that
-- parse_table() can map it to IPA letter by letter. The rules below are
-- applied once each, strictly in the listed order. `orig` and `pos` are not
-- used.
local function respell(text, orig, pos)
	local any_vowel = "[" .. vowel .. "]"
	local stressed = "(" .. vowel_stressed .. ")"

	-- replacement callback for obstruent devoicing
	local function devoice(obstruent, following)
		return devoice_conv[obstruent] .. following
	end

	-- each rule is { pattern, replacement }
	local rules = {
		-- ⟨q⟩ before a single or no vowel becomes ⟨k⟩
		{ "q(" .. any_vowel .. "?" .. cons_c .. ")", "k%1" },
		-- single ⟨c⟩ not followed by a front vowel, ⟨h⟩ or ⟨k⟩ becomes /k/
		{ "c([^eiyäöühk])", "k%1" },
		-- ⟨ch⟩ after a back vowel becomes ⟨kh⟩, i.e. /χ/
		{ "([" .. back_vowel .. "])ch", "%1kh" },
		-- ⟨z⟩ becomes ⟨c⟩, i.e. /t͡s/
		{ "z", "c" },
		-- ⟨s⟩ is voiced (written ⟨z⟩) before vowels
		{ "s(" .. any_vowel .. ")", "z%1" },
		-- devoice syllable-final obstruents
		{ "([bdg])(" .. cons_or_boundary_c .. ")", devoice },
		-- predictable stressed vowel lengths; other cases have to be marked
		-- explicitly by the user:
		-- long vowel before consonant + vowel
		{ stressed .. "(" .. cons_c .. any_vowel .. ")", "%1" .. MA .. "%2" },
		-- long vowel before a word boundary
		{ stressed .. "⁀", "%1" .. MA .. "⁀" },
		-- short vowel before a double consonant
		{ stressed .. "(" .. cons_c .. ")%2", "%1" .. BR .. "%2%2" },
		-- pronounced ⟨h⟩ (FIXME): ⟨h⟩ is /h/ in between vowels
		{ "(" .. any_vowel .. ")h(" .. any_vowel .. ")", "%1" .. MA .. "[h]%2" },
		-- shift stress accents before the letter (and its length accents)
		{ "(%w[" .. length_accent .. "]*)([" .. stress_accent .. "])", "%2%1" },
	}

	for _, rule in ipairs(rules) do
		text = rsub(text, rule[1], rule[2])
	end
	return text
end
-- Convert the respelled text to phonemes with the `sequences` table and
-- return the phonemes concatenated into one string.
--
-- The text is scanned left to right, one character at a time:
--   * characters without a `sequences` entry are skipped;
--   * stress accents emit their stress mark;
--   * vowels are long before ⟨h⟩, a doubled vowel letter or a macron, short
--     before a breve, otherwise the longest matching multi-letter sequence
--     is used, falling back to the [false] entry;
--   * consonants with a keyed table use the longest matching multi-letter
--     sequence, falling back to the [false] entry.
local function parse_table(text)
	-- Find the longest multi-letter key of `entry` that matches `text` at
	-- character position `pos`. Returns its phoneme(s) and its length in
	-- characters, or nil and 0 when no key matches.
	-- pairs() order is unspecified, so taking the first hit could choose
	-- "ie" over "ieh" (or "ch" over "chs", "kh" over "khs") and leave a
	-- stray letter behind; picking the longest match is deterministic.
	local function longest_sequence(entry, pos)
		local best_phone, best_len = nil, 0
		for seq, seq_phone in pairs(entry) do
			if type(seq) == "string" then
				local seq_len = ulen(seq)
				if seq_len > best_len and usub(text, pos, pos + seq_len - 1) == seq then
					best_phone, best_len = seq_phone, seq_len
				end
			end
		end
		return best_phone, best_len
	end

	local stress_class = "[" .. stress_accent .. "]"
	local vowel_class = "[" .. vowel .. "]"
	local phones, i, n = {}, 1, ulen(text)

	while i <= n do
		-- NOTE(review): this flag is re-initialised on every iteration, so
		-- the value set in the stress-accent branch never reaches the vowel
		-- that follows and the "ambiguous length" error below cannot fire.
		-- Hoisting the declaration out of the loop would enable the error
		-- but would also make currently accepted respellings fail; left
		-- as-is pending a decision.
		local is_stressed = false
		local cid = usub(text, i, i)
		local value = sequences[cid]
		local phone, cidl

		if value == nil then -- skip over invalid values
			i = i + 1
		elseif rmatch(cid, stress_class) then -- check for stressed vowel
			is_stressed = true
			table.insert(phones, value)
			i = i + 1
		else -- process letters
			local cid_next = usub(text, i + 1, i + 1)
			cidl = 1 -- default character id length value

			if rmatch(cid, vowel_class) then
				if cid_next == "h" or cid_next == cid or cid_next == MA then
					-- long vowel if following an 'h', a double letter or a macron
					phone = value[MA]
					cidl = 2
				elseif cid_next == BR then
					phone = value[BR]
					cidl = 2
				else
					local seq_phone, seq_len = longest_sequence(value, i)
					if seq_phone then
						phone, cidl = seq_phone, seq_len
					elseif is_stressed then -- return error if vowel is stressed
						error("Vowel length is ambiguous for the stressed vowel. Please specify vowel length.")
					else
						phone = value[false]
					end
				end
				is_stressed = false -- turn off stress until end or next stressed vowel
			elseif type(value) ~= "table" or value[1] then
				-- plain string, or an array of phonemes such as { "k", "s" }
				phone = value
			elseif cid_next == cid then
				-- double consonants are treated as singular: no keyed table
				-- has an entry under its own letter, so nothing is emitted
				-- for the first of the pair and the second is handled on the
				-- next iteration
				phone = value[cid]
			else
				local seq_phone, seq_len = longest_sequence(value, i)
				if seq_phone then
					phone, cidl = seq_phone, seq_len
				else
					phone = value[false]
				end
			end

			if type(phone) == "string" then
				table.insert(phones, phone)
			elseif type(phone) == "table" then
				for _, p in ipairs(phone) do
					table.insert(phones, p)
				end
			end
			i = i + cidl
		end
	end

	-- concatenate the phonemes into a string
	return table.concat(phones)
end
-- Final phonemic substitutions on the phoneme string: an /n/ that stands
-- directly before a stress mark followed by /k/ or /ɡ/ becomes /ŋ/.
-- `orig` and `pos` are not used.
local function phonemic(text, orig, pos)
	local n_before_stressed_velar = "n([" .. stress_marks .. "][kɡ])"
	return rsub(text, n_before_stressed_velar, "ŋ%1")
end
-- Entry point: convert a respelling to its IPA string.
--
-- `text` is either the respelling itself or a table with an `args` field
-- (presumably a Scribunto frame when invoked from a template), in which
-- case args[1], args.orig and args.pos are read, with empty strings treated
-- as missing. Without a respelling the current page title is used.
function export.toIPA(text, orig, pos)
	if type(text) == 'table' then
		local args = text.args
		text, orig, pos = ine(args[1]), ine(args.orig), ine(args.pos)
	end
	if not text then
		text = mw.title.getCurrentTitle().text
	end

	-- pipeline: normalise -> respell -> letters to phonemes -> final tweaks
	local normalised = normalise(text)
	-- normalised = handle_stress(normalised, orig, pos)
	local respelled = respell(normalised, orig, pos)
	local phonemes = parse_table(respelled)
	local result = phonemic(phonemes, orig, pos)

	-- remove hyphens and word-boundary markers
	return rsub(result, "[⁀%-]", "")
end
return export