Module:sla-common
Appearance
- The following documentation is located at Module:sla-common/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module contains common helper functions for Proto-Slavic that are needed by other modules.
local export = {}
local m_links = require("Module:links")
local m_table_tools = require("Module:table tools")
local lang = require("Module:languages").getByCode("sla-pro")
local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
-- Combining diacritics, each built from its Unicode code point with
-- mw.ustring.char (aliased as u).
local AC = u(0x0301) -- acute = ́
local GR = u(0x0300) -- grave = ̀
local CFLEX = u(0x0302) -- circumflex = ̂
local TILDE = u(0x0303) -- tilde = ̃
local BREVE = u(0x0306) -- breve = ̆
local INVBREVE = u(0x0311) -- inverse breve = ̑
local DOUBLEAC = u(0x030B) -- double acute = ̋
local DOUBLEGR = u(0x030F) -- double grave = ̏
local MACRON = u(0x0304) -- macron = ̄
local CARON = u(0x030C) -- caron = ̌
local OGONEK = u(0x0328) -- ogonek = ̨
-- The accents this module treats as stress marks. Note that CFLEX and BREVE
-- are not part of this set.
local stressed_accents = AC .. GR .. INVBREVE .. DOUBLEGR .. DOUBLEAC .. TILDE
-- The names ending in _c below are the corresponding strings wrapped in
-- [...] so they can be used directly as pattern character classes.
local stressed_accents_c = "[" .. stressed_accents .. "]"
-- All accents: the stress marks plus the macron.
local accents = stressed_accents .. MACRON
local accents_c = "[" .. accents .. "]"
-- Letters counted as vowels (includes the yers ь and ъ).
local vowels = "aeiouyьъěęǫ"
local vowels_c = "[" .. vowels .. "]"
-- Any character that is not a vowel letter; this includes the combining
-- accents themselves.
local non_vowels_c = "[^" .. vowels .. "]"
local short_vowels = "eoьъ"
local short_vowels_c = "[" .. short_vowels .. "]"
local long_vowels = "aiuyěęǫ"
local long_vowels_c = "[" .. long_vowels .. "]"
-- Any character that is neither a vowel letter nor one of the accents.
local cons_c = "[^" .. vowels .. accents .. "]"
-- Consonants that count as already iotated (see export.iotate).
local iotated_cons = "čďjľňřšťž"
local iotated_cons_c = "[" .. iotated_cons .. "]"
-- Like rsubn() (mw.ustring.gsub), but returns only the substituted string and
-- drops the replacement count.
local function rsub(term, foo, bar)
	-- The enclosing parentheses truncate rsubn()'s results to the first one.
	return (rsubn(term, foo, bar))
end
-- Wrap FORM, prefixed with an asterisk, in an HTML element tagged as
-- Proto-Slavic text. TAG is the element name and defaults to "span".
-- An empty FORM yields a dash instead.
function export.tag_form(form, tag)
	if form == "" then
		return "—"
	end
	local element = tag or "span"
	return "<" .. element .. " lang=\"sla-pro\" class=\"Unicode\">*" .. form .. "</" .. element .. ">"
end
-- Make a link out of a form, or show a dash if empty.
--
-- FORM may be:
--   * a string: linked as a reconstructed ("*"-prefixed) Proto-Slavic term,
--     or rendered as a dash if it is the empty string;
--   * a table with a `notesym` field: its first element is linked and
--     followed by the note symbols rendered as superscript;
--   * any other table: treated as a list of forms, each of which is linked
--     recursively; the results are joined with ", ".
-- TAG is only passed through to the recursive calls; it does not affect the
-- generated link.
function export.link_form(form, tag)
	if type(form) == "table" then
		if form.notesym then
			-- Single form carrying footnote symbols.
			return m_links.full_link({ lang = lang, term = "*" .. form[1] }) .. m_table_tools.superscript_notes(form.notesym)
		end
		-- List of alternative forms.
		local links = {}
		for _, subform in ipairs(form) do
			table.insert(links, export.link_form(subform, tag))
		end
		return table.concat(links, ", ")
	elseif form ~= "" then
		return m_links.full_link({ lang = lang, term = "*" .. form })
	else
		return "—"
	end
end
-- Lookup table mapping a base letter followed by a combining caron or ogonek
-- to the corresponding single precomposed letter. It is used as the
-- replacement table in export.decompose(); sequences without an entry here
-- are left as they are.
local recomposer = {
["e" .. CARON] = "ě", -- Latin e and E
["E" .. CARON] = "Ě",
["e" .. OGONEK] = "ę", -- Latin e and E
["E" .. OGONEK] = "Ę",
["o" .. OGONEK] = "ǫ", -- Latin o and O
["O" .. OGONEK] = "Ǫ",
-- Consonants with caron.
["c" .. CARON] = "č",
["C" .. CARON] = "Č",
["d" .. CARON] = "ď",
["D" .. CARON] = "Ď",
["l" .. CARON] = "ľ",
["L" .. CARON] = "Ľ",
["n" .. CARON] = "ň",
["N" .. CARON] = "Ň",
["r" .. CARON] = "ř",
["R" .. CARON] = "Ř",
["s" .. CARON] = "š",
["S" .. CARON] = "Š",
["t" .. CARON] = "ť",
["T" .. CARON] = "Ť",
["z" .. CARON] = "ž",
["Z" .. CARON] = "Ž",
}
-- Decompose acute, grave, etc. on letters into individual character +
-- combining accent. Letters with caron or ogonek, which we want to treat as
-- units, are split by the normalization as well, so put those back together
-- using the recomposer table.
function export.decompose(text)
	-- Any character directly followed by a combining caron or ogonek; pairs
	-- that have no entry in recomposer are kept unchanged.
	local caron_or_ogonek_pair = ".[" .. CARON .. OGONEK .. "]"
	return rsub(mw.ustring.toNFD(text), caron_or_ogonek_pair, recomposer)
end
-- Same as export.decompose(), with one extra canonicalization: a circumflex
-- (which may have been typed by accident) is turned into an inverse breve.
function export.canon_decompose(text)
	local decomposed = export.decompose(text)
	return rsub(decomposed, CFLEX, INVBREVE)
end
-- Raise an error if TEXT contains any of the listed precomposed accented
-- vowels (acute, grave, tilde, macron, inverse breve or double grave on
-- a e i o u y, in either case), i.e. if it has not been passed through
-- export.decompose() first. Returns nothing on success.
function export.assert_decomposed(text)
	if rfind(text, "[áéíóúýàèìòùỳãẽĩõũỹāēīōūȳȃȇȋȏȗȁȅȉȍȕÁÉÍÓÚÝÀÈÌÒÙỲÃẼĨÕŨỸĀĒĪŌŪȲȂȆȊȎȖȀȄȈȌȔ]") then
		-- Name the offending text so the failure can be traced to its source;
		-- a bare assert() would only report "assertion failed!".
		error("Text '" .. tostring(text) .. "' contains precomposed accented vowels; " ..
			"it must be decomposed first")
	end
end
-- Replace the consonant or consonant cluster at the end of STEM with its
-- first-palatalization outcome (e.g. k -> č, sk -> šč). Endings that are not
-- listed are left unchanged.
function export.first_palatalization(stem)
	-- Applied in order, from the longest final cluster to a single consonant.
	local stages = {
		{"...$", {["zdz"]="ždž"}},
		{"..$", {["sk"]="šč", ["zg"]="ždž", ["dz"]="ž", ["sc"]="šč"}},
		{".$", {["k"]="č", ["g"]="ž", ["x"]="š", ["c"]="č", ["ś"]="š"}},
	}
	for _, stage in ipairs(stages) do
		stem = rsub(stem, stage[1], stage[2])
	end
	return stem
end
-- Replace a final k, g or x of STEM with its second-palatalization outcome
-- (c, dz and ś respectively). Any other final character is left unchanged.
function export.second_palatalization(stem)
	local outcomes = {["k"]="c", ["g"]="dz", ["x"]="ś"}
	return rsub(stem, ".$", outcomes)
end
-- Iotate the end of STEM: replace its final consonant or consonant cluster
-- with the iotated outcome (labials get an inserted ľ, e.g. b -> bľ). If the
-- result does not end in one of the iotated consonants (for instance because
-- the stem ends in a vowel), append "j" instead.
function export.iotate(stem)
	local final_three = {["zdz"]="ždž"}
	local final_two = {["sk"]="šč", ["zg"]="ždž", ["dz"]="ž", ["sc"]="šč"}
	local final_one = {
		["b"]="bľ",
		["c"]="č",
		["d"]="ď",
		["g"]="ž",
		["k"]="č",
		["l"]="ľ",
		["m"]="mľ",
		["n"]="ň",
		["p"]="pľ",
		["r"]="ř",
		["s"]="š",
		["ś"]="š",
		["t"]="ť",
		["v"]="vľ",
		["x"]="š",
		["z"]="ž",
	}
	-- Longest clusters first, then the single final consonant.
	stem = rsub(stem, "...$", final_three)
	stem = rsub(stem, "..$", final_two)
	stem = rsub(stem, ".$", final_one)
	if rfind(stem, iotated_cons_c .. "$") then
		return stem
	end
	return stem .. "j"
end
-- Tell whether WORD (which must already be decomposed) carries at least one
-- stress accent. The result is whatever rfind() returns: a truthy match
-- position if an accent is found, nil otherwise.
function export.is_stressed(word)
	export.assert_decomposed(word)
	local any_stress_accent = stressed_accents_c
	return rfind(word, any_stress_accent)
end
-- Return WORD (which must already be decomposed) with every stress accent
-- deleted. Macrons are not stress accents and are kept.
function export.make_unstressed(word)
	export.assert_decomposed(word)
	local stripped = rsub(word, stressed_accents_c, "")
	return stripped
end
-- Tell whether WORD (which must already be decomposed) contains no vowel
-- letters at all. The result is whatever rfind() returns: a truthy match
-- position if the word is nonsyllabic, nil otherwise.
function export.is_nonsyllabic(word)
	export.assert_decomposed(word)
	local no_vowels_at_all = "^" .. non_vowels_c .. "*$"
	return rfind(word, no_vowels_at_all)
end
-- Tell whether WORD (which must already be decomposed) contains exactly one
-- vowel letter. The result is whatever rfind() returns: a truthy match
-- position if the word is monosyllabic, nil otherwise.
function export.is_monosyllabic(word)
	export.assert_decomposed(word)
	local any_non_vowels = non_vowels_c .. "*"
	local exactly_one_vowel = "^" .. any_non_vowels .. vowels_c .. any_non_vowels .. "$"
	return rfind(word, exactly_one_vowel)
end
-- Set the accent in STEM to ACCENT, replacing any stressed accent already
-- there. STEM must already be decomposed. Returns the modified stem.
--
-- If there isn't such an accent already then:
-- (1) If the accent is inverse breve (= old circumflex or short accent), put
-- it on the first syllable. (The same is done for any other accent not
-- covered by (2) and (3), e.g. a macron.)
-- (2) If the accent is tilde (= neoacute), put it on the last syllable;
-- (3) If the accent is a single grave (= old acute), put it on the vowel if
-- there's only one, otherwise don't add it as it can go anywhere.
-- Placing the accent will replace any unstressed accent already there
-- (specifically the macron).
--
-- Two values of ACCENT are special:
-- * "-" removes all accents from STEM, including macrons, and nothing else
-- is done.
-- * Double grave (= old short accent) must not be passed explicitly; doing so
-- raises an error. It only arises through conversion (2) below, from an
-- inverse breve.
--
-- In addition, if the accent is tilde (= neoacute), we put the accent on the
-- last syllable of the stem, regardless of any existing accent. The logic here
-- is that, in nouns at least, a neoacute on the stem that we request (i.e. not
-- already in the stem) is always retracted from the ending, and thus should
-- go on the last syllable if there is more than one. FIXME: May not apply to
-- verbs.
--
-- Also apply certain conversions to the result:
-- (1) Original short vowels e o ь ъ can't get a macron. Per Derksen 2008,
-- this also includes liquid diphthongs, which normally behave like
-- long vowels; cf. 'borzdà' "burrow" in class b, where you expect the
-- preceding vowel to be long if possible. However, we go against
-- Derksen in this respect when the first vowel is e or o because Czech,
-- Slovak and Polish show clear length distinctions (or reflections thereof)
-- in original pre-tonic syllables in class b vs. c. (Serbo-Croat reflects
-- length in both classes but this can be a later development due to
-- analogy.) Per Kortlandt, the metathesis of liquid diphthongs preceded
-- Dybo's law and (probably) the shortening of pre-tonic vowels.
-- (2) Original long vowels a i u y ě ę ǫ can't get a double grave, nor can
-- liquid diphthongs; instead, convert to inverse breve (circumflex accent).
-- (3) Original short vowels e o ь ъ not in liquid diphthongs can't receive a
-- tilde (neoacute) per the May 2019 discussion in
-- [[Wiktionary talk:About Proto-Slavic#Use the traditional accent symbols]];
-- instead we convert to single grave.
function export.set_accent(stem, accent)
export.assert_decomposed(stem)
-- string containing a hyphen is the value of UNK = unknown, and removes
-- all accents including macrons
if accent == "-" then
return rsub(stem, accents_c, "")
end
if accent == DOUBLEGR then
error("Double grave should not be specified as an accent; use inverted breve instead")
end
if not export.is_stressed(stem) and accent ~= TILDE and
(accent ~= GR or export.is_monosyllabic(stem)) then
-- If no stressed accent, put one on the first syllable, removing any
-- non-stress accent, i.e. macron (it doesn't matter which accent we put
-- as long as it's a stress accent, as it will be overwritten in the
-- next clause). But don't do this if accent is a tilde (no point, it
-- will be ignored and removed in the next clause), and if the accent is
-- a grave, only do this if the stem is monosyllabic.
-- The lazy ".-" makes the pattern stop at the first vowel.
stem = rsub(stem, "^(.-" .. vowels_c .. ")" .. accents_c .. "*",
"%1" .. INVBREVE)
end
if accent == TILDE then
-- If a tilde, cancel out any existing stressed accent and put the tilde
-- on the last syllable. (FIXME, might not apply to verbs.) Later on
-- we will convert this to a single grave if it's on a short monophthong.
stem = export.make_unstressed(stem)
-- The greedy ".*" makes the pattern run up to the last vowel.
stem = rsub(stem, "^(.*" .. vowels_c .. ")" .. accents_c .. "*",
"%1" .. TILDE)
else
-- Otherwise just replace the stressed accent, if any, with the given
-- accent. There will always be such an accent except in multisyllabic
-- words where the accent is a single grave; in other circumstances
-- we added an accent on the first syllable if it was missing.
stem = rsub(stem, stressed_accents_c .. "+", accent)
end
if accent == MACRON then
-- hack to handle liquid diphthongs: generate two macrons, since the
-- following regex will remove one.
stem = rsub(stem, "([eo])" .. MACRON .. "([lr]" .. cons_c .. ")",
"%1" .. MACRON .. MACRON .. "%2")
-- Drop a macron that directly follows a short vowel.
stem = rsub(stem, "(" .. short_vowels_c .. ")" .. MACRON, "%1")
end
-- Convert inverse breve after short vowel not in liquid diphthong to
-- double grave.
if rfind(stem, short_vowels_c .. INVBREVE) and
not rfind(stem, short_vowels_c .. INVBREVE .. "[lr]" .. cons_c) then
stem = rsub(stem, INVBREVE, DOUBLEGR)
end
-- Convert tilde after short vowel not in liquid diphthong to single grave.
if rfind(stem, short_vowels_c .. TILDE) and
not rfind(stem, short_vowels_c .. TILDE .. "[lr]" .. cons_c) then
stem = rsub(stem, TILDE, GR)
end
return stem
end
-- Work out how the unstressed WORD should be accented under accent pattern AP
-- ("a", "b" or "c"). WORD is split into stem, desinence and final accent, and
-- the result is a list of candidate accentuations, each a three-entry list
-- {stem, desinence, final_accent}. The list is empty when nothing can be
-- inferred (pattern a with a non-monosyllabic stem, or an AP other than
-- a/b/c) and has two entries when two accentuations are possible (*voľa-type
-- nouns in pattern b). Fails an assertion if WORD is already stressed.
local function infer_accent(word, ap)
	assert(not export.is_stressed(word))
	local stem, desinence, final_accent = export.split_stem_desinence(word)
	local candidates = {}

	-- Record one candidate; the desinence is the same for all of them.
	local function add(cand_stem, cand_final_accent)
		candidates[#candidates + 1] = {cand_stem, desinence, cand_final_accent}
	end

	if ap == "a" then
		-- The stress position is only known if the stem has a single vowel.
		if export.is_monosyllabic(stem) then
			add(export.set_accent(stem, GR), final_accent)
		end
	elseif ap == "b" then
		local nonsyllabic = export.is_nonsyllabic(stem)
		if not nonsyllabic and (desinence == "ь" or desinence == "ъ") then
			-- Yer ending: neoacute on the stem.
			add(export.set_accent(stem, TILDE), final_accent)
		else
			-- Stress on the ending.
			add(stem, GR)
			if not nonsyllabic and desinence == "a" and rfind(stem, iotated_cons_c .. "$") then
				-- *voľa-type accent
				add(export.set_accent(stem, TILDE), final_accent)
			end
		end
	elseif ap == "c" then
		if export.is_nonsyllabic(stem) then
			add(stem, INVBREVE)
		elseif desinence == "a" then
			add(stem, GR)
		else
			add(export.set_accent(stem, INVBREVE), final_accent)
		end
	end
	return candidates
end
-- Accent WORD according to accent pattern AP, or verify an accent that is
-- already there. Always returns three values: STEM, DESINENCE, FINAL_ACCENT.
--
-- * If no accentuation can be inferred for AP (e.g. accent pattern a with a
--   multisyllabic stem), WORD is simply split as is, without any check.
-- * If WORD is unstressed, the first inferred accentuation is returned.
-- * If WORD is stressed, the accentuations inferred for its unstressed form
--   are compared against WORD; the matching one is returned, and an error
--   listing the expected forms is thrown if none matches.
function export.auto_accent_and_check_accents(word, ap)
	local already_stressed = export.is_stressed(word)
	local base = word
	if already_stressed then
		base = export.make_unstressed(word)
	end

	local candidates = infer_accent(base, ap)
	if #candidates == 0 then
		return export.split_stem_desinence(word)
	end

	if not already_stressed then
		local best = candidates[1]
		return best[1], best[2], best[3]
	end

	-- Stressed input: it has to coincide with one of the candidates.
	local expected_words = {}
	for _, candidate in ipairs(candidates) do
		local expected = candidate[1] .. candidate[2] .. candidate[3]
		if expected == word then
			return candidate[1], candidate[2], candidate[3]
		end
		expected_words[#expected_words + 1] = expected
	end
	error("For accent pattern " .. ap .. ", accented lemma should look like " ..
		table.concat(expected_words, " or ") .. " but is actually " .. word)
end
-- Split WORD (which must already be decomposed) into three strings:
-- the stem, the desinence (the single character before the optional final
-- accent) and the final accent (the accent at the very end of the word, or
-- "" if there is none). Throws an error if WORD cannot be split this way,
-- e.g. because it is empty.
function export.split_stem_desinence(word)
	export.assert_decomposed(word)
	local split_pattern = "^(.-)(.)(" .. accents_c .. "?)$"
	local stem, desinence, final_accent = rmatch(word, split_pattern)
	if stem and desinence then
		return stem, desinence, final_accent
	end
	error("Something wrong with '" .. word .. "', probably too short")
end
return export