Jump to content

Module:sla-common

From Wiktionary, the free dictionary

This module contains common helper functions for Proto-Slavic that are needed by other modules.


local export = {}

local m_links = require("Module:links")
local m_table_tools = require("Module:table tools")

local lang = require("Module:languages").getByCode("sla-pro")

local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub

-- Combining diacritics, each built from its Unicode code point. The sample
-- after "=" in each trailing comment shows the mark placed on a space.
local AC = u(0x0301) -- acute =  ́
local GR = u(0x0300) -- grave =  ̀
local CFLEX = u(0x0302) -- circumflex =  ̂
local TILDE = u(0x0303) -- tilde =  ̃
local BREVE = u(0x0306) -- breve =  ̆
local INVBREVE = u(0x0311) -- inverse breve =  ̑
local DOUBLEAC = u(0x030B) -- double acute =  ̋
local DOUBLEGR = u(0x030F) -- double grave =  ̏
local MACRON = u(0x0304) -- macron =  ̄
local CARON = u(0x030C) -- caron =  ̌
local OGONEK = u(0x0328) -- ogonek =  ̨

-- Naming convention: a plain name holds the bare characters, and the
-- corresponding "_c" name wraps them in [...] so it can be spliced into a
-- pattern as a character class.

-- Accents treated as marking stress. Note that the circumflex is not listed.
local stressed_accents = AC .. GR .. INVBREVE .. DOUBLEGR .. DOUBLEAC .. TILDE
local stressed_accents_c = "[" .. stressed_accents .. "]"
-- All accents: the stress accents plus the macron.
local accents = stressed_accents .. MACRON
local accents_c = "[" .. accents .. "]"
-- Vowel letters (lowercase only).
local vowels = "aeiouyьъěęǫ"
local vowels_c = "[" .. vowels .. "]"
-- Any character that is not one of the vowel letters; this class also
-- matches the combining accents.
local non_vowels_c = "[^" .. vowels .. "]"
local short_vowels = "eoьъ"
local short_vowels_c = "[" .. short_vowels .. "]"
local long_vowels = "aiuyěęǫ"
local long_vowels_c = "[" .. long_vowels .. "]"
-- Any character that is neither a vowel letter nor one of the accents.
local cons_c = "[^" .. vowels .. accents .. "]"
-- Consonants that iotate() accepts as an already-iotated stem ending.
local iotated_cons = "čďjľňřšťž"
local iotated_cons_c = "[" .. iotated_cons .. "]"

-- Single-result wrapper around rsubn(): performs the substitution and returns
-- only the resulting string, dropping the replacement count.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end

-- Wrap FORM, prefixed with an asterisk, in an HTML element carrying the
-- sla-pro language attribute. TAG names the element and defaults to "span".
-- An empty FORM yields an em-dash entity instead.
function export.tag_form(form, tag)
	if form == "" then
		return "&mdash;"
	end
	local element = tag or "span"
	return "<" .. element .. " lang=\"sla-pro\" class=\"Unicode\">*" .. form .. "</" .. element .. ">"
end

-- Make a link out of a form, or show a dash if empty.
--
-- FORM may be:
--   * a string: linked as "*" .. FORM, or rendered as an em dash if empty;
--   * a table with a `notesym` field: FORM[1] is linked and the note symbols
--     are appended as superscript;
--   * a table without `notesym`: treated as a list of forms, each of which is
--     linked recursively and the results joined with ", ".
-- TAG is only forwarded to the recursive calls; it does not affect the
-- generated link.
function export.link_form(form, tag)
	if type(form) == "table" then
		if not form.notesym then
			local retval = {}
			for _, subform in ipairs(form) do
				table.insert(retval, export.link_form(subform, tag))
			end
			return table.concat(retval, ", ")
		else
			return m_links.full_link({ lang = lang, term = "*" .. form[1] }) .. m_table_tools.superscript_notes(form.notesym)
		end
	elseif form ~= "" then
		return m_links.full_link({ lang = lang, term = "*" .. form })
	else
		return "&mdash;"
	end
end

-- Lookup table mapping "base letter + combining caron/ogonek" sequences to the
-- precomposed letter. Built from {base, mark, composed} triples; the bases
-- are Latin letters in both lowercase and uppercase.
local recomposer = {}
for _, triple in ipairs({
	{ "e", CARON, "ě" }, { "E", CARON, "Ě" },
	{ "e", OGONEK, "ę" }, { "E", OGONEK, "Ę" },
	{ "o", OGONEK, "ǫ" }, { "O", OGONEK, "Ǫ" },
	{ "c", CARON, "č" }, { "C", CARON, "Č" },
	{ "d", CARON, "ď" }, { "D", CARON, "Ď" },
	{ "l", CARON, "ľ" }, { "L", CARON, "Ľ" },
	{ "n", CARON, "ň" }, { "N", CARON, "Ň" },
	{ "r", CARON, "ř" }, { "R", CARON, "Ř" },
	{ "s", CARON, "š" }, { "S", CARON, "Š" },
	{ "t", CARON, "ť" }, { "T", CARON, "Ť" },
	{ "z", CARON, "ž" }, { "Z", CARON, "Ž" },
}) do
	recomposer[triple[1] .. triple[2]] = triple[3]
end

-- Split accented letters (acute, grave, etc.) into an individual character
-- followed by a combining accent. Full NFD decomposition also splits letters
-- with caron or ogonek, which we want to keep as units, so any
-- "character + caron/ogonek" pair listed in `recomposer` is put back together.
-- Pairs not listed there are left decomposed.
function export.decompose(text)
	local unit_pattern = ".[" .. CARON .. OGONEK .. "]"
	return rsub(mw.ustring.toNFD(text), unit_pattern, recomposer)
end

-- Same as export.decompose(), followed by a canonicalization step: any
-- combining circumflex is rewritten as an inverse breve, in case the
-- circumflex was used by accident.
function export.canon_decompose(text)
	local decomposed = export.decompose(text)
	return rsub(decomposed, CFLEX, INVBREVE)
end

-- Fail an assertion if TEXT contains any of the listed precomposed accented
-- vowels (acute, grave, tilde, macron, inverted breve or double grave, in
-- either case), i.e. if it has evidently not been passed through decompose().
function export.assert_decomposed(text)
	local precomposed_c = "[áéíóúýàèìòùỳãẽĩõũỹāēīōūȳȃȇȋȏȗȁȅȉȍȕÁÉÍÓÚÝÀÈÌÒÙỲÃẼĨÕŨỸĀĒĪŌŪȲȂȆȊȎȖȀȄȈȌȔ]"
	local offending = rfind(text, precomposed_c)
	assert(not offending)
end

-- Rewrite the final consonant(s) of STEM according to the tables below.
-- Rules are tried longest cluster first (three characters, then two, then
-- one); an ending that is not a key in the table is left unchanged.
function export.first_palatalization(stem)
	local rules = {
		{ "...$", { ["zdz"] = "ždž" } },
		{ "..$", { ["sk"] = "šč", ["zg"] = "ždž", ["dz"] = "ž", ["sc"] = "šč" } },
		{ ".$", { ["k"] = "č", ["g"] = "ž", ["x"] = "š", ["c"] = "č", ["ś"] = "š" } },
	}
	for _, rule in ipairs(rules) do
		stem = rsub(stem, rule[1], rule[2])
	end
	return stem
end

-- Rewrite the final character of STEM: k -> c, g -> dz, x -> ś. Any other
-- final character is left unchanged.
function export.second_palatalization(stem)
	local outcomes = { ["k"] = "c", ["g"] = "dz", ["x"] = "ś" }
	local palatalized = rsub(stem, ".$", outcomes)
	return palatalized
end

-- Iotate the end of STEM. The final consonant(s) are rewritten using the
-- tables below, longest cluster first; endings not listed are left alone.
-- If the result does not end in one of the iotated consonants (for example
-- because the stem ends in a vowel), "j" is appended instead.
function export.iotate(stem)
	local rules = {
		{ "...$", { ["zdz"] = "ždž" } },
		{ "..$", { ["sk"] = "šč", ["zg"] = "ždž", ["dz"] = "ž", ["sc"] = "šč" } },
		{ ".$", {
			["b"] = "bľ",
			["c"] = "č",
			["d"] = "ď",
			["g"] = "ž",
			["k"] = "č",
			["l"] = "ľ",
			["m"] = "mľ",
			["n"] = "ň",
			["p"] = "pľ",
			["r"] = "ř",
			["s"] = "š",
			["ś"] = "š",
			["t"] = "ť",
			["v"] = "vľ",
			["x"] = "š",
			["z"] = "ž",
		} },
	}
	for _, rule in ipairs(rules) do
		stem = rsub(stem, rule[1], rule[2])
	end

	if rfind(stem, iotated_cons_c .. "$") then
		return stem
	end
	return stem .. "j"
end

-- Test whether WORD carries a stress accent. WORD must already be decomposed.
-- Returns the result of the search: the match position(s) if a stress accent
-- is present (truthy), nil otherwise.
function export.is_stressed(word)
	export.assert_decomposed(word)
	return rfind(word, "[" .. stressed_accents .. "]")
end

-- Return WORD with every stress accent deleted. WORD must already be
-- decomposed. The macron is not a stress accent and is kept.
function export.make_unstressed(word)
	export.assert_decomposed(word)
	local stress_mark_c = "[" .. stressed_accents .. "]"
	return rsub(word, stress_mark_c, "")
end

-- Test whether WORD is nonsyllabic, i.e. contains no vowel letter at all
-- (the empty string qualifies). WORD must already be decomposed.
function export.is_nonsyllabic(word)
	export.assert_decomposed(word)
	local only_non_vowels = "^[^" .. vowels .. "]*$"
	return rfind(word, only_non_vowels)
end

-- Test whether WORD is monosyllabic, i.e. contains exactly one vowel letter,
-- optionally surrounded by non-vowel characters. WORD must already be
-- decomposed.
function export.is_monosyllabic(word)
	export.assert_decomposed(word)
	local margin = non_vowels_c .. "*"
	return rfind(word, "^" .. margin .. vowels_c .. margin .. "$")
end

-- Set the accent in STEM to ACCENT, replacing any stressed accent already
-- there. If there isn't such an accent already then:
-- (1) If the accent is inverse breve (= old circumflex or short accent) or
--     double grave (= old short accent), put it on the first syllable;
-- (2) If the accent is tilde (= neoacute), put it on the last syllable;
-- (3) If the accent is a single grave (= old acute), put it on the vowel if
--     there's only one, otherwise don't add it as it can go anywhere.
-- Placing the accent will replace any unstressed accent already there
-- (specifically the macron).
--
-- In addition, if the accent is tilde (= neoacute), we put the accent on the
-- last syllable of the stem, regardless of any existing accent. The logic here
-- is that, in nouns at least, a neoacute on the stem that we request (i.e. not
-- already in the stem) is always retracted from the ending, and thus should
-- go on the last syllable if there is more than one. FIXME: May not apply to
-- verbs.
--
-- Also apply certain conversions to the result:
-- (1) Original short vowels e o ь ъ can't get a macron. Per Derksen 2008,
--     this also includes liquid diphthongs, which normally behave like
--     long vowels; cf. 'borzdà' "burrow" in class b, where you expect the
--     preceding vowel to be long if possible. However, we go against
--     Derksen in this respect when the first vowel is e or o because Czech,
--     Slovak and Polish show clear length distinctions (or reflections thereof)
--     in original pre-tonic syllables in class b vs. c. (Serbo-Croat reflects
--     length in both classes but this can be a later development due to
--     analogy.) Per Kortlandt, the metathesis of liquid diphthongs preceded
--     Dybo's law and (probably) the shortening of pre-tonic vowels.
-- (2) Original long vowels a i u y ě ę ǫ can't get a double grave, nor can
--     liquid diphthongs; instead, convert to inverse breve (circumflex accent).
-- (3) Original short vowels e o ь ъ not in liquid diphthongs can't receive a
--     tilde (neoacute) per the May 2019 discussion in
--     [[Wiktionary talk:About Proto-Slavic#Use the traditional accent symbols]];
--     instead we convert to single grave.
--
-- Parameters:
--   STEM: the stem to modify; must already be decomposed (asserted).
--   ACCENT: a combining accent character, or the string "-" to strip every
--     accent (including macrons) from STEM.
-- Returns the modified stem.
-- Throws an error if ACCENT is the double grave.
function export.set_accent(stem, accent)
	export.assert_decomposed(stem)
	-- string containing a hyphen is the value of UNK = unknown, and removes
	-- all accents including macrons
	if accent == "-" then
		return rsub(stem, accents_c, "")
	end
	-- The double grave is only ever produced below by conversion from the
	-- inverse breve; callers must not request it directly.
	if accent == DOUBLEGR then
		error("Double grave should not be specified as an accent; use inverted breve instead")
	end
	if not export.is_stressed(stem) and accent ~= TILDE and
		(accent ~= GR or export.is_monosyllabic(stem)) then
		-- If no stressed accent, put one on the first syllable, removing any
		-- non-stress accent, i.e. macron (it doesn't matter which accent we put
		-- as long as it's a stress accent, as it will be overwritten in the
		-- next clause). But don't do this if accent is a tilde (no point, it
		-- will be ignored and removed in the next clause), and if the accent is
		-- a grave, only do this if the stem is monosyllabic.
		-- The lazy ".-" makes the capture stop at the first vowel.
		stem = rsub(stem, "^(.-" .. vowels_c .. ")" .. accents_c .. "*",
			"%1" .. INVBREVE)
	end
	if accent == TILDE then
		-- If a tilde, cancel out any existing stressed accent and put the tilde
		-- on the last syllable. (FIXME, might not apply to verbs.) Later on
		-- we will convert this to a single grave if it's on a short monophthong.
		stem = export.make_unstressed(stem)
		-- The greedy ".*" makes the capture extend to the last vowel.
		stem = rsub(stem, "^(.*" .. vowels_c .. ")" .. accents_c .. "*",
			"%1" .. TILDE)
	else
		-- Otherwise just replace the stressed accent, if any, with the given
		-- accent. There will always be such an accent except in multisyllabic
		-- words where the accent is a single grave; in other circumstances
		-- we added an accent on the first syllable if it was missing.
		stem = rsub(stem, stressed_accents_c .. "+", accent)
	end
	if accent == MACRON then
		-- hack to handle liquid diphthongs: generate two macrons, since the
		-- following regex will remove one.
		stem = rsub(stem, "([eo])" .. MACRON .. "([lr]" .. cons_c .. ")",
			"%1" .. MACRON .. MACRON .. "%2")
		-- Strip one macron after every short vowel; e/o in a liquid diphthong
		-- was doubled above and therefore keeps a single macron.
		stem = rsub(stem, "(" .. short_vowels_c .. ")" .. MACRON, "%1")
	end
	-- Convert inverse breve after short vowel not in liquid diphthong to
	-- double grave.
	if rfind(stem, short_vowels_c .. INVBREVE) and
		not rfind(stem, short_vowels_c .. INVBREVE .. "[lr]" .. cons_c) then
		stem = rsub(stem, INVBREVE, DOUBLEGR)
	end
	-- Convert tilde after short vowel not in liquid diphthong to single grave.
	if rfind(stem, short_vowels_c .. TILDE) and
		not rfind(stem, short_vowels_c .. TILDE .. "[lr]" .. cons_c) then
		stem = rsub(stem, TILDE, GR)
	end
	return stem
end

-- Work out where the accent goes in the unstressed WORD under accent pattern
-- AP ("a", "b" or "c"). The result is a list of candidates, each a
-- three-entry list {stem, desinence, final_accent}. The list is empty when
-- nothing can be inferred (including for an unrecognized AP) and has more than
-- one entry when several accentuations are possible, e.g. with *voľa-type
-- nouns. Asserts that WORD carries no stress accent.
local function infer_accent(word, ap)
	assert(not export.is_stressed(word))
	local stem, desinence, final_accent = export.split_stem_desinence(word)
	local candidates = {}

	-- Record one candidate built from the given stem and ending accent.
	local function add(cand_stem, cand_accent)
		candidates[#candidates + 1] = { cand_stem, desinence, cand_accent }
	end

	if ap == "a" then
		-- Only a monosyllabic stem leaves no doubt about which vowel to mark.
		if export.is_monosyllabic(stem) then
			add(export.set_accent(stem, GR), final_accent)
		end
	elseif ap == "b" then
		if export.is_nonsyllabic(stem) then
			add(stem, GR)
		elseif desinence == "ь" or desinence == "ъ" then
			add(export.set_accent(stem, TILDE), final_accent)
		else
			add(stem, GR)
			if desinence == "a" and rfind(stem, iotated_cons_c .. "$") then
				-- *voľa-type accent
				add(export.set_accent(stem, TILDE), final_accent)
			end
		end
	elseif ap == "c" then
		if export.is_nonsyllabic(stem) then
			add(stem, INVBREVE)
		elseif desinence == "a" then
			add(stem, GR)
		else
			add(export.set_accent(stem, INVBREVE), final_accent)
		end
	end

	return candidates
end

-- Accent WORD automatically, or validate the accent it already has, for the
-- accent pattern AP.
--
-- * WORD unstressed: return the first accentuation inferred for AP. If none
--   can be inferred (e.g. accent pattern a with a multisyllabic stem), WORD is
--   simply split and returned without a stress accent.
-- * WORD stressed: the accentuations inferred from its unstressed form are
--   compared with WORD; the matching one is returned, and an error is thrown
--   if none matches. If nothing can be inferred, WORD is split and returned
--   as is.
--
-- Returns three values, STEM, DESINENCE and FINAL_ACCENT. For a stressed WORD
-- these concatenate back to WORD itself; for an unstressed WORD they
-- concatenate to the accented form when one could be inferred.
function export.auto_accent_and_check_accents(word, ap)
	local has_stress = export.is_stressed(word)
	local bare_word = word
	if has_stress then
		bare_word = export.make_unstressed(word)
	end

	local candidates = infer_accent(bare_word, ap)
	if #candidates == 0 then
		return export.split_stem_desinence(word)
	end

	if not has_stress then
		local chosen = candidates[1]
		return chosen[1], chosen[2], chosen[3]
	end

	-- Stressed input: it must coincide with one of the inferred forms.
	local expected_words = {}
	for i, candidate in ipairs(candidates) do
		local rebuilt = candidate[1] .. candidate[2] .. candidate[3]
		if rebuilt == word then
			return candidate[1], candidate[2], candidate[3]
		end
		expected_words[i] = rebuilt
	end
	error("For accent pattern " .. ap .. ", accented lemma should look like " ..
		table.concat(expected_words, " or ") .. " but is actually " .. word)
end

-- Split WORD into three parts: STEM (everything before the last character),
-- DESINENCE (a single character) and FINAL_ACCENT (an optional accent at the
-- very end, "" if absent). WORD must already be decomposed. Throws an error
-- if WORD cannot be split this way, e.g. when it is empty.
function export.split_stem_desinence(word)
	export.assert_decomposed(word)
	local split_pattern = "^(.-)(.)(" .. accents_c .. "?)$"
	local stem, desinence, final_accent = rmatch(word, split_pattern)
	if stem and desinence then
		return stem, desinence, final_accent
	end
	error("Something wrong with '" .. word .. "', probably too short")
end

return export