Module:en-utilities

The following documentation is located at Module:en-utilities/documentation. ^[edit]

Useful links: subpage list • links • transclusions • testcases • sandbox

Provides some utility functions for manipulating English-language words, such as pluralizing, singularizing, deriving the correct indefinite article, etc.

Functions

export.remove_possessive

function export.remove_possessive(stem)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.add_suffix

function export.add_suffix(term, suffix, pos)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.pluralize

function export.pluralize(str)

Pluralize a word in a smart fashion, according to normal English rules.

If the word ends in a consonant or "qu" + "-y", replace "-y" with "-ies".
If the word ends in "s", "x", "z", "ch", "sh" or "zh", add "-es".
Otherwise, add "-s".

This handles links correctly:

If a piped link, change the second part appropriately.
If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.

export.is_regular_plural

function export.is_regular_plural(plural, term, pos)

Returns true if plural is an expected, regular plural of term. The optional parameter pos can be used to specify the part of speech, which is necessary because proper nouns do not change a "-y" suffix to "-ies" (e.g. "Abby" → "Abbys"). By default, pos is set to "noun". In addition to "proper noun", it can also take the special value "noun+", which means that the function will first attempt the check with the "noun" setting, and will then attempt it with the "proper noun" setting iff the term begins with a capital letter.

export.singularize

function export.singularize(str)

Singularize a word in a smart fashion, according to normal English rules. Works analogously to pluralize().

NOTE: This doesn't always work as well as pluralize(). Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".

If word ends in -ies, replace -ies with -y.
If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
Otherwise, remove -s.

This handles links correctly:

If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
If a non-piped link, singularize the link.
A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the 'sh' etc. and the final -es.

export.get_indefinite_article

function export.get_indefinite_article(str, ucfirst)

Return the appropriate indefinite article to prefix to str. Correctly handles links and capitalized text. Does not correctly handle words like union, uniform and university that take "a" despite beginning with a 'u'. The returned article will have its first letter capitalized if ucfirst is specified, otherwise lowercase.

export.add_indefinite_article

function export.add_indefinite_article(text, ucfirst)

Prefix text with the appropriate indefinite article. Correctly handles links and capitalized text. Does not correctly handle words like union, uniform and university that take "a" despite beginning with a 'u'. The returned article will have its first letter capitalized if ucfirst is specified, otherwise lowercase.

local export = {}

local add_suffix -- Defined below.
local find = string.find
local is_regular_plural -- Defined below.
local match = string.match
local remove_possessive -- Defined below.
local reverse = string.reverse
local sub = string.sub
local toNFD = mw.ustring.toNFD
local ugsub = mw.ustring.gsub
local ulower = mw.ustring.lower
local umatch = mw.ustring.match
local usub = mw.ustring.sub
local uupper = mw.ustring.upper

-- Characters treated as vowels by the pattern checks in this module. The
-- string is spliced into "[...]" character classes, so it is a plain list of
-- characters; note that it includes "y" and "@".
local vowels = "aæᴀᴁɐɑɒ@eᴇǝⱻəɛɘɜɞɤiıɪɨᵻoøœᴏɶɔᴐɵuᴜʉᵾɯꟺʊʋʌyʏ"
-- Hyphen and dash characters, likewise meant for use inside a character
-- class; the ASCII hyphen is written "%-" so that it is taken literally there.
local hyphens = "%-‐‑‒–—"

--[==[
Lazy loaders. Each one fills in a module-level variable the first time it is needed, so that callers can write
"foo or get_foo()": while "foo" is unset the loader runs, stores the value in "foo" and returns it; after that "foo"
is truthy, so the loader is never reached again and no separate existence check is needed.]==]
local diacritics
local function get_diacritics()
	-- `diacritics_all` is presumably a character class of combining
	-- diacritics (TODO confirm against Module:headword/data); the appended
	-- "+" makes the pattern match a whole run of them at once.
	local comb_chars = mw.loadData("Module:headword/data").page.comb_chars
	diacritics = comb_chars.diacritics_all .. "+"
	-- The loader is single-use, so drop the reference to it.
	get_diacritics = nil
	return diacritics
end

-- Normalize a string, so that case and diacritics are ignored. By default,
-- "gu" and "qu" before a vowel are reduced to "g" and "q", because they behave
-- like consonants under certain conditions (e.g. final "y" does not usually
-- have the plural "ies" after a vowel, but it's regular for "quy" to become
-- "quies"). `followed_by` is the text that will come after `str`: it only
-- serves as context for the "gu"/"qu" check and is cut off again before the
-- result is returned. The flag `not_gu` prevents the reduction happening to
-- "gu", and is needed because terms ending "-guy" are almost always compounds
-- of "guy" (→ "guys").
local function normalize(str, followed_by, not_gu)
	followed_by = followed_by or ""
	local consonants = not_gu and "Qq" or "GgQq"
	local pattern = "([" .. consonants .. "])u([" .. vowels .. "])"
	local reduced = ugsub(toNFD(str) .. followed_by, pattern, "%1%2")
	-- Cut the context back off before stripping diacritics and lowercasing.
	local body = sub(reduced, 1, #reduced - #followed_by)
	local stripped = ugsub(body, diacritics or get_diacritics(), "")
	return ulower(stripped)
end

-- Default test for whether a linking "e" goes between the stem and the
-- suffix: it does unless the stem already ends in "e".
local function epenthetic_e_default(stem)
	local last = sub(stem, -1)
	return not (last == "e")
end

-- Decides whether the "-s" suffixes need an epenthetic "e" (i.e. "-es").
-- `stem` is the stem the suffix will be attached to, and `term` is the term
-- it was derived from. Returns a truthy value if "e" should be inserted and a
-- falsy one otherwise (the value is not necessarily a boolean).
local function epenthetic_e_for_s(stem, term)
	-- If the stem is different, it must be from "y" → "i".
	if stem ~= term then
		return true
	end
	local final
	-- A pure-ASCII stem can be inspected bytewise. Anything else is
	-- decomposed and stripped of diacritics first, and its last character is
	-- then taken with the Unicode-aware usub().
	if match(stem, "^[^\128-\255]*$") then
		final = sub(stem, -1)
	else
		stem = ugsub(toNFD(stem), diacritics or get_diacritics(), "")
		final = usub(stem, -1)
	end
	-- Epenthetic "e" is added after a sibilant or sibilant-affricate. The vast
	-- majority of these are spelled "s", "x", "z", "ch" and "sh", but "dg"
	-- (→ "dge") and "ß" (→ "ss") can be found in obsolete spellings, "shh" in
	-- onomatopoeia, and "zh", "dj", "jj" (and more) in loanwords.
	-- The "u" test only matches a "u" that is not preceded by a word character
	-- or an apostrophe, i.e. one standing on its own as the last word.
	-- NOTE(review): that case is not covered by the explanation above;
	-- presumably it is for the letter name "u" — confirm.
	return (
		final == "g" and sub(stem, -2, -2) == "d" or
		final == "h" and match(stem, "[csz]h+$") or
		final == "j" and umatch(stem, "[^" .. vowels .. "]j$") or
		final == "s" or
		final == "u" and umatch(stem, "%f[%w']u$") or
		final == "x" or
		final == "z" or
		final == "ß"
	)
end

--[==[
Removes a possessive ending from `stem`: a final {"'s"} is dropped entirely, and a final apostrophe is dropped if it
follows an "s" (e.g. {"dog's"} → {"dog"}, {"dogs'"} → {"dogs"}). If neither ending is present, `stem` is returned
unchanged.
]==]
function export.remove_possessive(stem)
	local base = match(stem, "^(.*)'s$")
	if base then
		return base
	end
	base = match(stem, "^(.*s)'$")
	if base then
		return base
	end
	return stem
end
remove_possessive = export.remove_possessive

-- Spelling rules for each suffix that add_suffix() understands, keyed by the
-- suffix itself plus, where one spelling has several uses, a "." qualifier
-- (e.g. "s.plural" vs. "s.verb"). The fields, as read by add_suffix(), are:
-- * final_y_is_i: a final "y" may become "i" (never for proper nouns).
-- * final_ey_is_i, final_y_is_i_after_vowel, final_guy_is_gui: options for
--   that "y" → "i" conversion (the last one is passed on negated, as `not_gu`).
-- * remove_silent_e: a final silent "e" is dropped.
-- * final_consonant_is_doubled: the final consonant may be doubled when the
--   suffix begins with a vowel.
-- * epenthetic_e: function(stem, term) saying whether "e" is inserted before
--   the suffix.
-- * modifies_possessive: a possessive ending on the term is removed first and
--   re-added after the suffix.
-- * truncated: function(stem) returning the text to append in place of the
--   suffix.
local suffixes
do
	-- Rule set shared by "d", "dst", "st.verb" and "th".
	local d_rules = {
		final_y_is_i = true,
		final_consonant_is_doubled = true,
		epenthetic_e = epenthetic_e_default,
	}

	-- Rule set shared by "r" and "st.superlative".
	local r_rules = {
		final_y_is_i = true,
		final_ey_is_i = true,
		final_guy_is_gui = true,
		final_consonant_is_doubled = true,
		epenthetic_e = epenthetic_e_default,
	}

	suffixes = {
		["'s"] = {
			-- After a final "s" only the apostrophe is added.
			truncated = function(stem)
				if sub(stem, -1) == "s" then
					return "'"
				end
				return "'s"
			end,
		},
		["s.plural"] = {
			final_y_is_i = true,
			epenthetic_e = epenthetic_e_for_s,
			modifies_possessive = true,
		},
		["s.verb"] = {
			final_y_is_i = true,
			final_consonant_is_doubled = true,
			epenthetic_e = epenthetic_e_for_s,
		},
		["ing"] = {
			final_consonant_is_doubled = true,
			remove_silent_e = true,
		},
		["d"] = d_rules,
		["dst"] = d_rules,
		["st.verb"] = d_rules,
		["th"] = d_rules,
		["n"] = {
			final_y_is_i = true,
			final_y_is_i_after_vowel = true,
			final_guy_is_gui = true,
			final_consonant_is_doubled = true,
			-- No epenthetic "e" after an "e", or an "i", "r" or "w" preceded
			-- by a vowel.
			epenthetic_e = function(stem)
				if sub(stem, -1) == "e" then
					return false
				end
				return not umatch(normalize(stem), "[" .. vowels .. "][irw]$")
			end,
		},
		["r"] = r_rules,
		["st.superlative"] = r_rules,
	}
end

-- Returns the stem used for suffixes that sometimes convert final "y" into "i",
-- such as "-es" ("-ies"), e.g. "penny" → "penni" ("pennies"). If
-- `final_ey_is_i` is true, final "ey" may also be converted, e.g. "plaguey" →
-- "plagui"; this is needed for "-er" ("-ier") and "-est" ("-iest"). If `not_gu`
-- is true, then normalize() will be called with the `not_gu` flag (see there
-- for more info); this is true in most cases. If `final_y_is_i_after_vowel`
-- is true, a final "y" is converted even when it follows a vowel (e.g. "slay"
-- → "slai", for "slain"). `str` is expected to end in "y" (add_suffix() only
-- calls this for such terms); it is returned unchanged if no rule applies.
local function convert_final_y_to_i(str, not_gu, final_ey_is_i, final_y_is_i_after_vowel)
	local final3 = usub(str, -3)
	-- Special case: treat "eey" as "ee" + "y" (e.g. "treey" → "treeiest").
	-- "oey" and "uey" are usually vowel + "ey", but examples of "oe" + "y" and
	-- "ue" + "y" do also exist: compare "go" → "goey" → "goier" with "doe" →
	-- "doey" → "doeier"; "flu" → "fluey" → "fluiest" and "flue" → "fluey" →
	-- "flueiest" form a theoretically possible minimal pair.
	if final3 == "eey" then
		return sub(str, 1, -2) .. "i"
	end
	local final2 = usub(str, -2)
	-- If `final_ey_is_i` is true, final "-ey" can also be reduced to "i".
	if final_ey_is_i and final2 == "ey" then
		-- Remove "ey" to get the base stem.
		local base_stem = sub(str, 1, -3)
		-- Special case: allow final "-ey" ("potato-ey" → "potato-iest").
		if umatch(final3, "[" .. hyphens .. "]ey") then
			return base_stem .. "i"
		end
		-- Final "ey" becomes "i" iff the term is polysyllabic (e.g. not
		-- "grey"). "ey" is common if the base stem ends in a vowel ("echo →
		-- "echoey"), so the presence of a vowel anywhere in the base stem is
		-- sufficient to deem it polysyllabic. ("echoey" → "echo" → "echoiest",
		-- "beigey" → "beig" → "beigiest", but "grey" → "gr" → "greyest"). The
		-- first "y" in "-yey" can be treated as a vowel as long as it's
		-- preceded by something ("clayey" → "clay" → "clayiest", "cryey" →
		-- "cry" → "cryiest", but "*yey" → "*y" → "*yeyest"), so it needs to be
		-- treated as a special case.
		local normalized = normalize(base_stem, "ey")
		if sub(normalized, -1) == "y" then
			if umatch(normalized, "[%w@][yY]$") then
				return base_stem .. "i"
			end
		elseif umatch(normalized, "[" .. vowels .. "%d]%w*$") then
			return base_stem .. "i"
		end
	-- Special cases (the first three are handled in the final `else` branch,
	-- via normalize() and the optional vowel exclusion; only the last one is
	-- tested by the `elseif` directly below):
	-- Final "quy" ("soliloquy" → "soliloquies").
	-- Final "guy" iff `not_gu` is false ("roguy" → "roguiest").
	-- Final "y" after a vowel iff `final_y_is_i_after_vowel` is true ("slay" →
	-- "slain").
	-- Final "-y" ("bro-y" → "bro-iest"), accounting for hyphen variation.
	elseif umatch(final2, "[" .. hyphens .. "]y") then
		-- Replace final "y" with "i".
		return sub(str, 1, -2) .. "i"
	-- Otherwise, final "y" becomes "i" iff it's not preceded by a vowel
	-- ("shy" → "shiest", "horsy" → "horsies", but "day" → "days", "coy" →
	-- "coyest").
	else
		-- Remove "y" to get the base stem.
		local base_stem = sub(str, 1, -2)
		-- The base stem must not end in whitespace or punctuation, nor (unless
		-- `final_y_is_i_after_vowel` is set) in a vowel.
		if umatch(normalize(base_stem, "y", not_gu), "[^%s%p" .. (final_y_is_i_after_vowel and "" or vowels) .. "]$") then
			return base_stem .. "i"
		end
	end
	-- No rule applied: keep the final "y".
	return str
end

-- Returns `str` with its final consonant doubled (`str .. final`) if the test
-- below passes, and `str` unchanged otherwise. `final` is the last character
-- of `str` (add_suffix() passes sub(term, -1)).
local function double_final_consonant(str, final)
	-- Normalize everything before the final consonant, then look at the last
	-- "word" of the result, i.e. (roughly) the part after the start of the
	-- string, whitespace, a hyphen or an ellipsis. It must end in a single
	-- vowel; `initial` captures the lowercase letters and punctuation between
	-- the start of that word and the vowel. If the pattern does not match,
	-- `initial` is nil and nothing is doubled.
	local initial = umatch(normalize(sub(str, 1, -2), final), "^.*%f[^%z%s" .. hyphens .. "…]([%l%p]*)[" .. vowels .. "]$")
	-- The consonant is doubled if `initial` is:
	-- * empty (the word is just vowel + consonant),
	-- * exactly "y",
	-- * a single (possibly multibyte) character that is not a vowel, or
	-- * made up entirely of non-vowels and ending in a lowercase letter.
	-- NOTE(review): this looks like a spelling-based check that the final
	-- syllable has a single vowel letter after a consonantal onset — confirm
	-- the intent before changing any of the conditions.
	return initial and (
		initial == "" or
		initial == "y" or
		match(initial, "^.[\128-\191]*$") and umatch(initial, "[^" .. vowels .. "]") or
		umatch(initial, "^[^" .. vowels .. "]*%f[^%l]$")
	) and (str .. final) or str
end

-- Returns the stem of `str` to use before a suffix that drops a silent "e"
-- (such as "-ing"). `str` is expected to end in "e"; add_suffix() only calls
-- this for such terms. Final "ie" becomes "y" ("die" → "dy", for "dying"),
-- and a final "e" is removed if it follows "u" or a vowel + consonant
-- (cluster); otherwise `str` is returned unchanged. Always returns exactly one
-- value.
local function remove_silent_e(str)
	local final2 = sub(str, -2)
	if final2 == "ie" then
		-- Replace "ie" with "y", unless it follows another "y" (e.g.
		-- "spulyie" → "spulyieing"). The outer parentheses discard the
		-- replacement count that gsub returns as a second value, so that it
		-- doesn't leak out as an extra return value.
		return (ugsub(str, "([^yY%s%p])ie$", "%1y"))
	end
	local base_stem = sub(str, 1, -2)
	-- Silent "e" occurs after "u" or a consonant (cluster) preceded by a vowel.
	return (
		final2 == "ue" or
		umatch(normalize(base_stem, "e"), "[" .. vowels .. "][^" .. vowels .. "]+$")
	) and base_stem or str
end

--[==[
Adds a suffix to `term`, applying the spelling changes registered for that suffix: final "y" → "i", removal of a
silent "e", insertion of an epenthetic "e" and doubling of the final consonant.

`suffix` must be one of the keys of the internal suffix table: {"'s"}, {"s.plural"}, {"s.verb"}, {"ing"}, {"d"},
{"dst"}, {"st.verb"}, {"th"}, {"n"}, {"r"} or {"st.superlative"}. Anything from the first "." onwards only selects the
rule set, and is not appended to the term. An unrecognized `suffix` raises an error.

If `pos` is {"proper noun"}, a final "y" is never changed to "i". With {"s.plural"}, a possessive ending on `term`
is removed before the suffix is added, and re-added afterwards.
]==]
function export.add_suffix(term, suffix, pos)
	local data, possessive = suffixes[suffix]
	-- Fail with a meaningful message instead of an "attempt to index a nil
	-- value" error further down.
	if not data then
		error("Unrecognized suffix \"" .. tostring(suffix) .. "\" passed to add_suffix", 2)
	end
	-- If modifies_possessive is set, check for and remove any possessive
	-- suffix, which will be re-added again at the end.
	if data.modifies_possessive then
		local new = remove_possessive(term)
		if new ~= term then
			term, possessive = new, true
		end
	end
	-- Drop the "." qualifier, leaving only the text to append.
	suffix = match(suffix, "^([^.]*)")
	local final, stem = sub(term, -1)
	-- Proper nouns don't have a final "y" changed to "i" (e.g. "the Gettys",
	-- "the public Ivys").
	if data.final_y_is_i and final == "y" and pos ~= "proper noun" then
		stem = convert_final_y_to_i(term, not data.final_guy_is_gui, data.final_ey_is_i, data.final_y_is_i_after_vowel)
	elseif data.remove_silent_e and final == "e" then
		stem = remove_silent_e(term)
	else
		stem = term
	end
	local epenthetic_e = data.epenthetic_e
	if epenthetic_e and epenthetic_e(stem, term) then
		suffix = "e" .. suffix
	end
	-- Doubling is only considered once the suffix is known to begin with a
	-- vowel, which is why it comes after the epenthetic "e" check.
	if (
		data.final_consonant_is_doubled and
		match(final, "^[bcdfgjklmnpqrstvz]$") and -- Only double regular consonants.
		umatch(suffix, "^[" .. vowels .. "]")
	) then
		stem = double_final_consonant(term, final)
	end
	-- Some suffixes choose their final form from the stem (e.g. "'s" after a
	-- final "s").
	local truncated = data.truncated
	if truncated then
		suffix = truncated(stem)
	end
	local output = stem .. suffix
	-- Re-add the possessive suffix, if applicable.
	if possessive then
		output = add_suffix(output, "'s", pos)
	end
	return output
end
add_suffix = export.add_suffix

--[==[
Pluralize a word in a smart fashion, according to normal English rules.
# If the word ends in a consonant or "qu" + "-y", replace "-y" with "-ies".
# If the word ends in "s", "x", "z", "ch", "sh" or "zh", add "-es".
# Otherwise, add "-s".

Links at the end of the string are handled as follows:
# In a piped link, the display text (the second part) is pluralized.
# In a non-piped link where rule #1 applies, the link becomes a piped link whose second part is the plural.
# In a non-piped link where rule #2 or #3 applies, the plural ending is added after the link.
]==]
function export.pluralize(str)
	-- Only treat the string as ending in a link if it ends with "]]" and
	-- contains a "[[" somewhere.
	local ends_in_link = sub(str, -2) == "]]" and find(str, "[[", 1, true)
	if not ends_in_link then
		return add_suffix(str, "s.plural")
	end
	-- Search the reversed string, so that the first hits are the last "[["
	-- and the last "]]" before the final one (position 3 skips the final
	-- "]]" itself).
	local reversed = reverse(str)
	local last_open = find(reversed, "[[", 3, true)
	local stray_close = find(reversed, "]]", 3, true)
	-- A "]]" between the last "[[" and the end (i.e. with a lower index in
	-- the reversed string) means the final "]]" is just plaintext, as in
	-- "[[foo]]bar]]".
	if stray_close and stray_close < last_open then
		return add_suffix(str, "s.plural")
	end
	-- Position of the first character after the last "[[".
	local content_start = #str - last_open + 2
	local prefix = sub(str, 1, content_start - 1)
	local target, display = match(str, "([^|]*)|?(.*)%]%]$", content_start)
	-- A non-piped link displays its target.
	if display == "" then
		display = target
	end
	display = add_suffix(display, "s.plural")
	-- If the pluralized display text begins with the link target, the rest
	-- can go after the link as a trail (e.g. "[[foo]]" → "[[foo]]s").
	local first, last = find(display, target, 1, true)
	if first == 1 then
		return prefix .. target .. "]]" .. sub(display, last + 1)
	end
	-- Otherwise, a piped link is needed.
	return prefix .. target .. "|" .. display .. "]]"
end

--[==[
Returns true if `plural` is an expected, regular plural of `term`, and false otherwise.

The optional parameter `pos` specifies the part of speech, which matters because proper nouns do not change a
{"-y"} suffix to {"-ies"} (e.g. {"Abby"} → {"Abbys"}). If omitted, the term is treated as a {"noun"}. Besides
{"proper noun"}, the special value {"noun+"} is accepted: the check is first made with the {"noun"} setting, and then
repeated with the {"proper noun"} setting iff the term begins with a capital letter.
]==]
function export.is_regular_plural(plural, term, pos)
	local orig_plural, orig_term = plural, term
	local retry_as_proper_noun = false
	if pos == "noun+" then
		pos = "noun"
		retry_as_proper_noun = true
	end
	-- Final punctuation that occurs in both forms is ignored, since it is
	-- common in abbreviations (e.g. "abbr." → "abbrs.").
	local trailing = umatch(term, "%p*$")
	local trailing_len = #trailing
	if sub(plural, -trailing_len) == trailing then
		term = sub(term, 1, -trailing_len - 1)
		plural = sub(plural, 1, -trailing_len - 1)
	end
	if add_suffix(term, "s.plural", pos) == plural then
		return true
	end
	-- Alternative forms that are also accepted, depending on the final letter.
	local final = sub(term, -1)
	local accepted = false
	if final == "s" then
		-- Doubled final "s", e.g. "busses".
		accepted = plural == term .. "ses"
	elseif final == "z" then
		-- Doubled final "z", e.g. "quizzes".
		accepted = plural == term .. "zes"
	elseif final == "y" then
		-- convert_final_y_to_i() without the `not_gu` flag set, to catch
		-- "-guy" → "-guies", but not "day" → "daies".
		accepted = plural == convert_final_y_to_i(term) .. "es"
	elseif final == "Y" then
		-- Capitalized terms like "$DEITY" → "$DEITIES" (should we treat this
		-- as regular?)
		accepted = ulower(plural) == convert_final_y_to_i(ulower(term)) .. "es"
	end
	if accepted then
		return true
	end
	if not retry_as_proper_noun then
		return false
	end
	-- Retry as a proper noun only if the first alphanumeric character is a
	-- cased, uppercase letter.
	local init = umatch(orig_term, "^[^%w%s]*(%w)")
	if init and uupper(init) == init and ulower(init) ~= init then
		return is_regular_plural(orig_plural, orig_term, "proper noun") or false
	end
	return false
end
is_regular_plural = export.is_regular_plural

do
	-- Endings that are simply removed, tried in order; each pattern captures
	-- what is kept. The "%]*" allows for cases like "[[parish]]es" and
	-- "[[box]]es". Note that "-zhes", "-ses" and "-zes" only lose their "s".
	local plural_patterns = {
		"^(.-[cs]h%]*)es$",
		"^(.-x%]*)es$",
		"^(.-)s$",
	}

	-- Singularizes plain text (or the text of a link); returns the input
	-- unchanged if no plural ending is recognized.
	local function do_singularize(str)
		local stem = match(str, "^(.-)ies$")
		if stem then
			return stem .. "y"
		end
		for i = 1, #plural_patterns do
			stem = match(str, plural_patterns[i])
			if stem then
				return stem
			end
		end
		return str
	end

	-- Builds a link, using the piped form only if the display text differs
	-- from the target.
	local function collapse_link(link, linktext)
		local inner = link
		if link ~= linktext then
			inner = link .. "|" .. linktext
		end
		return "[[" .. inner .. "]]"
	end

	--[==[
	Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.

	'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
	# If the word ends in -ies, replace -ies with -y.
	# If the word ends in -xes, -shes or -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
	# Otherwise, remove -s.

	Links at the end of the string are handled as follows:
	# In a piped link, the second part is singularized, and the link is collapsed to a simple link if both parts end up the same.
	# In a non-piped link, the link itself is singularized.
	# Input like "[[parish]]es" is handled correctly, because the code that checks for -shes etc. allows ] characters between the
	  'sh' etc. and the final -es.
	]==]
	function export.singularize(str)
		-- Allow calling from a template, in which case the first argument
		-- holds the text.
		if type(str) == "table" then
			str = str.args[1]
		end
		-- Look for a link at the end. The pattern matches both piped and
		-- unpiped links; `linktext` is empty if the link is unpiped.
		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not link then
			return do_singularize(str)
		end
		if linktext == "" then
			return beginning .. "[[" .. do_singularize(link) .. "]]"
		end
		return beginning .. collapse_link(link, do_singularize(linktext))
	end
end

--[==[
Return the appropriate indefinite article to prefix to `str`: "an" if the text begins with one of the letters
a, e, i, o, u (in either case), otherwise "a". If `str` begins with a link, the first letter of the link's display
text is examined; capitalized text is handled too. A missing `str` is treated as the empty string.

Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
	str = str or ""
	-- If there's a link at the beginning, examine the first letter of the
	-- link text. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
	local text = str
	if link then
		text = linktext ~= "" and linktext or link
	end
	if match(text, "^[AEIOUaeiou]") then
		return ucfirst and "An" or "an"
	end
	return ucfirst and "A" or "a"
end
-- Local alias for use by the functions below. It has to be declared `local`
-- here: the name has no forward declaration at the top of the module, so a
-- bare assignment would create a global variable.
local get_indefinite_article = export.get_indefinite_article

--[==[
Prefix `text` with the appropriate indefinite article, separated from it by a space. Correctly handles links and
capitalized text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite
beginning with a 'u'. The article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.add_indefinite_article(text, ucfirst)
	local article = get_indefinite_article(text, ucfirst)
	return article .. " " .. text
end

-- Expose the vowel list to other modules, both as the raw list of characters
-- and as a ready-made character class that matches a single vowel.
export.vowels = vowels
export.vowel = "[" .. vowels .. "]"

return export