Module:ar-nominals
Jump to navigation
Jump to search
- The following documentation is located at Module:ar-nominals/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module implements the templates {{ar-decl-noun}}, {{ar-decl-adj}}, {{ar-decl-coll-noun}}, {{ar-decl-sing-noun}}, {{ar-decl-gendered-noun}}, and {{ar-decl-numeral}}.
For testcases, see Module:ar-nominals/testcases.
Functions
show_noun(frame)
- Main entry point for implementing {{ar-decl-noun}}.
show_adj(frame)
- Main entry point for implementing {{ar-decl-adj}}.
show_coll_noun(frame)
- Main entry point for implementing {{ar-decl-coll-noun}}.
show_sing_noun(frame)
- Main entry point for implementing {{ar-decl-sing-noun}}.
show_gendered_noun(frame)
- Main entry point for implementing {{ar-decl-gendered-noun}}.
show_numeral(frame)
- Main entry point for implementing {{ar-decl-numeral}}.
detect_type(stem, isfem, num)
- Detect declension of noun or adjective stem or lemma. We allow triptotes, diptotes and sound plurals to either come with ʾiʿrāb or not. We detect some cases where vowels are missing, when it seems fairly unambiguous to do so.
isfem is true if we are dealing with a feminine form (only for adjectives). num is sg, du or pl depending on the intended number of the noun or adjective.
stem_and_type(word, sg, sgtype, isfem, num)
- Return stem and declension of an argument given the singular form and declension type, whether this is a feminine form (only for adjectives), and the intended number of the noun or adjective (sg, du or pl). Singular form and type only used when inferring plural based on singular (e.g. when argument is a bare plural declension type such as 'sfp' for the sound feminine plural).
-- Author: Benwing, based on early version by CodeCat.
--[[
FIXME: Nouns/adjectives to create to exemplify complex declensions:
-- riḍan (رِضًا or رِضًى)
--]]
local m_utilities = require("Module:utilities")
local m_links = require("Module:links")
local ar_utilities = require("Module:ar-utilities")
local lang = require("Module:languages").getByCode("ar")
local u = require("Module:string/char")
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
-- This is used in place of a transliteration when no manual
-- translit is specified and we're unable to automatically generate
-- one (typically because some vowel diacritics are missing).
local BOGUS_CHAR = u(0xFFFD) -- U+FFFD REPLACEMENT CHARACTER
-- hamza variants
local HAMZA = u(0x0621) -- hamza on the line (stand-alone hamza) = ء
local HAMZA_ON_ALIF = u(0x0623) -- hamza over ʾalif = أ
local HAMZA_ON_W = u(0x0624) -- hamza over wāw = ؤ
local HAMZA_UNDER_ALIF = u(0x0625) -- hamza under ʾalif = إ
local HAMZA_ON_Y = u(0x0626) -- hamza over yāʾ = ئ
-- Pattern character class matching any one of the five hamza variants above.
local HAMZA_ANY = "[" .. HAMZA .. HAMZA_ON_ALIF .. HAMZA_UNDER_ALIF .. HAMZA_ON_W .. HAMZA_ON_Y .. "]"
local HAMZA_PH = u(0xFFF0) -- hamza placeholder (U+FFF0, an unassigned code point)
-- various letters
local ALIF = u(0x0627) -- ʾalif = ا
local AMAQ = u(0x0649) -- ʾalif maqṣūra = ى
local AMAD = u(0x0622) -- ʾalif madda = آ
local TAM = u(0x0629) -- tāʾ marbūṭa = ة
local T = u(0x062A) -- tāʾ = ت
-- NOTE(review): despite the name, U+0640 is ARABIC TATWEEL (kashida), not a
-- hyphen code point.
local HYPHEN = u(0x0640)
local N = u(0x0646) -- nūn = ن
local W = u(0x0648) -- wāw = و
local Y = u(0x064A) -- yā = ي
-- diacritics
local A = u(0x064E) -- fatḥa
local AN = u(0x064B) -- fatḥatān (fatḥa tanwīn)
local U = u(0x064F) -- ḍamma
local UN = u(0x064C) -- ḍammatān (ḍamma tanwīn)
local I = u(0x0650) -- kasra
local IN = u(0x064D) -- kasratān (kasra tanwīn)
local SK = u(0x0652) -- sukūn = no vowel
local SH = u(0x0651) -- šadda = gemination of consonants
local DAGGER_ALIF = u(0x0670) -- superscript ("dagger") ʾalif
-- Pattern character class matching any single diacritic listed above except
-- šadda (SH is deliberately left out of the class).
local DIACRITIC_ANY_BUT_SH = "[" .. A .. I .. U .. AN .. IN .. UN .. SK .. DAGGER_ALIF .. "]"
-- common combinations
-- Each name spells out, left to right, the letters/diacritics that are
-- concatenated (e.g. AAT = fatḥa + ʾalif + tāʾ).
local NA = N .. A -- nūn + fatḥa
local NI = N .. I -- nūn + kasra
local AH = A .. TAM -- fatḥa + tāʾ marbūṭa
local AT = A .. T -- fatḥa + tāʾ
local AA = A .. ALIF -- fatḥa + ʾalif
local AAMAQ = A .. AMAQ -- fatḥa + ʾalif maqṣūra
local AAH = AA .. TAM -- fatḥa + ʾalif + tāʾ marbūṭa
local AAT = AA .. T -- fatḥa + ʾalif + tāʾ
local II = I .. Y -- kasra + yāʾ
local IIN = II .. N
local IINA = II .. NA
-- Same string as II (kasra + yāʾ); presumably a separate name for the
-- reading "iy" rather than "ī" -- TODO confirm against the callers.
local IY = II
local UU = U .. W -- ḍamma + wāw
local UUN = UU .. N
local UUNA = UU .. NA
local AY = A .. Y -- fatḥa + yāʾ
local AW = A .. W -- fatḥa + wāw
local AYSK = AY .. SK -- fatḥa + yāʾ + sukūn
local AWSK = AW .. SK -- fatḥa + wāw + sukūn
local AAN = AA .. N
local AANI = AA .. NI
local AYN = AYSK .. N -- note: built on AYSK, so the sukūn is included
local AYNI = AYSK .. NI
local AWN = AWSK .. N -- note: built on AWSK, so the sukūn is included
local AWNA = AWSK .. NA
local AYNA = AYSK .. NA
local AYAAT = AY .. AAT
-- Pattern character class matching either ḍammatān or ḍamma.
local UNU = "[" .. UN .. U .. "]"
-- optional diacritics/letters
-- These are pattern fragments: the trailing "?" makes the preceding
-- diacritic (or character class) optional.
local AOPT = A .. "?"
local AOPTA = A .. "?" .. ALIF -- optional fatḥa followed by a mandatory ʾalif
local IOPT = I .. "?"
local UOPT = U .. "?"
local UNOPT = UN .. "?"
local UNUOPT = UNU .. "?"
local SKOPT = SK .. "?"
-- lists of consonants
-- exclude tāʾ marbūṭa because we don't want it treated as a consonant
-- in patterns like أَفْعَل
local consonants_needing_vowels_no_tam = "بتثجحخدذرزسشصضطظعغفقكلمنهپچڤگڨڧأإؤئء"
-- consonants on the right side; includes alif madda
local rconsonants_no_tam = consonants_needing_vowels_no_tam .. "ويآ"
-- consonants on the left side; does not include alif madda
local lconsonants_no_tam = consonants_needing_vowels_no_tam .. "وي"
-- Pattern character class matching one left-side consonant.
local CONS = "[" .. lconsonants_no_tam .. "]"
-- Same class as CONS, wrapped in parentheses so the consonant is captured.
local CONSPAR = "([" .. lconsonants_no_tam .. "])"
local LRM = u(0x200E) --left-to-right mark
-- First syllable or so of elative/color-defect adjective
-- Anchored pattern: hamza-on-ʾalif, an optional fatḥa, then one captured
-- consonant.
local ELCD_START = "^" .. HAMZA_ON_ALIF .. AOPT .. CONSPAR
-- Table that collects this module's exported functions.
local export = {}
--------------------
-- Utility functions
--------------------
-- "If Not Empty": normalize a raw template argument.
--   * nil or the empty string     -> nil
--   * "..." or '...' (quoted)     -> the text between the quotes, which may
--                                    itself be the empty string
--   * anything else               -> returned unchanged
-- Double quotes are tried before single quotes.
function ine(x) -- If Not Empty
	if x == nil or x == "" then
		return nil
	end
	local inner = rmatch(x, '^"(.*)"$')
	if inner == nil then
		inner = rmatch(x, "^'(.*)'$")
	end
	if inner ~= nil then
		return inner
	end
	return x
end
-- Structural equality test. Two tables are compared element by element over
-- their array parts (recursively); any other pair of values is compared
-- with ==.
-- FIXME, doesn't work for tables that aren't arrays.
function equals(x, y)
	if type(x) ~= "table" or type(y) ~= "table" then
		return x == y
	end
	if #x ~= #y then
		return false
	end
	for idx, elem in ipairs(x) do
		if not equals(elem, y[idx]) then
			return false
		end
	end
	return true
end
-- Return true if some value stored in TAB is equal to ITEM, as judged by
-- equals() (so array-valued items are compared structurally); false
-- otherwise.
function contains(tab, item)
	for _, candidate in pairs(tab) do
		if equals(candidate, item) then
			return true
		end
	end
	return false
end
-- Append ITEM to the array TAB unless contains() reports that an equal
-- element is already there.
function insert_if_not(tab, item)
	if contains(tab, item) then
		return
	end
	tab[#tab + 1] = item
end
-- Like rsubn(), but return only the substituted string; the substitution
-- count is dropped (the extra parentheses truncate to one value).
function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end
-- Like rsub(), but raise an assertion failure if the pattern did not match
-- at least once.
function assert_rsub(term, foo, bar)
	local result, count = rsubn(term, foo, bar)
	assert(count > 0)
	return result
end
-- Format ARABIC through Module:links' full_link(), passing it as the "alt"
-- text of an Arabic-language link with the "term" face.
function make_link(arabic)
	-- Older calling convention, kept for reference:
	--return m_links.full_link(nil, arabic, lang, nil, "term", nil, {tr = "-"}, false)
	local linkdata = {lang = lang, alt = arabic}
	return m_links.full_link(linkdata, "term")
end
-- Record a tracking entry named "ar-nominals/PAGE" via Module:debug.
-- Always returns true.
function track(page)
	local debug_module = require("Module:debug")
	debug_module.track("ar-nominals/" .. page)
	return true
end
-------------------------------------
-- Functions for building inflections
-------------------------------------
-- Table of inflection functions, keyed by declension type; each one does the
-- actual inflecting by creating the forms of a basic term.
local inflections = {}
-- Maximum number of modifiers supported.
local max_mods = 9
-- Names of the modifier arguments in order: "mod", "mod2", "mod3", ...
local mod_list = {"mod"}
for n = 2, max_mods do
	mod_list[#mod_list + 1] = "mod" .. n
end
-- Create and return the 'data' structure that will hold all of the
-- generated declensional forms, as well as other ancillary information
-- such as the possible numbers, genders and cases, and the actual numbers
-- and states to store (in 'data.numbers' and 'data.states' respectively).
-- A fresh, independent table is built on every call.
function init_data()
	-- Index into an endings array giving the correct ending for each
	-- combination of state and case.
	local statecases = {
		ind = {nom = 1, acc = 2, gen = 3, inf = 10, lemma = 13},
		def = {nom = 4, acc = 5, gen = 6, inf = 11, lemma = 14},
		-- used for a definite adjective modifying a construct-state noun
		defcon = {nom = 4, acc = 5, gen = 6, inf = 11, lemma = 14},
		con = {nom = 7, acc = 8, gen = 9, inf = 12, lemma = 15},
	}

	local data = {}
	-- FORMS contains a table of forms for each inflectional category,
	-- e.g. "nom_sg_ind" for nouns or "nom_m_sg_ind" for adjectives. The value
	-- of an entry is an array of alternatives (e.g. different plurals), where
	-- each alternative is either a string of the form "ARABIC" or
	-- "ARABIC/TRANSLIT", or an array of such strings (this is used for
	-- alternative spellings involving different hamza seats,
	-- e.g. مُبْتَدَؤُون or مُبْتَدَأُون). Alternative hamza spellings are separated
	-- in display by an "inner separator" (/), while alternatives on
	-- the level of different plurals are separated by an "outer separator" (;).
	data.forms = {}
	data.title = nil
	data.categories = {}
	data.allgenders = {"m", "f"}
	data.allstates = {"ind", "def", "con"}
	data.allnumbers = {"sg", "du", "pl"}
	data.states = {} -- initialized later
	data.numbers = {} -- initialized later
	data.engnumbers = {
		sg = "singular", du = "dual", pl = "plural",
		coll = "collective", sing = "singulative", pauc = "paucal",
	}
	data.engnumberscap = {
		sg = "Singular", du = "Dual", pl = "Plural",
		coll = "Collective", sing = "Singulative", pauc = "Paucal (3-10)",
	}
	data.allcases = {"nom", "acc", "gen", "inf"}
	data.allcases_with_lemma = {"nom", "acc", "gen", "inf", "lemma"}
	data.statecases = statecases
	return data
end
-- Build and return three values: ARGS, ORIGARGS and DATA.
-- ARGS is a copy of the user-supplied arguments in which every value has
-- been passed through ine() (empty string -> nil, quoted -> unquoted) and
-- every translit argument has been appended to its base argument after a
-- slash. ORIGARGS is the table that was passed in. DATA is a fresh
-- structure from init_data().
function init(origargs)
	local args = {}
	for name, raw in pairs(origargs) do
		args[name] = ine(raw)
	end

	-- Append the translit in ARGTR to the base argument ARG as
	-- "BASE/TRANSLIT", which is the form the rest of the code expects.
	-- It is an error for the translit to be present without its base.
	--
	-- FIXME: We should consider separating translit and base arguments by the
	-- separators ; , | (used in overrides; see handle_lemma_and_overrides())
	-- and matching up individual parts, to allow separate translit arguments
	-- to be specified for overrides. But maybe not; the point of allowing
	-- separate translit arguments is for compatibility with headword
	-- templates such as "ar-noun" and "ar-adj", and those templates don't
	-- handle override arguments.
	local function dotr(arg, argtr)
		local base = args[arg]
		if not base then
			error("Argument '" .. argtr .."' specified but not corresponding base argument '" .. arg .. "'")
		end
		args[arg] = base .. "/" .. args[argtr]
	end

	-- Naming convention for translit arguments:
	--   tr                      goes with positional argument 1
	--   tr2, tr3, ...           go with head2, head3, ...
	--   modtr2, modtr3, ...     go with modhead2, modhead3, ...
	--   modNtr2, modNtr3, ...   go with modNhead2, modNhead3, ...
	--   FOOtr, FOO2tr, ...      go with any other argument FOO, FOO2, ...
	-- Only values of keys that already exist are reassigned here, which is
	-- permitted while traversing with pairs().
	for key in pairs(args) do
		local base_key
		if key == "tr" then
			base_key = 1
		elseif rfind(key, "tr[0-9]+$") then
			base_key = assert_rsub(key, "tr([0-9]+)$", "head%1")
		elseif rfind(key, "tr$") then
			base_key = assert_rsub(key, "tr$", "")
		end
		if base_key then
			dotr(base_key, key)
		end
	end

	return args, origargs, init_data()
end
-- Parse the user-specified state spec and other related arguments. The
-- user can specify, using idafaN=, how modifiers are related to previous
-- words. The user can also manually specify which states are to appear;
-- whether to omit the definite article in the definite state; and
-- how/whether to restrict modifiers to a particular state, case or number.
-- Normally the modN_* parameters and basestate= do not need to be set
-- directly; instead, use idafaN=. It may be necessary to explicitly
-- specify state= in the presence of proper nouns or definite-only
-- adjectival expressions. NOTE: At the time this function is called,
-- data.numbers has not yet been initialized.
--
-- Results are written into DATA (data.states, data.basestate, data.noirreg,
-- data.omitarticle, data.prefix and the per-modifier data.modN_state,
-- data.modN_case, data.modN_number, data.modN_numgen, data.modN_idafa,
-- data.modN_omitarticle, data.modN_prefix). Nothing is returned. Invalid
-- argument values raise an error.
function parse_state_etc_spec(data, args)
	-- If argument ARG was given, make sure its value is one of ALLVALUES
	-- (error otherwise) and store it in data[DATAVAL].
	local function check(arg, dataval, allvalues)
		if args[arg] then
			if not contains(allvalues, args[arg]) then
				error("For " .. arg .. "=, value '" .. args[arg] .. "' should be one of " ..
					table.concat(allvalues, ", "))
			end
			data[dataval] = args[arg]
		end
	end

	-- Like check(), but for yes/no arguments; the stored value is converted
	-- to a real boolean.
	local function check_boolean(arg, dataval)
		check(arg, dataval, {"yes", "no"})
		if data[dataval] == "yes" then
			data[dataval] = true
		elseif data[dataval] == "no" then
			data[dataval] = false
		end
	end

	-- Make sure no holes in mod values
	for i=1,(#mod_list)-1 do
		if args[mod_list[i+1]] and not args[mod_list[i]] then
			error("Hole in modifier arguments -- " .. mod_list[i+1] ..
				" present but not " .. mod_list[i])
		end
	end

	-- FIXME! Remove this once we're sure there are no instances of mod2
	-- that haven't been converted to modhead2.
	if args["mod2"] then
		track("mod2")
	end

	-- Set default value; may be overridden e.g. by arg["state"] or
	-- by idafaN=.
	data.states = data.allstates

	-- List of pairs of idafaN/modN parameters
	local idafa_mod_list = {{"idafa", "mod"}}
	for i=2,max_mods do
		table.insert(idafa_mod_list, {"idafa" .. i, "mod" .. i})
	end

	-- True if the value of an |idafa= param is a valid adjectival modifier
	-- value.
	local function valid_adjectival_idafaval(idafaval)
		return idafaval == "adj" or idafaval == "adj-base" or
			idafaval == "adj-mod" or rfind(idafaval, "^adj%-mod[0-9]+$")
	end

	-- Extract the referent (base or modifier) of an adjectival |idafa= param.
	-- Assumes the value is valid.
	local function adjectival_idafaval_referent(idafaval)
		if idafaval == "adj" then
			return "base"
		end
		return assert_rsub(idafaval, "^adj%-", "")
	end

	-- Convert a base/mod spec to an index: 0=base, 1=mod, 2=mod2, etc.
	local function basemod_to_index(basemod)
		if basemod == "base" then return 0 end
		if basemod == "mod" then return 1 end
		return tonumber(assert_rsub(basemod, "^mod", ""))
	end

	-- Recognize idafa spec and handle it.
	-- We do the following:
	-- (1) Check that if idafaN= is given, then modN= is also given.
	-- (2) Check that adjectival modifiers aren't followed by idafa modifiers.
	-- (3) Check that adjectival modifiers are modifying the base or an
	--     ʾidāfa modifier, not another adjectival modifier.
	-- (4) Support idafa values "adj-base", "adj-mod", "adj-mod2", "adj"
	--     (="adj-base") etc. and check that we're referring to an earlier
	--     word.
	-- (5) For ʾidāfa modifiers, set basestate=con, set modN_case=gen,
	--     set modN_idafa=true, and set modN_number to the number specified
	--     in the parameter value (e.g. 'sg' or 'def-pl'); and if the
	--     parameter value specifies a state (e.g. 'def' or 'ind-du'),
	--     set modN_state= to this value, and if this is the last ʾidāfa
	--     modifier, also set state= to this value; if this is not the last
	--     ʾidāfa modifier, set modN_state=con and disallow a state to be
	--     specified in the parameter value.
	-- (6) For adjectival modifiers of the base, do nothing.
	-- (7) For adjectival modifiers of ʾidāfa modifiers, set modN_case=gen;
	--     set modN_idafa=false; and set modN_number=, modN_numgen= and
	--     modN_state= to match the values of the idafa modifier.

	-- error checking and find last ʾidāfa modifier
	local last_is_idafa = true
	local last_idafa_mod = "base"
	for _, idafa_mod in ipairs(idafa_mod_list) do
		local idafaparam = idafa_mod[1]
		local mod = idafa_mod[2]
		local idafaval = args[idafaparam]
		if idafaval then
			local paramval = idafaparam .. "=" .. idafaval
			if not args[mod] then
				error("'" .. idafaparam .. "' parameter without corresponding '"
					.. mod .. "' parameter")
			end
			if not valid_adjectival_idafaval(idafaval) then
				-- We're a construct (ʾidāfa) modifier
				if not last_is_idafa then
					error("ʾidāfa modifier " .. paramval .. " follows adjectival modifier")
				end
				last_idafa_mod = mod
			else
				last_is_idafa = false
				local adjref = adjectival_idafaval_referent(idafaval)
				if adjref ~= "base" then
					if basemod_to_index(adjref) >= basemod_to_index(mod) then
						error(paramval .. " can only refer to an earlier element")
					end
					local idafaref = assert_rsub(adjref, "^mod", "idafa")
					if not args[idafaref] then
						error(paramval .. " cannot refer to a missing modifier")
					elseif valid_adjectival_idafaval(args[idafaref]) then
						error(paramval .. " cannot refer to an adjectival modifier")
					end
				end
			end
		end
	end

	-- Now go through and set all the modN_ data values appropriately.
	for _, idafa_mod in ipairs(idafa_mod_list) do
		local idafaparam = idafa_mod[1]
		local mod = idafa_mod[2]
		local idafaval = args[idafaparam]
		if idafaval then
			local paramval = idafaparam .. "=" .. idafaval
			local bad_idafa = true
			-- Normalize shorthand values: "yes" means "sg", and a bare state
			-- (or "ind-def") means that state in the singular.
			if idafaval == "yes" then
				idafaval = "sg"
			end
			if idafaval == "ind-def" or contains(data.allstates, idafaval) then
				idafaval = idafaval .. "-sg"
			end
			if valid_adjectival_idafaval(idafaval) then
				local adjref = adjectival_idafaval_referent(idafaval)
				if adjref ~= "base" then
					data[mod .. "_case"] = "gen"
					data[mod .. "_state"] = data[adjref .. "_state"]
					-- if agreement is with ind-def, make it def
					if data[mod .. "_state"] == "ind-def" then
						data[mod .. "_state"] = "def"
					end
					data[mod .. "_number"] = data[adjref .. "_number"]
					data[mod .. "_numgen"] = data[adjref .. "_numgen"]
					data[mod .. "_idafa"] = false
				end
				bad_idafa = false
			elseif contains(data.allnumbers, idafaval) then
				data.basestate = "con"
				data[mod .. "_case"] = "gen"
				data[mod .. "_number"] = idafaval
				data[mod .. "_idafa"] = true
				if mod ~= last_idafa_mod then
					data[mod .. "_state"] = "con"
				end
				bad_idafa = false
			elseif rfind(idafaval, "%-") then
				local state_num = rsplit(idafaval, "%-")
				-- Support ind-def as a possible value. We set modstate to
				-- ind-def, which will signal definite agreement with adjectival
				-- modifiers; then later on we change the value to ind.
				if #state_num == 3 and state_num[1] == "ind" and state_num[2] == "def" then
					state_num[1] = "ind-def"
					state_num[2] = state_num[3]
					table.remove(state_num)
				end
				if #state_num == 2 then
					local state = state_num[1]
					local num = state_num[2]
					if (state == "ind-def" or contains(data.allstates, state))
						and contains(data.allnumbers, num) then
						if mod == last_idafa_mod then
							if state == "ind-def" then
								data.states = {"def"}
							else
								data.states = {state}
							end
						else
							error(paramval .. " cannot specify a state because it is not the last ʾidāfa modifier")
						end
						data.basestate = "con"
						data[mod .. "_case"] = "gen"
						data[mod .. "_state"] = state
						data[mod .. "_number"] = num
						data[mod .. "_idafa"] = true
						bad_idafa = false
					end
				end
			end
			if bad_idafa then
				error(paramval .. " should be one of yes, def, sg, def-sg, adj, adj-base, adj-mod, adj-mod2 or similar")
			end
		end
	end

	if args["state"] == "ind-def" then
		data.states = {"def"}
		data.basestate = "ind"
	elseif args["state"] then
		data.states = rsplit(args["state"], ",")
		for _, state in ipairs(data.states) do
			if not contains(data.allstates, state) then
				error("For state=, value '" .. state .. "' should be one of " ..
					table.concat(data.allstates, ", "))
			end
		end
	end

	-- Now process explicit settings, so that they can override the
	-- settings based on idafaN=.
	check("basestate", "basestate", data.allstates)
	check_boolean("noirreg", "noirreg")
	check_boolean("omitarticle", "omitarticle")
	data.prefix = args.prefix
	for _, mod in ipairs(mod_list) do
		check(mod .. "state", mod .. "_state", data.allstates)
		check(mod .. "case", mod .. "_case", data.allcases)
		-- modNnumber= is a plain number (sg/du/pl), exactly like the numbers
		-- accepted by idafaN= above, so validate it against allnumbers and
		-- not against the number/gender combinations in allnumgens.
		check(mod .. "number", mod .. "_number", data.allnumbers)
		check(mod .. "numgen", mod .. "_numgen", data.allnumgens)
		check_boolean(mod .. "idafa", mod .. "_idafa")
		check_boolean(mod .. "omitarticle", mod .. "_omitarticle")
		data[mod .. "_prefix"] = args[mod .. "prefix"]
	end

	-- Make sure modN_numgen is initialized, to modN_number if necessary.
	-- This simplifies logic in certain places, e.g. call_inflections().
	-- Also convert ind-def to ind.
	for _, mod in ipairs(mod_list) do
		data[mod .. "_numgen"] = data[mod .. "_numgen"] or data[mod .. "_number"]
		if data[mod .. "_state"] == "ind-def" then
			data[mod.. "_state"] = "ind"
		end
	end
end
-- Parse the user-specified number spec (|number=), a comma-separated list
-- of the numbers that are to appear. When it is given, store the list in
-- data.numbers (raising an error for any value not in data.allnumbers) and
-- return true. When it is absent, set data.numbers to data.allnumbers and
-- return false.
function parse_number_spec(data, args)
	local spec = args["number"]
	if not spec then
		data.numbers = data.allnumbers
		return false
	end
	data.numbers = rsplit(spec, ",")
	for _, num in ipairs(data.numbers) do
		if not contains(data.allnumbers, num) then
			error("For number=, value '" .. num .. "' should be one of " ..
				table.concat(data.allnumbers, ", "))
		end
	end
	return true
end
-- Determine which numbers will appear using the logic for nouns, storing
-- the result in data.numbers.
--
-- An explicit |number= wins and exactly those numbers appear. Otherwise:
-- if any plurals are given, duals and plurals appear; else only the
-- singular appears (on the assumption that the word is a proper noun or
-- abstract noun that exists only in the singular). The singular is left
-- out when "-" is given for it, and similarly for the dual.
function determine_noun_numbers(data, args, pls)
	if parse_number_spec(data, args) then
		return
	end

	local numbers = {}
	data.numbers = numbers

	local has_sg = args[1] ~= "-"
	local du = args["d"]
	local explicit_du = du and du ~= "-"

	if has_sg then
		table.insert(numbers, "sg")
	end

	if #pls["base"] > 0 then
		-- Dual appears if either: explicit dual stem (not -) is given, or
		-- default dual is used and explicit singular stem (not -) is given.
		if explicit_du or (not du and has_sg) then
			table.insert(numbers, "du")
		end
		table.insert(numbers, "pl")
	elseif explicit_du then
		-- Explicit dual but no plural: still include it. Useful for
		-- dual tantum words.
		table.insert(numbers, "du")
	end
end
-- Convert STEM to stem-and-type format once per entry of SGS and add each
-- resulting {COMBINED_STEM, TYPE} pair to RESULTS unless an equal pair is
-- already present. A STEM of "-" adds nothing.
--
-- SGS is the list of singular items to base derived forms off of (masculine
-- or feminine as appropriate), an array of {COMBINED_STEM, TYPE} pairs as
-- returned by stem_and_type(). ISFEM is true for feminine gender; NUM is
-- "sg", "du" or "pl"; POS is the part of speech, generally "noun" or
-- "adjective".
function insert_stems(stem, results, sgs, isfem, num, pos)
	if stem == "-" then
		return
	end
	for _, sg_pair in ipairs(sgs) do
		local sg_stem, sg_type = sg_pair[1], sg_pair[2]
		local combined_stem, ty =
			export.stem_and_type(stem, sg_stem, sg_type, isfem, num, pos)
		insert_if_not(results, {combined_stem, ty})
	end
end
-- Apply manually specified overrides of individual forms, and work out the
-- lemma (which can itself be overridden) for the base and every modifier.
--
-- Override syntax: outer-level alternants are separated by ; or , or the
-- Arabic equivalents; inner-level alternants are separated by | (we can't
-- use / because it already separates Arabic from translit).
function handle_lemma_and_overrides(data, args)
	-- If argument ARG was given, parse it into an array of arrays (outer
	-- alternants, each an array of inner alternants) and store it in
	-- data.forms[ARG].
	local function handle_override(arg)
		local value = args[arg]
		if not value then
			return
		end
		local alternants = {}
		for _, outer in ipairs(rsplit(value, "[;,؛،]")) do
			alternants[#alternants + 1] = rsplit(outer, "|")
		end
		data.forms[arg] = alternants
	end

	-- Handle the overrides MODcase_numgen_state for every case, number/gender
	-- and state. Each override that is present also adds an "irregular"
	-- category unless data.noirreg is set.
	local function do_overrides(mod)
		for _, numgen in ipairs(data.allnumgens) do
			for _, state in ipairs(data.allstates) do
				for _, case in ipairs(data.allcases) do
					local arg = mod .. case .. "_" .. numgen .. "_" .. state
					if args[arg] then
						handle_override(arg)
						if not data.noirreg then
							insert_cat(data, mod, numgen,
								"Arabic NOUNs with irregular SINGULAR",
								"SINGULAR of irregular NOUN")
						end
					end
				end
			end
		end
	end

	-- Return the first non-empty MODlemma_numgen_state form, searching
	-- number/genders in the outer loop and states in the inner loop; nil if
	-- there is none.
	local function get_lemma(mod)
		for _, numgen in ipairs(data.numgens()) do
			for _, state in ipairs(data.states) do
				local forms = data.forms[mod .. "lemma_" .. numgen .. "_" .. state]
				if forms and #forms > 0 then
					return forms
				end
			end
		end
		return nil
	end

	do_overrides("")
	for _, mod in ipairs(mod_list) do
		do_overrides(mod .. "_")
	end

	-- Default lemma first, then let an explicit lemma= override replace it.
	data.forms["lemma"] = get_lemma("")
	handle_override("lemma")
	for _, mod in ipairs(mod_list) do
		local key = mod .. "_lemma"
		data.forms[key] = get_lemma(mod .. "_")
		handle_override(key)
	end
end
-- Return the part of speech for the base or for a modifier. MOD is "" for
-- the base or "mod_", "mod2_", etc. for a modifier (same as in
-- do_gender_number_1()). The base uses data.pos. A modifier ignores
-- data.pos: it is a "noun" when its modN_idafa flag is set (nominal, i.e.
-- ʾiḍāfa, modifier) and an "adjective" otherwise.
function get_pos(data, mod)
	if mod == "" then
		return data.pos
	end
	if data[mod .. "idafa"] then
		return "noun"
	end
	return "adjective"
end
-- Find the stems associated with a particular gender/number combination.
-- ARGS is the set of all arguments. ARGPREFS is an array of argument prefixes
-- (e.g. "f" for the actual arguments "f", "f2", ..., for the feminine
-- singular; we allow more than one to handle "cpl"). SGS is a
-- "stem-type list" (see do_gender_number()), and is the list of stems to
-- base derived forms off of (masculine or feminine as appropriate), an array
-- of length-two arrays of {COMBINED_STEM, TYPE} as returned by
-- stem_and_type(). DEFAULT, ISFEM and NUM are as in do_gender_number().
-- MOD is either "", "mod_", "mod2_", etc. depending if we're working on a
-- base or modifier argument (in the latter case, basically if the argument
-- begins with "mod").
--
-- Returns a stem-type list (possibly empty) for this gender/number.
function do_gender_number_1(data, args, argprefs, sgs, default, isfem, num, mod)
	local results = {}
	-- Note: this closure reads SGS at call time, so it sees the replacement
	-- value assigned further down.
	local function handle_stem(stem)
		insert_stems(stem, results, sgs, isfem, num, get_pos(data, mod))
	end

	-- If no arguments specified, use the default instead.
	-- (Kept local so the flag does not leak into the global environment.)
	local need_default = true
	for _, argpref in ipairs(argprefs) do
		if args[argpref] then
			need_default = false
			break
		end
	end
	if need_default then
		if default then
			handle_stem(default)
		end
		return results
	end

	-- For explicitly specified arguments, make sure there's at least one
	-- stem to generate off of; otherwise specifying e.g. 'sing=- pauc=فُلَان'
	-- won't override paucal.
	if #sgs == 0 then
		sgs = {{"", ""}}
	end
	for _, argpref in ipairs(argprefs) do
		if args[argpref] then
			handle_stem(args[argpref])
		end
		-- Numbered continuations ARGPREF2, ARGPREF3, ... stop at the first gap.
		local i = 2
		while args[argpref .. i] do
			handle_stem(args[argpref .. i])
			i = i + 1
		end
	end
	return results
end
-- For a given gender/number combination, parse and return the full set
-- of stems for both base and modifier. The return value is a
-- "stem specification", i.e. table with a "base" key for the base, a
-- "mod" key for the first modifier (see below), a "mod2" key for the
-- second modifier, etc. listing all stems for both the base and modifier(s).
-- The value of each key is a "stem-type list", i.e. an array of stem-type
-- pairs, where each element is a size-two array of {COMBINED_STEM, STEM_TYPE}.
-- COMBINED_STEM is a stem with attached transliteration in the form
-- STEM/TRANSLIT (where the transliteration is either manually specified in
-- the stem argument, e.g. 'pl=لُورْدَات/lordāt', or auto-transliterated from
-- the Arabic, with BOGUS_CHAR substituting for the transliteration if
-- auto-translit fails). STEM_TYPE is the declension of the stem, either
-- manually specified, e.g. 'بَبَّغَاء:di' for manually-specified diptote, or
-- auto-detected (see stem_and_type() and detect_type()).
--
-- DATA and ARGS are as in init(). ARGPREFS is an array of the prefixes for
-- the argument(s) specifying the stem (and optional translit and declension
-- type). For a given ARGPREF, we check ARGPREF, ARGPREF2, ARGPREF3, ... in
-- turn for the base, and modARGPREF, modARGPREF2, modARGPREF3, ... in turn
-- for the first modifier, and mod2ARGPREF, mod2ARGPREF2, mod2ARGPREF3, ...
-- for the second modifier, etc. SGS is a stem specification (see above),
-- giving the stems that are used to base derived forms off of (e.g. if a stem
-- type "smp" appears in place of a stem, the sound masculine plural of the
-- stems in SGS will be derived). DEFAULT is a single stem (i.e. a string) that
-- is used when no stems were explicitly given by the user (typically either
-- "f", "m", "d" or "p"), or nil for no default. ISFEM is true if we're
-- accumulating stems for a feminine number/gender category, and NUM is the
-- number (expected to be "sg", "du" or "pl") of the number/gender category
-- we're accumulating stems for.
--
-- About bases and modifiers: Note that e.g. in the noun phrase يَوْم الاِثْنَيْن
-- the head noun يَوْم is the base and the noun الاِثْنَيْن is the modifier.
-- In a noun phrase like البَحْر الأَبْيَض المُتَوَسِّط, there are two modifiers.
-- Note that modifiers come in two varieties, adjectival modifiers and
-- construct (ʾidāfa) modifiers. The first above noun phrase is an example
-- of a noun phrase with a construct modifier, where the base is fixed in
-- the construct state and the modifier is fixed in number and case
-- (which is always genitive) and possibly in state. The second above noun
-- phrase is an example of a noun phrase with two adjectival modifiers.
-- A construct modifier is generally a noun, whereas an adjectival modifier
-- is an adjective that usually agrees in state, number and case with the
-- base noun. (Note that in the case of multiple modifiers, it is possible
-- for e.g. the second modifier to be an adjectival modifier that agrees
-- with the first, construct, modifier, in which case its case will be fixed
-- to genitive, its number will be fixed to the same number as the first
-- modifier and its state will vary or not depending on whether the first
-- modifier's state varies. It is not possible in general to distinguish
-- adjectival and construct modifiers by looking at the values of
-- modN_state, modN_case or modN_number, since e.g. a third modifier could
-- have all of them specified and be either kind. Thus we have modN_idafa,
-- which is true for a construct modifier, false otherwise.)
function do_gender_number(data, args, argprefs, sgs, default, isfem, num)
	-- Stems for the base come from the unprefixed arguments.
	local results = do_gender_number_1(data, args, argprefs, sgs["base"],
		default, isfem, num, "")
	-- The stem specification being built: key "base" plus one key per
	-- modifier. (Kept local so it does not leak into the global environment.)
	local basemodtable = {base=results}
	for _, mod in ipairs(mod_list) do
		-- For each modifier, the argument prefixes are the base prefixes with
		-- the modifier name ("mod", "mod2", ...) prepended.
		local modn_argprefs = {}
		for _, argpref in ipairs(argprefs) do
			table.insert(modn_argprefs, mod .. argpref)
		end
		local modn_results = do_gender_number_1(data, args, modn_argprefs,
			sgs[mod] or {}, default, isfem, num, mod .. "_")
		basemodtable[mod] = modn_results
	end
	return basemodtable
end
-- Run the inflection function registered for declension type TY on
-- COMBINED_STEM, for MOD ("" when working on the base, or "mod_", "mod2_",
-- etc. for a modifier) and NUMGEN (number or number-gender combination, of
-- the sort that forms part of the keys in DATA.FORMS). A type of "-" does
-- nothing; an unregistered type is an error.
function call_inflection(combined_stem, ty, data, mod, numgen)
	if ty == "-" then
		return
	end
	local inflect = inflections[ty]
	if not inflect then
		error("Unknown inflection type '" .. ty .. "'")
	end
	local ar, tr = split_arabic_tr(combined_stem)
	inflect(ar, tr, data, mod, numgen)
end
-- Generate inflections for the stems of a given number/gender combination
-- and for either the base or the modifier. STEMTYPES is a stem-type list
-- (see do_gender_number()), listing all the stems and corresponding
-- declension types. MOD is either "", "mod_", "mod2_", etc. depending on
-- whether we're working on the base or a modifier. NUMGEN is the number or
-- number-gender combination we're working on, of the sort that forms part
-- of the keys in DATA.FORMS, e.g. "sg" or "m_sg".
function call_inflections(stemtypes, data, mod, numgen)
	-- The modifier's fixed number/gender (modN_numgen), if we are a modifier
	-- and one is set.
	local fixed_numgen = mod ~= "" and data[mod .. "numgen"]
	if fixed_numgen then
		-- With modN_numgen= given, only that NUMGEN is generated -- but it is
		-- always generated, since it may affect other numbers
		-- (cf. يَوْم الاِثْنَيْن).
		if fixed_numgen ~= numgen then
			return
		end
	elseif not contains(data.numbers, rsub(numgen, "^.*_", "")) then
		-- Otherwise skip any NUMGEN whose number (the part after the last
		-- underscore) is not among the numbers being displayed.
		return
	end
	for _, pair in ipairs(stemtypes) do
		call_inflection(pair[1], pair[2], data, mod, numgen)
	end
end
-- Generate the entire set of inflections for a noun or adjective, then
-- apply any manual inflection overrides. A manually specified part of
-- speech (|pos=) is also honored.
--
-- INFLECTIONS is an array with one element per number (or number/gender).
-- Each element is a size-two array {STEMSPEC, NUMGEN}: STEMSPEC is a stem
-- specification (the stems and declension types for the base and any
-- modifiers; see do_gender_number()), and NUMGEN is a string identifying
-- the number or number/gender in question (e.g. "sg", "du", "pl", "m_sg",
-- "f_pl", etc.).
function do_inflections_and_overrides(data, args, inflections)
	-- Set the POS before generating inflections so that the change is
	-- reflected in categories.
	local pos = args["pos"]
	if pos then
		data.pos = pos
	end

	for _, entry in ipairs(inflections) do
		local stemspec, numgen = entry[1], entry[2]
		call_inflections(stemspec["base"] or {}, data, "", numgen)
		for _, mod in ipairs(mod_list) do
			call_inflections(stemspec[mod] or {}, data, mod .. "_", numgen)
		end
	end

	handle_lemma_and_overrides(data, args)
end
-- Helper for get_heads(): parse the head stems of either the base or one
-- modifier (see do_gender_number()). ARG1 names the argument holding the
-- first stem and ARGN is the prefix of the arguments holding further
-- stems, which are numbered 2, 3, ... (the number is appended to ARGN).
-- For example, for the singular base ARG1=1 and ARGN="head"; for the
-- first singular modifier ARG1="mod" and ARGN="modhead". MOD is "",
-- "mod_", "mod2_", etc. depending on whether a base or modifier argument
-- is being processed. Returns an array of stems, each a two-item array
-- {COMBINED_STEM, STEM_TYPE}; the array is empty if ARG1 is not given.
function get_heads_1(data, args, arg1, argn, mod)
	local first = args[arg1]
	if not first then
		return {}
	end
	local heads = {}
	-- Parse one stem argument and append the result(s) to HEADS.
	local function add_stem(spec)
		insert_stems(spec, heads, {{spec, ""}}, false, "sg",
			get_pos(data, mod))
	end
	if first == "-" then
		-- Explicitly empty head: record a blank stem of type "-".
		heads[1] = {"", "-"}
	else
		add_stem(first)
	end
	-- Additional stems stop at the first missing number.
	local index = 2
	local extra = args[argn .. index]
	while extra do
		add_stem(extra)
		index = index + 1
		extra = args[argn .. index]
	end
	return heads
end
-- Very similar to do_gender_number(), and returns the same type of
-- structure, but works specifically for the stems of the head (the
-- most basic gender/number combination, e.g. singular for nouns,
-- masculine singular for adjectives and gendered nouns, collective
-- for collective nouns, etc.), including both base and modifier.
-- See do_gender_number().
--
-- HEADTYPE is an English description of the head (e.g. "singular") and
-- is only used in the error message raised when parameter 1 is missing.
--
-- Returns two values. The first is a table keyed by "base" and by each
-- entry of mod_list, whose values are the stem lists produced by
-- get_heads_1(). The second is a boolean that is true when we were called
-- from within a template documentation page (no parameter 1 and the
-- current title is in the Template namespace), in which case a sample
-- stem is substituted; callers use it to decide whether sample arguments
-- need to be substituted for other numbers as well.
function get_heads(data, args, headtype)
	if not args[1] and mw.title.getCurrentTitle().nsText == "Template" then
		return {base={{"{{{1}}}", "tri"}}}, true
	end
	if not args[1] then
		error("Parameter 1 (" .. headtype .. " stem) may not be empty.")
	end
	-- Declared local: this table was previously assigned without "local"
	-- and so leaked into the global environment.
	local basemodtable = {base = get_heads_1(data, args, 1, "head", "")}
	for _, mod in ipairs(mod_list) do
		basemodtable[mod] = get_heads_1(data, args, mod, mod .. "head",
			mod .. "_")
	end
	return basemodtable, false
end
-- The main entry point for noun tables ({{ar-decl-noun}}). Reads the
-- template arguments from the parent frame, computes the singular, dual
-- and plural stems, generates all inflections and returns the rendered
-- table from make_noun_table().
function export.show_noun(frame)
	local args, _, data = init(frame:getParent().args)
	data.pos = "noun"
	-- For plain nouns the number/gender combinations are just the numbers.
	data.numgens = function() return data.numbers end
	data.allnumgens = data.allnumbers

	local singulars, is_template = get_heads(data, args, "singular")
	local plurals
	if is_template then
		-- On a template documentation page, show a sample plural.
		plurals = {base={{"{{{pl}}}", "tri"}}}
	else
		plurals = do_gender_number(data, args, {"pl", "cpl"}, singulars,
			nil, false, "pl")
	end
	-- Always compute the dual so cases like يَوْم الاِثْنَيْن work: a singular
	-- with a dual modifier, where data.number refers only to the singular,
	-- but the dual still has to be computed so that the "modd" modifier
	-- dual argument gets parsed. When it is parsed, the resulting dual
	-- declension for اِثْنَيْن is stored in the modifier slot for all numbers,
	-- including specifically the singular.
	local duals = do_gender_number(data, args, {"d"}, singulars, "d",
		false, "du")

	parse_state_etc_spec(data, args)
	determine_noun_numbers(data, args, plurals)
	do_inflections_and_overrides(data, args, {
		{singulars, "sg"},
		{duals, "du"},
		{plurals, "pl"},
	})

	-- Make the table
	return make_noun_table(data)
end
function any_feminine(data, stem_spec)
	-- Return true if at least one relevant stem in STEM_SPEC is feminine
	-- in form, i.e. its Arabic part (the text before the "/") matches
	-- TAM .. UNUOPT at the end. STEM_SPEC maps "base" and modifier names
	-- to lists of {COMBINED_STEM, TYPE} pairs.
	local function is_feminine(combined_stem)
		return rfind(combined_stem, TAM .. UNUOPT .. "/")
	end
	for slot, stemtypes in pairs(stem_spec) do
		-- A modifier is only examined if modN_numgen= was not given. If not
		-- given, the modifier has to be declined for all numgens; otherwise
		-- only for the given numgen, which should be explicitly specified.
		local examined = slot == "base" or not data[slot .. "_numgen"]
		if examined then
			for _, stemtype in ipairs(stemtypes) do
				if is_feminine(stemtype[1]) then
					return true
				end
			end
		end
	end
	return false
end
function all_feminine(data, stem_spec)
	-- Return true if every relevant stem in STEM_SPEC is feminine in form,
	-- i.e. its Arabic part (the text before the "/") matches
	-- TAM .. UNUOPT at the end. An empty STEM_SPEC yields true.
	local function is_feminine(combined_stem)
		return rfind(combined_stem, TAM .. UNUOPT .. "/")
	end
	for slot, stemtypes in pairs(stem_spec) do
		-- A modifier is only examined if modN_numgen= was not given. If not
		-- given, the modifier has to be declined for all numgens; otherwise
		-- only for the given numgen, which should be explicitly specified.
		local examined = slot == "base" or not data[slot .. "_numgen"]
		if examined then
			for _, stemtype in ipairs(stemtypes) do
				if not is_feminine(stemtype[1]) then
					return false
				end
			end
		end
	end
	return true
end
-- The main entry point for collective noun tables
-- ({{ar-decl-coll-noun}}). Reads the template arguments from the parent
-- frame, computes the collective, singulative, dual, paucal and plural
-- stems, generates all inflections and returns the rendered table from
-- make_noun_table().
function export.show_coll_noun(frame)
	local args, _, data = init(frame:getParent().args)
	data.pos = "noun"
	data.allnumbers = {"coll", "sing", "du", "pauc", "pl"}
	data.engnumberscap["pl"] = "Plural of variety"
	-- For nouns the number/gender combinations are just the numbers.
	data.numgens = function() return data.numbers end
	data.allnumgens = data.allnumbers

	local colls, is_template = get_heads(data, args, "collective")
	-- On a template documentation page, show a sample plural.
	local pls = is_template and {base={{"{{{pl}}}", "tri"}}} or
		do_gender_number(data, args, {"pl", "cpl"}, colls, nil, false, "pl")

	parse_state_etc_spec(data, args)

	-- If collective noun is already feminine in form, don't try to
	-- form a feminine singulative. (This previously tested an undefined
	-- variable, so the check never took effect and the "f" default was
	-- always used.)
	local collfem = any_feminine(data, colls)
	local sings = do_gender_number(data, args, {"sing"}, colls,
		not collfem and "f" or nil, true, "sg")
	local singfem = all_feminine(data, sings)
	local dus = do_gender_number(data, args, {"d"}, sings, "d", singfem, "du")
	local paucs = do_gender_number(data, args, {"pauc"}, sings, "paucp",
		singfem, "pl")

	-- Can manually specify which numbers are to appear, and exactly those
	-- numbers will appear. Otherwise, if any plurals given, plurals appear,
	-- and if singulative given, dual and paucal appear.
	if not parse_number_spec(data, args) then
		data.numbers = {}
		if args[1] ~= "-" then
			table.insert(data.numbers, "coll")
		end
		if #sings["base"] > 0 then
			table.insert(data.numbers, "sing")
		end
		if #dus["base"] > 0 then
			table.insert(data.numbers, "du")
		end
		if #paucs["base"] > 0 then
			table.insert(data.numbers, "pauc")
		end
		if #pls["base"] > 0 then
			table.insert(data.numbers, "pl")
		end
	end

	-- Generate the collective, singulative, dual, paucal and plural forms
	do_inflections_and_overrides(data, args,
		{{colls, "coll"}, {sings, "sing"}, {dus, "du"}, {paucs, "pauc"}, {pls, "pl"}})

	-- Make the table
	return make_noun_table(data)
end
-- The main entry point for singulative noun tables
-- ({{ar-decl-sing-noun}}). Reads the template arguments from the parent
-- frame, computes the singulative, collective, dual, paucal and plural
-- stems, generates all inflections and returns the rendered table from
-- make_noun_table().
function export.show_sing_noun(frame)
	local args, _, data = init(frame:getParent().args)
	data.pos = "noun"
	data.allnumbers = {"sing", "coll", "du", "pauc", "pl"}
	data.engnumberscap["pl"] = "Plural of variety"
	-- For nouns the number/gender combinations are just the numbers.
	data.numgens = function() return data.numbers end
	data.allnumgens = data.allnumbers

	parse_state_etc_spec(data, args)

	local singulatives, is_template = get_heads(data, args, "singulative")
	-- If all singulative nouns feminine in form, form a masculine collective
	local singfem = all_feminine(data, singulatives)
	local collectives = do_gender_number(data, args, {"coll"}, singulatives,
		singfem and "m" or nil, false, "sg")
	local duals = do_gender_number(data, args, {"d"}, singulatives, "d",
		singfem, "du")
	local paucals = do_gender_number(data, args, {"pauc"}, singulatives,
		"paucp", singfem, "pl")
	local plurals
	if is_template then
		-- On a template documentation page, show a sample plural.
		plurals = {base={{"{{{pl}}}", "tri"}}}
	else
		plurals = do_gender_number(data, args, {"pl", "cpl"}, collectives,
			nil, false, "pl")
	end

	-- Can manually specify which numbers are to appear, and exactly those
	-- numbers will appear. Otherwise, if any plurals given, plurals appear;
	-- if singulative given or derivable, it and dual and paucal will appear.
	if not parse_number_spec(data, args) then
		data.numbers = {}
		if args[1] ~= "-" then
			table.insert(data.numbers, "sing")
		end
		-- The remaining numbers appear only if they have base stems.
		local optional = {
			{collectives, "coll"},
			{duals, "du"},
			{paucals, "pauc"},
			{plurals, "pl"},
		}
		for _, entry in ipairs(optional) do
			if #entry[1]["base"] > 0 then
				table.insert(data.numbers, entry[2])
			end
		end
	end

	-- Generate the singulative, collective, dual, paucal and plural forms
	do_inflections_and_overrides(data, args, {
		{singulatives, "sing"},
		{collectives, "coll"},
		{duals, "du"},
		{paucals, "pauc"},
		{plurals, "pl"},
	})

	-- Make the table
	return make_noun_table(data)
end
-- Shared implementation behind the adjective, gendered-noun and numeral
-- entry points. ISADJ selects adjective behaviour (number handling via
-- parse_number_spec() and rendering via make_adj_table()); otherwise noun
-- behaviour is used (determine_noun_numbers() and
-- make_gendered_noun_table()). POS is stored as the part of speech.
function show_gendered(frame, isadj, pos)
	local args, _, data = init(frame:getParent().args)
	data.pos = pos

	-- Build every GENDER_NUMBER key, gender-major, from data.allgenders
	-- and the given list of numbers.
	local function cross_with_genders(numbers)
		local combined = {}
		for _, gender in ipairs(data.allgenders) do
			for _, number in ipairs(numbers) do
				table.insert(combined, gender .. "_" .. number)
			end
		end
		return combined
	end
	-- data.numbers is looked up at call time, since it is only determined
	-- further below.
	data.numgens = function()
		return cross_with_genders(data.numbers)
	end
	data.allnumgens = cross_with_genders(data.allnumbers)

	parse_state_etc_spec(data, args)

	local masc_sg = get_heads(data, args, 'masculine singular')
	-- Always do all of these so cases like يَوْم الاِثْنَيْن work.
	-- See comment in show_noun().
	local fem_sg = do_gender_number(data, args, {"f"}, masc_sg, "f", true,
		"sg")
	local masc_du = do_gender_number(data, args, {"d"}, masc_sg, "d", false,
		"du")
	local fem_du = do_gender_number(data, args, {"fd"}, fem_sg, "d", true,
		"du")
	local masc_pl = do_gender_number(data, args, {"pl", "cpl"}, masc_sg,
		isadj and "p" or nil, false, "pl")
	local fem_pl = do_gender_number(data, args, {"fpl", "cpl"}, fem_sg, "fp",
		true, "pl")

	if isadj then
		parse_number_spec(data, args)
	else
		determine_noun_numbers(data, args, masc_pl)
	end

	-- Generate the singular, dual and plural forms
	do_inflections_and_overrides(data, args, {
		{masc_sg, "m_sg"}, {fem_sg, "f_sg"},
		{masc_du, "m_du"}, {fem_du, "f_du"},
		{masc_pl, "m_pl"}, {fem_pl, "f_pl"},
	})

	-- Make the table
	if not isadj then
		return make_gendered_noun_table(data)
	end
	return make_adj_table(data)
end
-- The main entry point for gendered noun tables
-- ({{ar-decl-gendered-noun}}). Delegates to show_gendered() with ISADJ
-- false and the part of speech "noun", and returns its result.
function export.show_gendered_noun(frame)
return show_gendered(frame, false, "noun")
end
-- The main entry point for numeral tables ({{ar-decl-numeral}}). Same as
-- using show_gendered_noun() with pos=numeral: delegates to
-- show_gendered() with ISADJ false and the part of speech "numeral".
function export.show_numeral(frame)
return show_gendered(frame, false, "numeral")
end
-- The main entry point for adjective tables ({{ar-decl-adj}}). Delegates
-- to show_gendered() with ISADJ true and the part of speech "adjective",
-- and returns its result.
function export.show_adj(frame)
return show_gendered(frame, true, "adjective")
end
-- Inflection functions
function do_translit(term)
	-- Transliterate TERM with lang:transliterate(). If that yields
	-- nothing, record the failure with track("cant-translit") and fall
	-- back to BOGUS_CHAR (or to track()'s own result if it is false/nil).
	-- Exactly one value is returned.
	local translit = lang:transliterate(term)
	if translit then
		return translit
	end
	return track("cant-translit") and BOGUS_CHAR
end
function split_arabic_tr(term)
	-- Split a combined ARABIC or ARABIC/TRANSLIT expression into two
	-- return values, the Arabic text and its transliteration.
	--   * "" yields "", "".
	--   * A term without a slash is returned together with its automatic
	--     transliteration from do_translit().
	--   * A term with exactly one slash is split at the slash.
	-- Raises an error if TERM contains more than one slash.
	if term == "" then
		return "", ""
	end
	if not rfind(term, "/") then
		return term, do_translit(term)
	end
	-- Declared local: this was previously assigned without "local" and
	-- leaked into the global environment.
	local splitvals = rsplit(term, "/")
	if #splitvals ~= 2 then
		error("Must have at most one slash in a combined Arabic/translit expr: '" .. term .. "'")
	end
	return splitvals[1], splitvals[2]
end
function reorder_shadda(word)
	-- NFC normalisation, which MediaWiki applies to all Unicode strings,
	-- turns shadda+short-vowel (including the tanwīn vowels -an -in -un)
	-- into short-vowel+shadda. That order makes the detection process
	-- inconvenient, so put the shadda back in front of the diacritic.
	-- Only the substituted string is returned.
	local diacritic_then_shadda = "(" .. DIACRITIC_ANY_BUT_SH .. ")" .. SH
	local reordered = rsub(word, diacritic_then_shadda, SH .. "%1")
	return reordered
end
-- Combine PREFIX, AR/TR, and ENDING in that order. PREFIX and ENDING
-- can be of the form ARABIC/TRANSLIT. The Arabic and translit parts are
-- separated out and grouped together, resulting in a string of the
-- form ARABIC/TRANSLIT (TRANSLIT will always be present, computed
-- automatically if not present in the source). The return value is actually a
-- list of ARABIC/TRANSLIT strings because hamza resolution is applied to
-- ARABIC, which may produce multiple outcomes (all of which will have the
-- same TRANSLIT).
function combine_with_ending(prefix, ar, tr, ending)
	local prefixar, prefixtr = split_arabic_tr(prefix)
	local endingar, endingtr = split_arabic_tr(ending)
	-- When calling hamza_seat(), leave out prefixes, which we expect to be
	-- clitics like وَ. (In case the prefix is a separate word, it won't matter
	-- whether we include it in the text passed to hamza_seat().)
	-- Declared local: "allar" and "allartr" were previously assigned
	-- without "local" and leaked into the global environment.
	local allar = hamza_seat(ar .. endingar)
	-- Convert ...īān to ...iyān in case of stems ending in -ī or -ū
	-- (e.g. kubrī "bridge"), i.e. whenever the ending's transliteration
	-- starts with a vowel.
	if rfind(endingtr, "^[aeiouāēīōū]") then
		if rfind(tr, "ī$") then
			tr = rsub(tr, "ī$", "iy")
		elseif rfind(tr, "ū$") then
			tr = rsub(tr, "ū$", "uw")
		end
	end
	tr = prefixtr .. tr .. endingtr
	-- One ARABIC/TRANSLIT string per hamza variant, all sharing TR.
	local allartr = {}
	for _, arval in ipairs(allar) do
		table.insert(allartr, prefixar .. arval .. "/" .. tr)
	end
	return allartr
end
-- Combine PREFIX, STEM/TR and ENDING (in that order) and add the result
-- to the list in DATA.FORMS[KEY], creating the list if it does not exist
-- yet and avoiding duplicates (via insert_if_not()). ENDING may be a
-- single ending or a list of endings; a list is distributed over the
-- remaining parts, and an empty list adds nothing. PREFIX and/or ENDING
-- can be of the form ARABIC/TRANSLIT (the stem is already split into
-- Arabic STEM and Latin TR). Note that each item inserted is itself a
-- list of ARABIC/TRANSLIT strings; if more than one is present in that
-- list, they represent hamza variants, i.e. different ways of writing a
-- hamza sound, such as مُبْتَدَؤُون vs. مُبْتَدَأُون (see init_data()).
function add_inflection(data, key, prefix, stem, tr, ending)
	local forms = data.forms[key]
	if forms == nil then
		forms = {}
		data.forms[key] = forms
	end
	-- Normalise a lone ending to a one-element list.
	local endings = type(ending) == "table" and ending or {ending}
	for _, endingval in ipairs(endings) do
		insert_if_not(forms,
			combine_with_ending(prefix, stem, tr, endingval))
	end
end
-- Form inflections from combination of STEM, with transliteration TR,
-- and ENDINGS (and definite article where necessary, plus any specified
-- prefixes) and store in DATA, for the number or gender/number
-- determined by MOD ("", "mod_", "mod2_", etc.; see call_inflection()) and
-- NUMGEN ("sg", "du", "pl", or "m_sg", "f_pl", etc. for adjectives). ENDINGS
-- is an array of 15 values, each of which is a string or array of
-- alternatives. The order of ENDINGS is indefinite nom, acc, gen; definite
-- nom, acc, gen; construct-state nom, acc, gen; informal indefinite, definite,
-- construct; lemma indefinite, definite, construct. (Normally the lemma is
-- based off of the indefinite, but if the inflection has been restricted to
-- particular states, it comes from one of those states, in the order
-- indefinite, definite, construct.) See also add_inflection() for more info
-- on exactly what is inserted into DATA.
function add_inflections(stem, tr, data, mod, numgen, endings)
	stem = canon_hamza(stem)
	assert(#endings == 15)
	local ismod = mod ~= ""
	-- If working on modifier and modN_numgen= is given, it better agree with
	-- NUMGEN; the case where it doesn't agree should have been caught in
	-- call_inflections().
	if ismod and data[mod .. "numgen"] then
		assert(data[mod .. "numgen"] == numgen)
	end
	-- Definite-state stem and transliteration. The article is left off for
	-- the unknown stem "?" and when modN_omitarticle / omitarticle is set.
	local defstem, deftr
	if stem == "?" or data[mod .. "omitarticle"] then
		defstem = stem
		deftr = tr
	else
		-- apply sun-letter assimilation and hamzat al-wasl elision
		defstem = rsub("الْ" .. stem, "^الْ([سشصتثطدذضزژظنرل])", "ال%1ّ")
		defstem = rsub(defstem, "^الْ([اٱ])([ًٌٍَُِ])", "ال%2%1")
		deftr = rsub("al-" .. tr, "^al%-([sšṣtṯṭdḏḍzžẓnrḷ])", "a%1-%1")
	end
	-- For a given MOD spec, is the previous word (base or modifier) a noun?
	-- We assume the base is always a noun in this case, and otherwise
	-- look at the value of modN_idafa.
	local function prev_mod_is_noun(modspec)
		if modspec == "mod_" then
			return true
		end
		if modspec == "mod2_" then
			return data["mod_idafa"]
		end
		-- Declared local: this was previously assigned without "local"
		-- and leaked into the global environment.
		local modnum = assert_rsub(modspec, "^mod([0-9]+)_$", "%1")
		modnum = modnum - 1
		return data["mod" .. modnum .. "_idafa"]
	end
	-- A modifier pinned by modN_numgen= is stored under every displayed
	-- number/gender; everything else is stored only under NUMGEN.
	local numgens = ismod and data[mod .. "numgen"] and data.numgens() or {numgen}
	-- "defcon" means definite adjective modifying construct state noun. We
	-- add a ... before the adjective (and after the construct-state noun) to
	-- indicate that a nominal modifier would go between noun and adjective.
	local stems = {ind = stem, def = defstem, con = stem,
		defcon = "... " .. defstem}
	local trs = {ind = tr, def = deftr, con = tr, defcon = "... " .. deftr}
	for _, ng in ipairs(numgens) do
		for _, state in ipairs(data.allstates) do
			for _, case in ipairs(data.allcases_with_lemma) do
				-- We are generating the inflections for STATE, but sometimes
				-- we want to use the inflected form of a different state, e.g.
				-- if modN_state= or basestate= is set to some particular state.
				-- If we're dealing with an adjectival modifier, then in
				-- place of "con" we use "defcon" if immediately after a noun
				-- (see comment above), else "def".
				local thestate = ismod and data[mod .. "state"] or
					ismod and not data[mod .. "idafa"] and state == "con" and
						(prev_mod_is_noun(mod) and "defcon" or "def") or
					not ismod and data.basestate or
					state
				local is_lemmainf = case == "lemma" or case == "inf"
				-- Don't substitute value of modcase for lemma/informal "cases"
				local thecase = is_lemmainf and case or
					ismod and data[mod .. "case"] or case
				-- The key names the slot being filled (CASE/NG/STATE as
				-- iterated); the stem and ending come from the possibly
				-- substituted THESTATE/THECASE.
				add_inflection(data, mod .. case .. "_" .. ng .. "_" .. state,
					data[mod .. "prefix"] or "",
					stems[thestate], trs[thestate],
					endings[data.statecases[thestate][thecase]])
			end
		end
	end
end
-- Record the declension type of one particular declension (e.g. masculine
-- singular for adjectives), both as a category and in a type variable
-- (e.g. m_sg_type) stored in DATA.FORMS. MOD and NUMGEN are as in
-- call_inflection(). CATVALUE is the category name and ENGVALUE is the
-- English description of the declension type; either may be "" to skip
-- that part. Categories are only added for the base (MOD == ""). In both
-- values the following placeholders are expanded: NOUN becomes the part
-- of speech (DATA.POS), SINGULAR becomes the English name of the number
-- in NUMGEN (e.g. "singular", "dual" or "plural"), and BROKSING is the
-- same but with "broken plural" in place of "plural" and "broken paucal"
-- in place of "paucal".
function insert_cat(data, mod, numgen, catvalue, engvalue)
	local pos = data.pos
	-- Strip any gender prefix ("m_", "f_") to get the bare number.
	local number_name = data.engnumbers[rsub(numgen, "^.*_", "")]
	assert(number_name ~= nil)
	local broken_name = rsub(number_name, "plural", "broken plural")
	broken_name = rsub(broken_name, "paucal", "broken paucal")

	-- Expand the NOUN / SINGULAR / BROKSING placeholders in TEMPLATE.
	local function expand(template)
		local expanded = rsub(template, "NOUN", pos)
		expanded = rsub(expanded, "SINGULAR", number_name)
		expanded = rsub(expanded, "BROKSING", broken_name)
		return expanded
	end

	-- These two checks look at the unexpanded values.
	local uses_broksing = rfind(catvalue, "BROKSING") or
		rfind(engvalue, "BROKSING")
	if rfind(broken_name, "broken plural") and uses_broksing then
		table.insert(data.categories, "Arabic " .. pos .. "s with broken plural")
	end
	if rfind(catvalue, "irregular") or rfind(engvalue, "irregular") then
		table.insert(data.categories, "Arabic irregular " .. pos .. "s")
	end

	catvalue = expand(catvalue)
	engvalue = expand(engvalue)

	if mod == "" and catvalue ~= "" then
		insert_if_not(data.categories, catvalue)
	end
	if engvalue ~= "" then
		local key = mod .. numgen .. "_type"
		local types = data.forms[key]
		if types == nil then
			types = {}
			data.forms[key] = types
		end
		insert_if_not(types, engvalue)
	end
	-- Definite-only declensions get an extra category.
	if contains(data.states, "def") and not contains(data.states, "ind") then
		insert_if_not(data.categories, "Arabic definite " .. pos .. "s")
	end
end
-- Return a true value if we're handling modifier inflections (MOD is not
-- "") and the modifier's case, DATA[MOD .. "case"], is limited to an
-- oblique case (gen or acc; typically genitive, in an ʾidāfa
-- construction). This is used when returning lemma inflections -- the
-- modifier part of the lemma should agree in case with the modifier's
-- case if it's restricted in case. Returns false for the base, and the
-- (false or nil) case value itself when no case restriction is set.
function mod_oblique(mod, data)
	if mod == "" then
		return false
	end
	local modcase = data[mod .. "case"]
	if not modcase then
		return modcase
	end
	return modcase == "acc" or modcase == "gen"
end
-- Similar to mod_oblique() but only for the case where the modifier's
-- case, DATA[MOD .. "case"], is limited to the accusative (which is rare
-- or nonexistent in practice). Returns false for the base, and the
-- (false or nil) case value itself when no case restriction is set.
function mod_acc(mod, data)
	if mod == "" then
		return false
	end
	local modcase = data[mod .. "case"]
	if not modcase then
		return modcase
	end
	return modcase == "acc"
end
-- Handle triptote and diptote inflections.
--
-- STEM and TR are the Arabic stem and its transliteration; DATA, MOD and
-- NUMGEN are passed through to add_inflections() and insert_cat(). IS_DIP
-- selects the diptote endings rather than the triptote ones. LC selects
-- the "long construct" variant, which uses the endings UU/AA/II in the
-- construct state.
--
-- The formal (case-marked) endings are added first, with the informal and
-- lemma slots left empty; a second add_inflections() call, chosen
-- according to the shape of the stem, then fills only the informal and
-- lemma slots. See add_inflections() for the order of the 15 slots.
function triptote_diptote(stem, tr, data, mod, numgen, is_dip, lc)
-- Remove any case ending
if rfind(stem, "[" .. UN .. U .. "]$") then
stem = rsub(stem, "[" .. UN .. U .. "]$", "")
tr = rsub(tr, "un?$", "")
end
-- special-case for صلوة pronounced ṣalāh; check translit
local is_aah = rfind(stem, TAM .. "$") and rfind(tr, "āh$")
-- For stems ending in TAM, make the transliteration end in -t (replacing
-- a final -h, or appending -t if neither is present) so that the formal
-- case endings attach to it.
if rfind(stem, TAM .. "$") then
if rfind(tr, "h$") then
tr = rsub(tr, "h$", "t")
elseif not rfind(tr, "t$") then
tr = tr .. "t"
end
end
-- Formal endings. The non-diptote indefinite accusative is AN followed by
-- ALIF, except that ALIF is left off when the stem ends in HAMZA_ON_ALIF
-- or TAM, or in AMAD/ALIF followed by HAMZA.
add_inflections(stem, tr, data, mod, numgen,
{is_dip and U or UN,
is_dip and A or AN .. ((rfind(stem, "[" .. HAMZA_ON_ALIF .. TAM .. "]$")
or rfind(stem, "[" .. AMAD .. ALIF .. "]" .. HAMZA .. "$")
) and "" or ALIF),
is_dip and A or IN,
U, A, I,
lc and UU or U,
lc and AA or A,
lc and II or I,
{}, {}, {}, -- omit informal inflections
{}, {}, {}, -- omit lemma inflections
})
-- add category and informal and lemma inflections
local tote = lc and "long construct" or is_dip and "diptote" or "triptote"
local singpl_tote = "BROKSING " .. tote
local cat_prefix = "Arabic NOUNs with " .. tote .. " BROKSING"
-- since we're checking translit for -āh we probably don't need to
-- check stem too
if is_aah or rfind(stem, "[" .. AMAD .. ALIF .. "]" .. TAM .. "$") then
-- Stem in -āh: the final -t of the transliteration is dropped again and
-- re-supplied per slot through the translit-only endings "/t" and "/h".
add_inflections(stem, rsub(tr, "t$", ""), data, mod, numgen,
{{}, {}, {},
{}, {}, {},
{}, {}, {},
"/t", "/t", "/t", -- informal pron. is -āt
"/h", "/h", "/t", -- lemma uses -āh
})
insert_cat(data, mod, numgen, cat_prefix .. " in -āh",
singpl_tote .. " in " .. make_link(HYPHEN .. AAH))
elseif rfind(stem, TAM .. "$") then
-- Other stems ending in TAM: the transliterated -t appears only in the
-- construct state of the informal and lemma forms.
add_inflections(stem, rsub(tr, "t$", ""), data, mod, numgen,
{{}, {}, {},
{}, {}, {},
{}, {}, {},
"", "", "/t",
"", "", "/t",
})
insert_cat(data, mod, numgen, cat_prefix .. " in -a",
singpl_tote .. " in " .. make_link(HYPHEN .. AH))
elseif lc then
-- Long construct: the informal and lemma construct forms take UU.
add_inflections(stem, tr, data, mod, numgen,
{{}, {}, {},
{}, {}, {},
{}, {}, {},
"", "", UU,
"", "", UU,
})
insert_cat(data, mod, numgen, cat_prefix,
singpl_tote)
else
-- also special-case the nisba ending, which has an informal
-- pronunciation.
if rfind(stem, IY .. SH .. "$") then
-- Informal forms drop the final SH of the stem and turn -iyy into -ī in
-- the transliteration; the lemma keeps the full stem.
local infstem = rsub(stem, SH .. "$", "")
local inftr = rsub(tr, "iyy$", "ī")
-- add informal and lemma inflections separately
add_inflections(infstem, inftr, data, mod, numgen,
{{}, {}, {},
{}, {}, {},
{}, {}, {},
"", "", "",
{}, {}, {},
})
add_inflections(stem, tr, data, mod, numgen,
{{}, {}, {},
{}, {}, {},
{}, {}, {},
{}, {}, {},
"", "", "",
})
else
-- Plain stem: informal and lemma forms are the bare stem.
add_inflections(stem, tr, data, mod, numgen,
{{}, {}, {},
{}, {}, {},
{}, {}, {},
"", "", "",
"", "", "",
})
end
insert_cat(data, mod, numgen, "Arabic NOUNs with basic " .. tote .. " BROKSING",
"basic " .. singpl_tote)
end
end
-- Regular triptote: handler for declension type "tri". Calls
-- triptote_diptote() with IS_DIP false and LC omitted.
inflections["tri"] = function(stem, tr, data, mod, numgen)
triptote_diptote(stem, tr, data, mod, numgen, false)
end
-- Regular diptote: handler for declension type "di". Calls
-- triptote_diptote() with IS_DIP true and LC omitted.
inflections["di"] = function(stem, tr, data, mod, numgen)
triptote_diptote(stem, tr, data, mod, numgen, true)
end
-- Elative and color/defect adjective: usually declined like a diptote,
-- but invariable when the stem ends in ALIF or AMAQ.
function elative_color_defect(stem, tr, data, mod, numgen)
	local ends_in_alif = rfind(stem, "[" .. ALIF .. AMAQ .. "]$")
	if not ends_in_alif then
		triptote_diptote(stem, tr, data, mod, numgen, true)
		return
	end
	invariable(stem, tr, data, mod, numgen)
end
-- Elative: handler for declension type "el". Usually same as diptote,
-- might be invariable; the choice is made by elative_color_defect().
inflections["el"] = function(stem, tr, data, mod, numgen)
elative_color_defect(stem, tr, data, mod, numgen)
end
-- Color/defect adjective: handler for declension type "cd". Same as
-- elative; both delegate to elative_color_defect().
inflections["cd"] = function(stem, tr, data, mod, numgen)
elative_color_defect(stem, tr, data, mod, numgen)
end
-- Triptote with lengthened ending in the construct state: handler for
-- declension type "lc". Calls triptote_diptote() with IS_DIP false and
-- LC true.
inflections["lc"] = function(stem, tr, data, mod, numgen)
triptote_diptote(stem, tr, data, mod, numgen, false, true)
end
function in_defective(stem, tr, data, mod, numgen, tri)
	-- Defective declension in -in. STEM must end in IN, which is stripped
	-- (along with "in" in TR) before the endings are attached; an error is
	-- raised otherwise. TRI selects the "triptote" variant, whose
	-- indefinite accusative is IY .. AN .. ALIF; the "diptote" variant uses
	-- IY .. A instead. DATA, MOD and NUMGEN are passed through to
	-- add_inflections() and insert_cat().
	local in_suffix = IN .. "$"
	if not rfind(stem, in_suffix) then
		error("'in' declension stem should end in -in: '" .. stem .. "'")
	end
	stem = rsub(stem, in_suffix, "")
	tr = rsub(tr, "in$", "")

	local acc_ind_ending = tri and IY .. AN .. ALIF or IY .. A
	-- FIXME: What should happen with the lemma when modifier case
	-- is limited to the accusative and modifier state is e.g. definite?
	-- Should the lemma end in -iya or -ī? In practice this will rarely
	-- if ever happen.
	local lemma_ind_ending = mod_acc(mod, data) and acc_ind_ending or IN
	local endings = {
		-- indefinite nom, acc, gen
		IN, acc_ind_ending, IN,
		-- definite nom, acc, gen
		II, IY .. A, II,
		-- construct nom, acc, gen
		II, IY .. A, II,
		-- informal indefinite, definite, construct
		II, II, II,
		-- lemma indefinite, definite, construct
		lemma_ind_ending, II, II,
	}
	add_inflections(stem, tr, data, mod, numgen, endings)

	local tote = tri and "triptote" or "diptote"
	insert_cat(data, mod, numgen,
		"Arabic NOUNs with " .. tote .. " BROKSING in -in",
		"BROKSING " .. tote .. " in " .. make_link(HYPHEN .. IN))
end
function detect_in_type(stem, ispl)
	-- Decide which variant of the defective -in declension STEM takes.
	-- Returns "diin" only for plurals (ISPL true) whose shape matches the
	-- pattern below, e.g. layālin; every other -in word is "triin".
	if not ispl then
		return "triin"
	end
	local diptote_shape = "^" .. CONS .. AOPT .. CONS .. AOPTA .. CONS ..
		IN .. "$"
	if rfind(stem, diptote_shape) then
		return "diin"
	end
	return "triin"
end
-- Defective in -in: handler for declension type "in". The variant is
-- auto-detected by detect_in_type(), which is told the stem is a plural
-- whenever NUMGEN contains "pl"; in_defective() is then called with TRI
-- true if the detected type is "triin" and false otherwise.
inflections["in"] = function(stem, tr, data, mod, numgen)
in_defective(stem, tr, data, mod, numgen,
detect_in_type(stem, rfind(numgen, "pl")) == "triin")
end
-- Defective in -in, force "triptote" variant: handler for declension type
-- "triin". Calls in_defective() with TRI true, skipping auto-detection.
inflections["triin"] = function(stem, tr, data, mod, numgen)
in_defective(stem, tr, data, mod, numgen, true)
end
-- Defective in -in, force "diptote" variant: handler for declension type
-- "diin". Calls in_defective() with TRI false, skipping auto-detection.
inflections["diin"] = function(stem, tr, data, mod, numgen)
in_defective(stem, tr, data, mod, numgen, false)
end
-- Defective in -an: handler for declension type "an". Comes in two
-- variants, depending on whether the stem is spelled with tall alif
-- (AN .. ALIF) or alif maqṣūra (AN .. AMAQ); any other stem is an error.
-- The indefinite forms (and the indefinite lemma) keep the -an ending in
-- its original spelling, and every other form takes the matching long
-- ending (AA or AAMAQ).
inflections["an"] = function(stem, tr, data, mod, numgen)
	local tall_alif
	if rfind(stem, AN .. ALIF .. "$") then
		tall_alif = true
	elseif rfind(stem, AN .. AMAQ .. "$") then
		tall_alif = false
	else
		error("Invalid stem for 'an' declension type: " .. stem)
	end
	local final_letter = tall_alif and ALIF or AMAQ
	stem = rsub(stem, AN .. final_letter .. "$", "")
	tr = rsub(tr, "an$", "")

	local indef = AN .. final_letter
	local long = tall_alif and AA or AAMAQ
	add_inflections(stem, tr, data, mod, numgen, {
		-- indefinite nom, acc, gen
		indef, indef, indef,
		-- definite nom, acc, gen
		long, long, long,
		-- construct nom, acc, gen
		long, long, long,
		-- informal indefinite, definite, construct
		long, long, long,
		-- lemma indefinite, definite, construct
		indef, long, long,
	})
	-- FIXME: Should we distinguish between tall alif and alif maqṣūra?
	insert_cat(data, mod, numgen, "Arabic NOUNs with BROKSING in -an",
		"BROKSING in " .. make_link(HYPHEN .. AN .. final_letter))
end
function invariable(stem, tr, data, mod, numgen)
	-- Invariable declension: every one of the 15 ending slots is the empty
	-- string, so all forms are the bare stem (plus whatever article or
	-- prefix add_inflections() supplies).
	local blank_endings = {}
	for slot = 1, 15 do
		blank_endings[slot] = ""
	end
	add_inflections(stem, tr, data, mod, numgen, blank_endings)
	insert_cat(data, mod, numgen, "Arabic NOUNs with invariable BROKSING",
		"BROKSING invariable")
end
-- Invariable in -ā (non-loanword type): handler for declension type
-- "inv". Delegates to invariable().
inflections["inv"] = function(stem, tr, data, mod, numgen)
invariable(stem, tr, data, mod, numgen)
end
-- Invariable in -ā (loanword type, behaving in the dual as if ending in -a, I think!):
-- handler for declension type "lwinv". Within this handler the forms are
-- generated exactly as for "inv", by delegating to invariable().
inflections["lwinv"] = function(stem, tr, data, mod, numgen)
invariable(stem, tr, data, mod, numgen)
end
-- Duals: handler for declension type "d". The stem must end in the
-- nominative dual ending -ān(i), written either with ALIF or with AMAD;
-- that ending is stripped (AMAD leaves HAMZA_PH behind) and the dual
-- endings are attached. When the modifier's case is restricted to an
-- oblique case (see mod_oblique()), the lemma uses the oblique endings.
inflections["d"] = function(stem, tr, data, mod, numgen)
	local ni_suffix = NI .. "?$"
	if rfind(stem, ALIF .. ni_suffix) then
		stem = rsub(stem, AOPTA .. ni_suffix, "")
	elseif rfind(stem, AMAD .. ni_suffix) then
		stem = rsub(stem, AMAD .. ni_suffix, HAMZA_PH)
	else
		error("Dual stem should end in -ān(i): '" .. stem .. "'")
	end
	tr = rsub(tr, "āni?$", "")

	local lemma_free, lemma_con
	if mod_oblique(mod, data) then
		lemma_free, lemma_con = AYN, AYSK
	else
		lemma_free, lemma_con = AAN, AA
	end
	add_inflections(stem, tr, data, mod, numgen, {
		-- indefinite nom, acc, gen
		AANI, AYNI, AYNI,
		-- definite nom, acc, gen
		AANI, AYNI, AYNI,
		-- construct nom, acc, gen
		AA, AYSK, AYSK,
		-- informal indefinite, definite, construct
		AYN, AYN, AYSK,
		-- lemma indefinite, definite, construct
		lemma_free, lemma_free, lemma_con,
	})
	insert_cat(data, mod, numgen, "", "dual in " .. make_link(HYPHEN .. AANI))
end
-- Sound masculine plural: handler for declension type "smp". The stem
-- must end in -ūn(a) (UUNA, optionally followed by one more character
-- matched by "?"); that ending is stripped and the sound masculine plural
-- endings are attached. When the modifier's case is restricted to an
-- oblique case (see mod_oblique()), the lemma uses the oblique endings.
inflections["smp"] = function(stem, tr, data, mod, numgen)
	local plural_suffix = UUNA .. "?$"
	if not rfind(stem, plural_suffix) then
		error("Sound masculine plural stem should end in -ūn(a): '" .. stem .. "'")
	end
	stem = rsub(stem, plural_suffix, "")
	tr = rsub(tr, "ūna?$", "")

	local lemma_free, lemma_con
	if mod_oblique(mod, data) then
		lemma_free, lemma_con = IIN, II
	else
		lemma_free, lemma_con = UUN, UU
	end
	add_inflections(stem, tr, data, mod, numgen, {
		-- indefinite nom, acc, gen
		UUNA, IINA, IINA,
		-- definite nom, acc, gen
		UUNA, IINA, IINA,
		-- construct nom, acc, gen
		UU, II, II,
		-- informal indefinite, definite, construct
		IIN, IIN, II,
		-- lemma indefinite, definite, construct
		lemma_free, lemma_free, lemma_con,
	})
	-- use SINGULAR because conceivably this might be used with the paucal
	-- instead of plural
	insert_cat(data, mod, numgen, "Arabic NOUNs with sound masculine SINGULAR",
		"sound masculine SINGULAR")
end
-- Sound feminine plural: handler for declension type "sfp". The stem must
-- end in -āt(un), i.e. ALIF or AMAD followed by T and optionally UN. A
-- final UN (and "un" in the transliteration) is stripped and the sound
-- feminine plural endings are attached; informal and lemma forms are the
-- bare stem.
inflections["sfp"] = function(stem, tr, data, mod, numgen)
	local expected_shape = "[" .. ALIF .. AMAD .. "]" .. T .. UN .. "?$"
	if not rfind(stem, expected_shape) then
		error("Sound feminine plural stem should end in -āt(un): '" .. stem .. "'")
	end
	stem = rsub(stem, UN .. "$", "")
	tr = rsub(tr, "un$", "")

	local bare = ""
	add_inflections(stem, tr, data, mod, numgen, {
		-- indefinite nom, acc, gen
		UN, IN, IN,
		-- definite nom, acc, gen
		U, I, I,
		-- construct nom, acc, gen
		U, I, I,
		-- informal indefinite, definite, construct
		bare, bare, bare,
		-- lemma indefinite, definite, construct
		bare, bare, bare,
	})
	-- use SINGULAR because this might be used with the paucal
	-- instead of plural
	insert_cat(data, mod, numgen, "Arabic NOUNs with sound feminine SINGULAR",
		"sound feminine SINGULAR")
end
-- Plural of defective in -an: handler for declension type "awnp". The
-- stem must end in -awn(a) (AWNA, optionally followed by one more
-- character matched by "?"); that ending is stripped and the -awna
-- plural endings are attached. When the modifier's case is restricted to
-- an oblique case (see mod_oblique()), the lemma uses the oblique endings.
inflections["awnp"] = function(stem, tr, data, mod, numgen)
	local plural_suffix = AWNA .. "?$"
	if not rfind(stem, plural_suffix) then
		error("'awnp' plural stem should end in -awn(a): '" .. stem .. "'")
	end
	stem = rsub(stem, plural_suffix, "")
	tr = rsub(tr, "awna?$", "")

	local lemma_free, lemma_con
	if mod_oblique(mod, data) then
		lemma_free, lemma_con = AYN, AYSK
	else
		lemma_free, lemma_con = AWN, AWSK
	end
	add_inflections(stem, tr, data, mod, numgen, {
		-- indefinite nom, acc, gen
		AWNA, AYNA, AYNA,
		-- definite nom, acc, gen
		AWNA, AYNA, AYNA,
		-- construct nom, acc, gen
		AWSK, AYSK, AYSK,
		-- informal indefinite, definite, construct
		AYN, AYN, AYSK,
		-- lemma indefinite, definite, construct
		lemma_free, lemma_free, lemma_con,
	})
	-- use SINGULAR because conceivably this might be used with the paucal
	-- instead of plural
	insert_cat(data, mod, numgen, "Arabic NOUNs with sound SINGULAR in -awna",
		"sound SINGULAR in " .. make_link(HYPHEN .. AWNA))
end
-- Unknown: handler for declension type "?". The stem and transliteration
-- passed in are ignored; every form is generated from the literal "?"
-- with empty endings in all 15 slots.
inflections["?"] = function(stem, tr, data, mod, numgen)
	local blank_endings = {}
	for slot = 1, 15 do
		blank_endings[slot] = ""
	end
	add_inflections("?", "?", data, mod, numgen, blank_endings)
	insert_cat(data, mod, numgen, "Arabic NOUNs with unknown SINGULAR",
		"SINGULAR unknown")
end
-- Detect declension of noun or adjective stem or lemma. We allow triptotes,
-- diptotes and sound plurals to either come with ʾiʿrāb or not. We detect
-- some cases where vowels are missing, when it seems fairly unambiguous to
-- do so. ISFEM is true if we are dealing with a feminine stem (not
-- currently used and needs to be rethought). NUM is "sg", "du", or "pl",
-- depending on the number of the stem.
--
-- POS is the part of speech, generally "noun" or "adjective". Used to
-- distinguish nouns and adjectives of the فَعْلَان type. There are nouns of
-- this type and they generally are triptotes, e.g. قَطْرَان "tar"
-- and شَيْطَان "devil". An additional complication is that the user can set
-- the POS to something else, like "numeral". We don't use this POS for
-- modifiers, where we determine whether they are noun-like or adjective-like
-- according to whether mod_idafa= is true.
--
-- Some unexpectedly diptote nouns/adjectives:
--
-- jiʿrān in ʾabū jiʿrān "dung beetle"
-- distributive numbers: ṯunāʾ "two at a time", ṯulāṯ/maṯlaṯ "three at a time",
-- rubāʿ "four at a time" (not a regular diptote pattern, cf. triptote
-- junāḥ "misdemeanor, sin", nujār "origin, root", nuḥām "flamingo")
-- jahannam (f.) "hell"
-- many names: jilliq/jillaq "Damascus", judda/jidda "Jedda", jibrīl (and
-- variants) "Gabriel", makka "Mecca", etc.
-- jibriyāʾ "pride"
-- kibriyāʾ "glory, pride"
-- babbaḡāʾ "parrot"
-- ʿayāyāʾ "incapable, tired"
-- suwaidāʾ "black bile, melancholy"
-- Note also: ʾajhar "day-blind" (color-defect) and ʾajhar "louder" (elative)
function export.detect_type(stem, isfem, num, pos)
	-- Returns one of the declension codes "d", "an", "awnp", "sfp", "smp",
	-- "tri", "di", "inv" or "lwinv", or whatever detect_in_type() returns for
	-- stems ending in -in. Raises an error for a dual stem that does not end
	-- in -ān(i) and for a stem ending in a bare fatḥatan. The checks below are
	-- order-dependent: earlier branches shadow later ones.

	-- Record a tracking hit for WORD and, when POS is known, for WORD/POS.
	-- Always returns true so that it can be chained with "and" inside the
	-- pattern tests below. POS may be nil when this function is called with
	-- only three arguments (e.g. by an external caller); in that case the
	-- POS-specific tracking is skipped instead of raising a concatenation
	-- error.
	local function dotrack(word)
		track(word)
		if pos then
			track(word .. "/" .. pos)
		end
		return true
	end
	-- Not strictly necessary because the caller (stem_and_type) already
	-- reorders, but won't hurt, and may be necessary if this function is
	-- called from an external caller.
	stem = reorder_shadda(stem)
	local origstem = stem
	-- So that we don't get tripped up by alif madda, we replace alif madda
	-- with the sequence hamza + fatḥa + alif before the regexps below.
	stem = rsub(stem, AMAD, HAMZA .. AA)
	if num == "du" then
		if rfind(stem, ALIF .. NI .. "?$") then
			return "d"
		else
			error("Malformed stem for dual, should end in the nominative dual ending -ān(i): '" .. origstem .. "'")
		end
	end
	if rfind(stem, IN .. "$") then -- -in words
		return detect_in_type(stem, num == "pl")
	elseif rfind(stem, AN .. "[" .. ALIF .. AMAQ .. "]$") then
		return "an"
	elseif rfind(stem, AN .. "$") then
		error("Malformed stem, fatḥatan should be over second-to-last letter: " .. origstem)
	elseif num == "pl" and rfind(stem, AW .. SKOPT .. N .. AOPT .. "$") then
		return "awnp"
	elseif num == "pl" and rfind(stem, ALIF .. T .. UNOPT .. "$") and
		-- Avoid getting tripped up by plurals like ʾawqāt "times",
		-- ʾaḥwāt "fishes", ʾabyāt "verses", ʾazyāt "oils", ʾaṣwāt "voices",
		-- ʾamwāt "dead (pl.)".
		not rfind(stem, HAMZA_ON_ALIF .. A .. CONS .. SK .. CONS .. AAT .. UNOPT .. "$") then
		return "sfp"
	elseif num == "pl" and rfind(stem, W .. N .. AOPT .. "$") and
		-- Avoid getting tripped up by plurals like ʿuyūn "eyes",
		-- qurūn "horns" (note we check for U between first two consonants
		-- so we correctly ignore cases like sinūn "years" (from sana),
		-- riʾūn "lungs" (from riʾa) and banūn "sons" (from ibn).
		not rfind(stem, "^" .. CONS .. U .. CONS .. UUN .. AOPT .. "$") then
		return "smp"
	elseif rfind(stem, UN .. "$") then -- explicitly specified triptotes (we catch sound feminine plurals above)
		return "tri"
	elseif rfind(stem, U .. "$") then -- explicitly specified diptotes
		return "di"
	elseif -- num == "pl" and
		( -- various diptote plural patterns; these are diptote even in the singular (e.g. yanāyir "January", falāfil "falafel", tuʾabāʾ "yawn, fatigue"
		  -- currently we sometimes end up with such plural patterns in the "singular" in a singular
		  -- ʾidāfa construction with plural modifier. (FIXME: These should be fixed to the correct number.)
		rfind(stem, "^" .. CONS .. AOPT .. CONS .. AOPTA .. CONS .. IOPT .. Y .. "?" .. CONS .. "$") and dotrack("fawaakih") or -- fawākih, daqāʾiq, makātib, mafātīḥ
		rfind(stem, "^" .. CONS .. AOPT .. CONS .. AOPTA .. CONS .. SH .. "$")
			and not rfind(stem, "^" .. T) and dotrack("mawaadd") or -- mawādd, maqāmm, ḍawāll; exclude t- so we don't catch form-VI verbal nouns like taḍādd (HACK!!!)
		rfind(stem, "^" .. CONS .. U .. CONS .. AOPT .. CONS .. AOPTA .. HAMZA .. "$") and dotrack("wuzaraa") or -- wuzarāʾ "ministers", juhalāʾ "ignorant (pl.)"
		rfind(stem, ELCD_START .. SKOPT .. CONS .. IOPT .. CONS .. AOPTA .. HAMZA .. "$") and dotrack("asdiqaa") or -- ʾaṣdiqāʾ
		rfind(stem, ELCD_START .. IOPT .. CONS .. SH .. AOPTA .. HAMZA .. "$") and dotrack("aqillaa") -- ʾaqillāʾ, ʾajillāʾ "important (pl.)", ʾaḥibbāʾ "lovers"
		) then
		return "di"
	elseif num == "sg" and ( -- diptote singular patterns (nouns/adjectives)
		rfind(stem, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. HAMZA .. "$") and dotrack("qamraa") or -- qamrāʾ "moon-white, moonlight"; baydāʾ "desert"; ṣaḥrāʾ "desert-like, desert"; tayhāʾ "trackless, desolate region"; not pl. to avoid catching e.g. ʾabnāʾ "sons", ʾaḥmāʾ "fathers-in-law", ʾamlāʾ "steppes, deserts" (pl. of malan), ʾanbāʾ "reports" (pl. of nabaʾ)
		rfind(stem, ELCD_START .. SK .. CONS .. A .. CONS .. "$") and dotrack("abyad") or -- ʾabyaḍ "white", ʾakbar "greater"; FIXME nouns like ʾaʿzab "bachelor", ʾaḥmad "Ahmed" but not ʾarnab "rabbit", ʾanjar "anchor", ʾabjad "abjad", ʾarbaʿ "four", ʾandar "threshing floor" (cf. diptote ʾandar "rarer")
		rfind(stem, ELCD_START .. A .. CONS .. SH .. "$") and dotrack("alaff") or -- ʾalaff "plump", ʾaḥabb "more desirable"
		-- do the following on the origstem so we can check specifically for alif madda
		rfind(origstem, "^" .. AMAD .. CONS .. A .. CONS .. "$") and dotrack("aalam") -- ʾālam "more painful", ʾāḵar "other"
		) then
		return "di"
	elseif num == "sg" and pos == "adjective" and ( -- diptote singular patterns (adjectives)
		rfind(stem, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. N .. "$") and dotrack("kaslaan") or -- kaslān "lazy", ʿaṭšān "thirsty", jawʿān "hungry", ḡaḍbān "angry", tayhān "wandering, perplexed"; but not nouns like qaṭrān "tar", šayṭān "devil", mawtān "plague", maydān "square"
		-- rfind(stem, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. N .. "$") and dotrack("laffaa") -- excluded because of too many false positives e.g. ḵawwān "disloyal", not to mention nouns like jannān "gardener"; only diptote example I can find is ʿayyān "incapable, weary" (diptote per Lane but not Wehr)
		rfind(stem, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. HAMZA .. "$") and dotrack("laffaa") -- laffāʾ "plump (fem.)"; but not nouns like jarrāʾ "runner", ḥaddāʾ "camel driver", lawwāʾ "wryneck"
		) then
		return "di"
	elseif rfind(stem, AMAQ .. "$") then -- kaslā, ḏikrā (spelled with alif maqṣūra)
		return "inv"
	elseif rfind(stem, "[" .. ALIF .. SK .. "]" .. Y .. AOPTA .. "$") then -- dunyā, hadāyā (spelled with tall alif after yāʾ)
		return "inv"
	elseif rfind(stem, ALIF .. "$") then -- kāmērā, lībiyā (spelled with tall alif; we catch dunyā and hadāyā above)
		return "lwinv"
	elseif rfind(stem, II .. "$") then -- cases like كُوبْرِي kubrī "bridge" and صَوَانِي ṣawānī pl. of ṣīniyya; modern words that would probably end with -in
		dotrack("ii")
		return "inv"
	elseif rfind(stem, UU .. "$") then -- FIXME: Does this occur? Check the tracking
		dotrack("uu")
		return "inv"
	else
		return "tri"
	end
end
-- Canonicalize a word-final hamza. Three substitutions are applied in order:
--   1. a final alif madda becomes HAMZA_PH followed by fatḥa + alif;
--   2. a final hamza (any seat), optionally followed by -un, -u or -in,
--      becomes HAMZA_PH followed by that same ending;
--   3. a final hamza followed by -an plus alif or alif maqṣūra becomes
--      HAMZA_PH followed by that same ending.
-- The placeholder can later be turned back into a properly seated hamza with
-- hamza_seat().
function canon_hamza(word)
	local final_madda = AMAD .. "$"
	local hamza_plus_ending = HAMZA_ANY .. "([" .. UN .. U .. IN .. "]?)$"
	local hamza_plus_an = HAMZA_ANY .. "(" .. AN .. "[" .. ALIF .. AMAQ .. "])$"

	local canonical = rsub(word, final_madda, HAMZA_PH .. AA)
	canonical = rsub(canonical, hamza_plus_ending, HAMZA_PH .. "%1")
	canonical = rsub(canonical, hamza_plus_an, HAMZA_PH .. "%1")
	return canonical
end
-- Resolve a placeholder hamza (HAMZA_PH) in WORD. Words without a placeholder
-- are returned unchanged, wrapped in a one-element array; this shortcut
-- avoids running the substitutions in ar_utilities.process_hamza() when
-- there is nothing to do. Otherwise the result of process_hamza() is
-- returned as-is.
function hamza_seat(word)
	if not rfind(word, HAMZA_PH) then
		return {word}
	end
	return ar_utilities.process_hamza(word)
end
--[[
-- Supply the appropriate hamza seat for a placeholder hamza in a combined
-- Arabic/translation expression.
function split_and_hamza_seat(word)
if rfind(word, HAMZA_PH) then -- optimization to avoid many regexp substs
local ar, tr = split_arabic_tr(word)
-- FIXME: Do something with all values returned
ar = ar_utilities.process_hamza(ar)[1]
return ar .. "/" .. tr
end
return word
end
--]]
-- Return stem and type of an argument given the singular stem and whether
-- this is a plural argument. WORD may be of the form ARABIC, ARABIC/TR,
-- ARABIC:TYPE, ARABIC/TR:TYPE, or TYPE, for Arabic stem ARABIC with
-- transliteration TR and of type (i.e. declension) TYPE. If the type
-- is omitted, it is auto-detected using detect_type(). If the transliteration
-- is omitted, it is auto-transliterated from the Arabic. If only the type
-- is present, it is one of the inference codes handled below ("intf",
-- "elf", "cdf", "rf", "f", "rm", "m", "fp", "sp", "p", "paucp", "d", "sfp",
-- "smp", "cdp", "awnp" or "?"), in which case the stem and translit are
-- generated from the singular by regular rules. SG may be of the form
-- ARABIC/TR or ARABIC. SGTYPE is the declension type of the singular; it
-- steers some of the inferences (e.g. "cd" selects the color/defect
-- feminine and plural) and a value of "?" is propagated to every inferred
-- form. ISFEM is true if WORD is a feminine stem. NUM is either "sg", "du"
-- or "pl" according to the number of the stem.
--
-- Two values are returned: the stem in the ARABIC/TR format (with shaddas
-- reordered in the Arabic) and the declension type. For an unknown form the
-- pair "", "?" is returned. An error is raised if an inference code is used
-- with the wrong number/gender, if the singular does not have the shape the
-- inference requires, or if WORD contains more than one colon.
--
-- POS is the part of speech, generally "noun" or "adjective". Used to
-- distinguish nouns and adjectives of the فَعْلَان type. There are nouns of
-- this type and they generally are triptotes, e.g. قَطْرَان "tar"
-- and شَيْطَان "devil". An additional complication is that the user can set
-- the POS to something else, like "numeral". We don't use this POS for
-- modifiers, where we determine whether they are noun-like or adjective-like
-- according to whether mod_idafa= is true.
function export.stem_and_type(word, sg, sgtype, isfem, num, pos)
local rettype = nil
-- Split off an explicit ":TYPE" suffix, if any.
if rfind(word, ":") then
local split = rsplit(word, ":")
if #split > 2 then
error("More than one colon found in argument: '" .. word .. "'")
end
word, rettype = split[1], split[2]
end
local ar, tr = split_arabic_tr(word)
-- Need to reorder shaddas here so that shadda at the end of a stem
-- followed by ʾiʿrāb or a plural ending or whatever can get processed
-- correctly. This processing happens in various places so make sure
-- we return the reordered Arabic in all circumstances.
ar = reorder_shadda(ar)
local artr = ar .. "/" .. tr
-- Now return split-out ARABIC/TR and TYPE, with shaddas reordered in
-- the Arabic.
if rettype then
return artr, rettype
end
-- Likewise, do shadda reordering for the singular.
local sgar, sgtr = split_arabic_tr(sg)
sgar = reorder_shadda(sgar)
-- Apply a substitution to the singular Arabic and translit. If a
-- substitution could be made, return the combined ARABIC/TR with
-- substitutions made; else, return nil. The Arabic has ARFROM
-- replaced with ARTO, while the translit has TRFROM replaced with
-- TRTO; if that doesn't match, TRFROM2 is replaced with TRTO2, and if
-- that doesn't match either, TRFROM3 is replaced with TRTO3 (the second
-- and third pairs are optional). If the Arabic matches but none of the
-- translit patterns do, an error is raised, unless the translit contains
-- BOGUS_CHAR, in which case the translit is passed through unchanged.
local function sub(arfrom, arto, trfrom, trto, trfrom2, trto2, trfrom3, trto3)
if rfind(sgar, arfrom) then
local arret = rsub(sgar, arfrom, arto)
local trret = sgtr
if rfind(sgtr, trfrom) then
trret = rsub(sgtr, trfrom, trto)
elseif trfrom2 and rfind(sgtr, trfrom2) then
trret = rsub(sgtr, trfrom2, trto2)
elseif trfrom3 and rfind(sgtr, trfrom3) then
trret = rsub(sgtr, trfrom3, trto3)
elseif not rfind(sgtr, BOGUS_CHAR) then
error("Transliteration '" .. sgtr .."' does not have same ending as Arabic '" .. sgar .. "'")
end
return arret .. "/" .. trret
else
return nil
end
end
-- Reject inference codes used with the wrong number or gender.
if (num ~= "sg" or not isfem) and (word == "elf" or word == "cdf" or word == "intf" or word == "rf" or word == "f") then
error("Inference of form for inflection type '" .. word .. "' only allowed in singular feminine")
end
if num ~= "du" and word == "d" then
error("Inference of form for inflection type '" .. word .. "' only allowed in dual")
end
if num ~= "pl" and (word == "sfp" or word == "smp" or word == "awnp" or word == "cdp" or word == "sp" or word == "fp" or word == "p") then
error("Inference of form for inflection type '" .. word .. "' only allowed in plural")
end
-- True if AR has the shape of an intensive adjective in -ān (CaCCān,
-- CaCʾān spelled with alif madda, or CaCCān with a geminate), optionally
-- followed by -u.
local function is_intensive_adj(ar)
return rfind(ar, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. N .. UOPT .. "$") or
rfind(ar, "^" .. CONS .. A .. CONS .. SK .. AMAD .. N .. UOPT .. "$") or
rfind(ar, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. N .. UOPT .. "$")
end
-- True if POS is "adjective" and AR has the shape of a feminine
-- color/defect adjective.
local function is_feminine_cd_adj(ar)
return pos == "adjective" and
(rfind(ar, "^" .. CONS .. A .. CONS .. SK .. CONS .. AOPTA .. HAMZA .. UOPT .. "$") or -- ḥamrāʾ/ʿamyāʾ/bayḍāʾ
rfind(ar, "^" .. CONS .. A .. CONS .. SH .. AOPTA .. HAMZA .. UOPT .. "$") -- laffāʾ
)
end
-- True if AR has the shape of a masculine elative or color/defect
-- adjective (the two cannot be told apart by shape).
local function is_elcd_adj(ar)
return rfind(ar, ELCD_START .. SK .. CONS .. A .. CONS .. UOPT .. "$") or -- ʾabyaḍ "white", ʾakbar "greater"
rfind(ar, ELCD_START .. A .. CONS .. SH .. UOPT .. "$") or -- ʾalaff "plump", ʾaqall "fewer"
rfind(ar, ELCD_START .. SK .. CONS .. AAMAQ .. "$") or -- ʾaʿmā "blind", ʾadnā "lower"
rfind(ar, "^" .. AMAD .. CONS .. A .. CONS .. UOPT .. "$") -- ʾālam "more painful", ʾāḵar "other"
end
if word == "?" or
(rfind(word, "^[a-z][a-z]*$") and sgtype == "?") then
--if 'word' is a type, actual value inferred from sg; if sgtype is ?,
--propagate it to all derived types
return "", "?"
end
-- Feminine of an intensive adjective in -ān: replace -ān with alif maqṣūra
-- (or with alif madda alone when the stem ends in -ʾān).
if word == "intf" then
if not is_intensive_adj(sgar) then
error("Singular stem not in CACCān form: " .. sgar)
end
local ret = (
sub(AMAD .. N .. UOPT .. "$", AMAD, "nu?$", "") or -- ends in -ʾān
sub(AOPTA .. N .. UOPT .. "$", AMAQ, "nu?$", "") -- ends in -ān
)
return ret, "inv"
end
-- Feminine of an elative adjective. The patterns are tried in order, so
-- the more specific hollow-root pattern must come before the general one.
if word == "elf" then
local ret = (
sub(ELCD_START .. SK .. "[" .. Y .. W .. "]" .. A .. CONSPAR .. UOPT .. "$",
"%1" .. UU .. "%2" .. AMAQ, "ʔa(.)[yw]a(.)u?", "%1ū%2ā") or -- ʾajyad
sub(ELCD_START .. SK .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
"%1" .. U .. "%2" .. SK .. "%3" .. AMAQ, "ʔa(.)(.)a(.)u?", "%1u%2%3ā") or -- ʾakbar
sub(ELCD_START .. A .. CONSPAR .. SH .. UOPT .. "$",
"%1" .. U .. "%2" .. SH .. AMAQ, "ʔa(.)a(.)%2u?", "%1u%2%2ā") or -- ʾaqall
sub(ELCD_START .. SK .. CONSPAR .. AAMAQ .. "$",
"%1" .. U .. "%2" .. SK .. Y .. ALIF, "ʔa(.)(.)ā", "%1u%2yā") or -- ʾadnā
sub("^" .. AMAD .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
HAMZA_ON_ALIF .. U .. "%1" .. SK .. "%2" .. AMAQ, "ʔā(.)a(.)u?", "ʔu%1%2ā") -- ʾālam "more painful", ʾāḵar "other"
)
if not ret then
error("Singular stem not an elative adjective: " .. sgar)
end
return ret, "inv"
end
-- Feminine of a color/defect adjective.
if word == "cdf" then
local ret = (
sub(ELCD_START .. SK .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
"%1" .. A .. "%2" .. SK .. "%3" .. AA .. HAMZA, "ʔa(.)(.)a(.)u?", "%1a%2%3āʔ") or -- ʾaḥmar
sub(ELCD_START .. A .. CONSPAR .. SH .. UOPT .. "$",
"%1" .. A .. "%2" .. SH .. AA .. HAMZA, "ʔa(.)a(.)%2u?", "%1a%2%2āʔ") or -- ʾalaff
sub(ELCD_START .. SK .. CONSPAR .. AAMAQ .. "$",
"%1" .. A .. "%2" .. SK .. Y .. AA .. HAMZA, "ʔa(.)(.)ā", "%1a%2yāʔ") -- ʾaʿmā
)
if not ret then
error("Singular stem not a color/defect adjective: " .. sgar)
end
return ret, "cd" -- so plural will be correct
end
-- Regular feminine -- add ة, possibly with stem modifications
if word == "rf" then
sgar = canon_hamza(sgar)
if rfind(sgar, TAM .. UNUOPT .. "$") then
--Don't do this or we have problems when forming singulative from
--collective with a construct modifier that's feminine
--error("Singular stem is already feminine: " .. sgar)
return sgar .. "/" .. sgtr, "tri"
end
local ret = (
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AAH, "an$", "āh") or -- ends in -an
sub(IN .. "$", IY .. AH, "in$", "iya") or -- ends in -in
sub(AOPT .. "[" .. ALIF .. AMAQ .. "]$", AAH, "ā$", "āh") or -- ends in alif or alif maqṣūra
-- We separate the ʾiʿrāb and no-ʾiʿrāb cases even though we can
-- do a single Arabic regexp to cover both because we want to
-- remove u(n) from the translit only when ʾiʿrāb is present to
-- lessen the risk of removing -un in the actual stem. We also
-- allow for cases where the ʾiʿrāb is present in Arabic but not
-- in translit.
sub(UNU .. "$", AH, "un?$", "a", "$", "a") or -- anything else + -u(n)
sub("$", AH, "$", "a") -- anything else
)
return ret, "tri"
end
-- Generic feminine: dispatch to the specific feminine inference that
-- matches the singular's declension type and shape.
if word == "f" then
if sgtype == "cd" then
return export.stem_and_type("cdf", sg, sgtype, true, "sg", pos)
elseif sgtype == "el" then
return export.stem_and_type("elf", sg, sgtype, true, "sg", pos)
elseif sgtype =="di" and is_intensive_adj(sgar) then
return export.stem_and_type("intf", sg, sgtype, true, "sg", pos)
elseif sgtype == "di" and is_elcd_adj(sgar) then
-- If form is elative or color-defect, we don't know which of
-- the two it is, and each has a special feminine which isn't
-- the regular "just add ة", so shunt to unknown. This will
-- ensure that ?'s appear in place of the inflection -- also
-- for dual and plural.
return export.stem_and_type("?", sg, sgtype, true, "sg", pos)
else
return export.stem_and_type("rf", sg, sgtype, true, "sg", pos)
end
end
-- Regular masculine -- the inverse of "rf": strip ة, possibly with stem
-- modifications.
if word == "rm" then
sgar = canon_hamza(sgar)
--Don't do this or we have problems when forming collective from
--singulative with a construct modifier that's not feminine,
--e.g. شَجَرَة التُفَّاح
--if not rfind(sgar, TAM .. UNUOPT .. "$") then
--	error("Singular stem is not feminine: " .. sgar)
--end
local ret = (
sub(AAH .. UNUOPT .. "$", AN .. AMAQ, "ātun?$", "an", "ā[ht]$", "an") or -- in -āh
sub(IY .. AH .. UNUOPT .. "$", IN, "iyatun?$", "in", "iya$", "in") or -- ends in -iya
sub(AOPT .. TAM .. UNUOPT .. "$", "", "atun?$", "", "a$", "") or --ends in -a
sub("$", "", "$", "") -- do nothing
)
return ret, "tri"
end
-- Generic masculine: currently always the regular masculine.
if word == "m" then
-- FIXME: handle cd (color-defect)
-- FIXME: handle el (elative)
-- FIXME: handle int (intensive)
return export.stem_and_type("rm", sg, sgtype, false, "sg", pos)
end
-- The plural used for feminine adjectives. If the singular type is
-- color/defect or it looks like a feminine color/defect adjective,
-- use color/defect plural. Otherwise shunt to sound feminine plural.
if word == "fp" then
if sgtype == "cd" or is_feminine_cd_adj(sgar) then
return export.stem_and_type("cdp", sg, sgtype, true, "pl", pos)
else
return export.stem_and_type("sfp", sg, sgtype, true, "pl", pos)
end
end
-- Sound plural: color/defect plural for color/defect singulars, sound
-- feminine plural for feminines, -awna plural for singulars of type "an",
-- else sound masculine plural.
if word == "sp" then
if sgtype == "cd" then
return export.stem_and_type("cdp", sg, sgtype, isfem, "pl", pos)
elseif isfem then
return export.stem_and_type("sfp", sg, sgtype, true, "pl", pos)
elseif sgtype == "an" then
return export.stem_and_type("awnp", sg, sgtype, false, "pl", pos)
else
return export.stem_and_type("smp", sg, sgtype, false, "pl", pos)
end
end
-- Conservative plural, as used for masculine plural adjectives.
-- If singular type is color-defect, shunt to color-defect plural; else
-- shunt to unknown, so ? appears in place of the inflections.
if word == "p" then
if sgtype == "cd" then
return export.stem_and_type("cdp", sg, sgtype, isfem, "pl", pos)
else
return export.stem_and_type("?", sg, sgtype, isfem, "pl", pos)
end
end
-- Special plural used for paucal plurals of singulatives. If ends in -ة
-- (most common), use strong feminine plural; if ends with -iyy (next
-- most common), use strong masculine plural; else default to "p"
-- (conservative plural).
if word == "paucp" then
if rfind(sgar, TAM .. UNUOPT .. "$") then
return export.stem_and_type("sfp", sg, sgtype, true, "pl", pos)
elseif rfind(sgar, IY .. SH .. UNUOPT .. "$") then
return export.stem_and_type("smp", sg, sgtype, false, "pl", pos)
else
return export.stem_and_type("p", sg, sgtype, isfem, "pl", pos)
end
end
-- Dual in -ān, possibly with stem modifications
if word == "d" then
sgar = canon_hamza(sgar)
local ret = (
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AY .. AAN, "an$", "ayān") or -- ends in -an
sub(IN .. "$", IY .. AAN, "in$", "iyān") or -- ends in -in
sgtype == "lwinv" and sub(AOPTA .. "$", AT .. AAN, "[āa]$", "atān") or -- lwinv, ends in alif; allow translit with short -a
sub(AOPT .. "[" .. ALIF .. AMAQ .. "]$", AY .. AAN, "ā$", "ayān") or -- ends in alif or alif maqṣūra
-- We separate the ʾiʿrāb and no-ʾiʿrāb cases even though we can
-- do a single Arabic regexp to cover both because we want to
-- remove u(n) from the translit only when ʾiʿrāb is present to
-- lessen the risk of removing -un in the actual stem. We also
-- allow for cases where the ʾiʿrāb is present in Arabic but not
-- in translit.
--
-- NOTE: Collapsing the "h$" and "$" cases into "h?$" doesn't work
-- in the case of words ending in -āh, which end up having the
-- translit end in -tāntān.
sub(TAM .. UNU .. "$", T .. AAN, "[ht]un?$", "tān", "h$", "tān", "$", "tān") or -- ends in tāʾ marbuṭa + -u(n)
sub(TAM .. "$", T .. AAN, "h$", "tān", "$", "tān") or -- ends in tāʾ marbuṭa
-- Same here as above
sub(UNU .. "$", AAN, "un?$", "ān", "$", "ān") or -- anything else + -u(n)
sub("$", AAN, "$", "ān") -- anything else
)
return ret, "d"
end
-- Strong feminine plural in -āt, possibly with stem modifications
if word == "sfp" then
sgar = canon_hamza(sgar)
-- Also canonicalize a hamza/alif madda that directly precedes a final
-- tāʾ marbūṭa (canon_hamza() only looks at the very end of the word).
sgar = rsub(sgar, AMAD .. "(" .. TAM .. UNUOPT .. ")$", HAMZA_PH .. AA .. "%1")
sgar = rsub(sgar, HAMZA_ANY .. "(" .. AOPT .. TAM .. UNUOPT .. ")$", HAMZA_PH .. "%1")
local ret = (
sub(AOPTA .. TAM .. UNUOPT .. "$", AYAAT, "ā[ht]$", "ayāt", "ātun?$", "ayāt") or -- ends in -āh
sub(AOPT .. TAM .. UNUOPT .. "$", AAT, "a$", "āt", "atun?$", "āt") or -- ends in -a
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AYAAT, "an$", "ayāt") or -- ends in -an
sub(IN .. "$", IY .. AAT, "in$", "iyāt") or -- ends in -in
sgtype == "inv" and (
sub(AOPT .. "[" .. ALIF .. AMAQ .. "]$", AYAAT, "ā$", "ayāt") -- ends in alif or alif maqṣūra
) or
sgtype == "lwinv" and (
sub(AOPTA .. "$", AAT, "[āa]$", "āt") -- loanword ending in tall alif; allow translit with short -a
) or
-- We separate the ʾiʿrāb and no-ʾiʿrāb cases even though we can
-- do a single Arabic regexp to cover both because we want to
-- remove u(n) from the translit only when ʾiʿrāb is present to
-- lessen the risk of removing -un in the actual stem. We also
-- allow for cases where the ʾiʿrāb is present in Arabic but not
-- in translit.
sub(UNU .. "$", AAT, "un?$", "āt", "$", "āt") or -- anything else + -u(n)
sub("$", AAT, "$", "āt") -- anything else
)
return ret, "sfp"
end
-- Strong masculine plural in -ūn
if word == "smp" then
sgar = canon_hamza(sgar)
local ret = (
sub(IN .. "$", UUN, "in$", "ūn") or -- ends in -in
-- See comments above for why we have two cases, one for UNU and
-- one for non-UNU
sub(UNU .. "$", UUN, "un?$", "ūn", "$", "ūn") or -- anything else + -u(n)
sub("$", UUN, "$", "ūn") -- anything else
)
return ret, "smp"
end
-- Color/defect plural; singular must be masculine or feminine
-- color/defect adjective
if word == "cdp" then
local ret = (
sub(ELCD_START .. SK .. W .. A .. CONSPAR .. UOPT .. "$",
"%1" .. UU .. "%2", "ʔa(.)wa(.)u?", "%1ū%2") or -- ʾaswad
sub(ELCD_START .. SK .. Y .. A .. CONSPAR .. UOPT .. "$",
"%1" .. II .. "%2", "ʔa(.)ya(.)u?", "%1ī%2") or -- ʾabyaḍ
sub(ELCD_START .. SK .. CONSPAR .. A .. CONSPAR .. UOPT .. "$",
"%1" .. U .. "%2" .. SK .. "%3", "ʔa(.)(.)a(.)u?", "%1u%2%3") or -- ʾaḥmar
sub(ELCD_START .. A .. CONSPAR .. SH .. UOPT .. "$",
"%1" .. U .. "%2" .. SH, "ʔa(.)a(.)%2u?", "%1u%2%2") or -- ʾalaff
sub(ELCD_START .. SK .. CONSPAR .. AAMAQ .. "$",
"%1" .. U .. "%2" .. Y, "ʔa(.)(.)ā", "%1u%2y") or -- ʾaʿmā
sub("^" .. CONSPAR .. A .. W .. SKOPT .. CONSPAR .. AA .. HAMZA .. UOPT .. "$", "%1" .. UU .. "%2", "(.)aw(.)āʔu?", "%1ū%2") or -- sawdāʾ
sub("^" .. CONSPAR .. A .. Y .. SKOPT .. CONSPAR .. AA .. HAMZA .. UOPT .. "$", "%1" .. II .. "%2", "(.)ay(.)āʔu?", "%1ī%2") or -- bayḍāʾ
sub("^" .. CONSPAR .. A .. CONSPAR .. SK .. CONSPAR .. AA .. HAMZA .. UOPT .. "$", "%1" .. U .. "%2" .. SK .. "%3", "(.)a(.)(.)āʔu?", "%1u%2%3") or -- ḥamrāʾ/ʿamyāʾ
sub("^" .. CONSPAR .. A .. CONSPAR .. SH .. AA .. HAMZA .. UOPT .. "$", "%1" .. U .. "%2" .. SH, "(.)a(.)%2āʔu?", "%1u%2%2") -- laffāʾ
)
if not ret then
error("For 'cdp', singular must be masculine or feminine color/defect adjective: " .. sgar)
end
return ret, "tri"
end
-- Sound plural in -awna; singular must end in -an
if word == "awnp" then
local ret = (
sub(AN .. "[" .. ALIF .. AMAQ .. "]$", AWSK .. N, "an$", "awn") -- ends in -an
)
if not ret then
error("For 'awnp', singular must end in -an: " .. sgar)
end
return ret, "awnp"
end
-- WORD is not an inference code: treat it as an actual stem and
-- auto-detect its declension type.
return artr, export.detect_type(ar, isfem, num, pos)
end
-- Separators used when joining alternative forms for display: OUTERSEP
-- between top-level alternatives and INNERSEP between the variants grouped
-- inside one alternative (see the table.concat() calls in show_form_1()).
-- An earlier version of the outer separator, kept for reference:
-- local outersep = " <small style=\"color: #888\">or</small> "
-- need LRM here so multiple Arabic plurals end up agreeing in order with
-- the transliteration
local outersep = LRM .. "; "
local innersep = LRM .. "/"
-- Subfunction of show_form(), used to implement recursively generating
-- all combinations of elements from FORM and from each of the items in
-- LIST_OF_MODS, both of which are either arrays of strings or arrays of
-- arrays of strings, where the strings are in the form ARABIC/TRANSLIT,
-- as described in show_form(). TRAILING_ARTRMODS is an array of ARTRMOD
-- items, each of which is a two-element array of ARMOD (Arabic) and TRMOD
-- (transliteration), accumulating all of the suffixes generated so far
-- in the recursion process. Each time we recur we take the last MOD item
-- of LIST_OF_MODS, separate each element in MOD into its Arabic and
-- Latin parts and to each Arabic/Latin pair we append, separately, each
-- element in TRAILING_ARTRMODS, passing the newly generated list of ARTRMOD
-- items down the next recursion level with the shorter list of modifiers.
-- LIST_OF_MODS itself is not modified. If USE_PARENS is true, each
-- transliteration is placed in parentheses directly after its Arabic;
-- otherwise all the Arabic comes first, then "<br />", then all the
-- transliterations. We end up returning a string to insert into the
-- Wiki-markup table.
function show_form_1(form, list_of_mods, trailing_artrmods, use_parens)
	if #list_of_mods == 0 then
		local arabicvals = {}
		local latinvals = {}
		local parenvals = {}
		-- Accumulate separately the Arabic and transliteration into
		-- ARABICVALS and LATINVALS, then concatenate each down below.
		-- However, if USE_PARENS, we put each transliteration directly
		-- after the corresponding Arabic, in parens, and put the results
		-- in PARENVALS, which get concatenated below. (This is used in the
		-- title of the declension table.)
		for _, artrmod in ipairs(trailing_artrmods) do
			assert(#artrmod == 2)
			local armod = artrmod[1]
			local trmod = artrmod[2]
			for _, subform in ipairs(form) do
				local ar_subspan, tr_subspan
				local ar_subspans = {}
				local tr_subspans = {}
				if type(subform) ~= "table" then
					subform = {subform}
				end
				for _, subsubform in ipairs(subform) do
					local arabic, translit = split_arabic_tr(subsubform)
					if arabic == "-" then
						ar_subspan = "—"
						tr_subspan = "—"
					else
						-- A transliteration containing BOGUS_CHAR is shown
						-- as "?" instead of being tagged.
						tr_subspan = (rfind(translit, BOGUS_CHAR) or rfind(trmod, BOGUS_CHAR)) and "?" or
							require("Module:script utilities").tag_translit(translit .. trmod, lang, "default", 'style="color: #888;"')
						-- implement elision of al- after vowel
						tr_subspan = rsub(tr_subspan, "([aeiouāēīōū][ %-])a([sšṣtṯṭdḏḍzžẓnrḷl]%-)", "%1%2")
						tr_subspan = rsub(tr_subspan, "([aeiouāēīōū][ %-])a(llāh)", "%1%2")
						ar_subspan = m_links.full_link({lang = lang, term = arabic .. armod, tr = "-"})
					end
					insert_if_not(ar_subspans, ar_subspan)
					insert_if_not(tr_subspans, tr_subspan)
				end
				local ar_span = table.concat(ar_subspans, innersep)
				local tr_span = table.concat(tr_subspans, innersep)
				if use_parens then
					table.insert(parenvals, ar_span .. " (" .. tr_span .. ")")
				else
					table.insert(arabicvals, ar_span)
					table.insert(latinvals, tr_span)
				end
			end
		end
		if use_parens then
			return table.concat(parenvals, outersep)
		else
			local arabic_span = table.concat(arabicvals, outersep)
			local latin_span = table.concat(latinvals, outersep)
			return arabic_span .. "<br />" .. latin_span
		end
	else
		-- Peel off the last modifier without mutating the caller's array.
		local mod_count = #list_of_mods
		local last_mods = list_of_mods[mod_count]
		local remaining_mods = {}
		for i = 1, mod_count - 1 do
			remaining_mods[i] = list_of_mods[i]
		end
		local artrmods = {}
		for _, mod in ipairs(last_mods) do
			if type(mod) ~= "table" then
				mod = {mod}
			end
			for _, submod in ipairs(mod) do
				local armod, trmod = split_arabic_tr(submod)
				-- If the value is -, we need to create a blank entry
				-- rather than skipping it; if we have no entries at any
				-- level, then there will be no overall entries at all
				-- because the inside of the loop at the next level will
				-- never be executed.
				if armod == "-" then
					armod = ""
					trmod = ""
				end
				if armod ~= "" then armod = ' ' .. armod end
				if trmod ~= "" then trmod = ' ' .. trmod end
				for _, trailing_artrmod in ipairs(trailing_artrmods) do
					-- Build each combination from the unmodified ARMOD and
					-- TRMOD. Re-assigning them here would make every
					-- combination after the first also carry the suffixes
					-- of the preceding ones.
					table.insert(artrmods, {
						armod .. trailing_artrmod[1],
						trmod .. trailing_artrmod[2],
					})
				end
			end
		end
		return show_form_1(form, remaining_mods, artrmods, use_parens)
	end
end
-- Produce the string that fills one cell of a Wiki-markup declension table.
--
-- FORM holds the inflected forms of the base: an array of strings (e.g. the
-- different possible plurals) or an array of arrays of strings (the inner
-- level typically being hamza-spelling variants). LIST_OF_MODS is an array
-- with one MODS element per modifier; each MODS element has the same shape
-- as FORM and holds that modifier's inflected forms. Each string is
-- typically of the form "ARABIC/TRANSLIT". All combinations of one element
-- from each array are generated, which is done recursively by show_form_1().
-- USE_PARENS is passed through to show_form_1().
--
-- A missing or empty FORM yields an em dash; a FORM that is neither nil/false
-- nor a table raises an error.
function show_form(form, list_of_mods, use_parens)
	local dash = "—"
	if not form then
		return dash
	end
	if type(form) ~= "table" then
		error("a non-table value was given in the list of inflected forms.")
	end
	if #form == 0 then
		return dash
	end
	-- The recursion has to be seeded with one blank suffix pair rather than
	-- with an empty list; with no pairs at all, nothing would be propagated
	-- to the next recursion level.
	local seed = {{"", ""}}
	return show_form_1(form, list_of_mods, seed, use_parens)
end
-- Create a Wiki-markup table by filling the template WIKICODE with the
-- values in DATA. Parameter references appear as {{{PARAM}}} in the
-- wikicode. The formatted categories in DATA.categories are appended to the
-- result. Note that DATA.forms is modified: the forms of every state that is
-- not to be displayed are replaced by empty arrays.
function make_table(data, wikicode)
	-- Collect, for each modifier in mod_list, its forms for PARAM. A modifier
	-- with no forms contributes one blank element rather than none;
	-- otherwise nothing would be propagated from one recursion level of
	-- show_form() to the next.
	local function modifier_forms_for(param)
		local collected = {}
		for _, mod in ipairs(mod_list) do
			local mods = data.forms[mod .. "_" .. param]
			if not mods or #mods == 0 then
				mods = {""}
			end
			collected[#collected + 1] = mods
		end
		return collected
	end

	-- Replacement callback for rsub(): map a parameter name to its (HTML)
	-- value.
	local function repl(param)
		if param == "pos" then
			return data.pos
		end
		if param == "info" then
			if data.title then
				return " (" .. data.title .. ")"
			end
			return ""
		end
		if rfind(param, "type$") then
			return table.concat(data.forms[param] or {"—"}, outersep)
		end
		local list_of_mods = modifier_forms_for(param)
		return show_form(data.forms[param], list_of_mods, param == "lemma")
	end

	-- For states not in the list of those to be displayed, clear out the
	-- corresponding inflections so they appear as a dash.
	for _, state in ipairs(data.allstates) do
		local displayed = contains(data.states, state)
		if not displayed then
			for _, numgen in ipairs(data.numgens()) do
				for _, case in ipairs(data.allcases) do
					local key = case .. "_" .. numgen .. "_" .. state
					data.forms[key] = {}
				end
			end
		end
	end

	local filled = rsub(wikicode, "{{{([a-z_]+)}}}", repl)
	local categories = m_utilities.format_categories(data.categories, lang)
	return filled .. categories
end
-- Generate part of the noun table for a given number spec NUM (e.g. sg):
-- the three state headings (Indefinite, Definite, Construct) followed by one
-- row per case (Informal, Nominative, Accusative, Genitive), each row holding
-- the {{{CASE_NUM_STATE}}} placeholders for the three states. The returned
-- wikicode ends with a newline.
function generate_noun_num(num)
	local state_headings = {"Indefinite", "Definite", "Construct"}
	local state_codes = {"ind", "def", "con"}
	local case_rows = {
		{"Informal", "inf"},
		{"Nominative", "nom"},
		{"Accusative", "acc"},
		{"Genitive", "gen"},
	}

	local lines = {}
	for _, heading in ipairs(state_headings) do
		lines[#lines + 1] = '! style="background: #CDCDCD;" | ' .. heading
	end
	for _, row in ipairs(case_rows) do
		local label, case_code = row[1], row[2]
		lines[#lines + 1] = "|-"
		lines[#lines + 1] = '! style="background: #EFEFEF;" | ' .. label
		for _, state_code in ipairs(state_codes) do
			lines[#lines + 1] = "| {{{" .. case_code .. "_" .. num .. "_" .. state_code .. "}}}"
		end
	end
	return table.concat(lines, "\n") .. "\n"
end
-- Make the noun table: assemble the wikicode template for all numbers listed
-- in DATA.numbers and hand it to make_table() for substitution. The dual gets
-- a single heading row; every other number gets its capitalized English name
-- (from DATA.engnumberscap) plus a row showing the {{{NUM_type}}} placeholder.
function make_noun_table(data)
	local heading = '! style="background: #CDCDCD;" '
	local parts = {
		'<div class="NavFrame">\n',
		'<div class="NavHead">Declension of {{{pos}}} {{{lemma}}}</div>\n',
		'<div class="NavContent">\n',
		'{| class="inflection-table" style="border-width: 1px; border-collapse: collapse; background:#F9F9F9; text-align:center; width:100%;"\n',
	}
	for _, num in ipairs(data.numbers) do
		parts[#parts + 1] = "|-\n"
		if num == "du" then
			parts[#parts + 1] = heading .. "| Dual\n"
		else
			parts[#parts + 1] = heading .. "rowspan=2 | " .. data.engnumberscap[num] .. "\n"
			parts[#parts + 1] = heading .. "colspan=3 | {{{" .. num .. "_type}}}\n"
			parts[#parts + 1] = "|-\n"
		end
		parts[#parts + 1] = generate_noun_num(num)
	end
	parts[#parts + 1] = "|}\n</div>\n</div>"
	return make_table(data, table.concat(parts))
end
-- Generate part of the gendered-noun table for a given number spec NUM
-- (e.g. sg): a row with the three state headings (Indefinite, Definite,
-- Construct) given twice, once per gender, followed by one row per case
-- (Informal, Nominative, Accusative, Genitive). Each case row holds the
-- {{{CASE_m_NUM_STATE}}} placeholders followed by the {{{CASE_f_NUM_STATE}}}
-- ones. The returned wikicode ends with a newline.
function generate_gendered_noun_num(num)
	local state_headings = {"Indefinite", "Definite", "Construct"}
	local state_codes = {"ind", "def", "con"}
	local gender_codes = {"m", "f"}
	local case_rows = {
		{"Informal", "inf"},
		{"Nominative", "nom"},
		{"Accusative", "acc"},
		{"Genitive", "gen"},
	}

	local lines = {"|-"}
	for _ = 1, #gender_codes do
		for _, heading in ipairs(state_headings) do
			lines[#lines + 1] = '! style="background: #CDCDCD;" | ' .. heading
		end
	end
	for _, row in ipairs(case_rows) do
		local label, case_code = row[1], row[2]
		lines[#lines + 1] = "|-"
		lines[#lines + 1] = '! style="background: #EFEFEF;" | ' .. label
		for _, gender_code in ipairs(gender_codes) do
			for _, state_code in ipairs(state_codes) do
				lines[#lines + 1] = "| {{{" .. case_code .. "_" .. gender_code .. "_" .. num .. "_" .. state_code .. "}}}"
			end
		end
	end
	return table.concat(lines, "\n") .. "\n"
end
-- Make the gendered noun table: assemble the wikicode template for all
-- numbers listed in DATA.numbers and hand it to make_table() for
-- substitution. Every number gets Masculine and Feminine column headings;
-- numbers other than the dual are labelled with their capitalized English
-- name (from DATA.engnumberscap) and get an extra row with the
-- {{{m_NUM_type}}} and {{{f_NUM_type}}} placeholders.
function make_gendered_noun_table(data)
	local heading = '! style="background: #CDCDCD;" '
	local parts = {
		'<div class="NavFrame">\n',
		'<div class="NavHead">Declension of {{{pos}}} {{{lemma}}}</div>\n',
		'<div class="NavContent">\n',
		'{| class="inflection-table" style="border-width: 1px; border-collapse: collapse; background:#F9F9F9; text-align:center; width:100%;"\n',
	}
	for _, num in ipairs(data.numbers) do
		local is_dual = num == "du"
		parts[#parts + 1] = "|-\n"
		if is_dual then
			parts[#parts + 1] = heading .. "rowspan=2 | Dual\n"
		else
			parts[#parts + 1] = heading .. "rowspan=3 | " .. data.engnumberscap[num] .. "\n"
		end
		parts[#parts + 1] = heading .. "colspan=3 | Masculine\n"
		parts[#parts + 1] = heading .. "colspan=3 | Feminine\n"
		if not is_dual then
			parts[#parts + 1] = "|-\n"
			parts[#parts + 1] = heading .. "colspan=3 | {{{m_" .. num .. "_type}}}\n"
			parts[#parts + 1] = heading .. "colspan=3 | {{{f_" .. num .. "_type}}}\n"
		end
		parts[#parts + 1] = generate_gendered_noun_num(num)
	end
	parts[#parts + 1] = "|}\n</div>\n</div>"
	return make_table(data, table.concat(parts))
end
-- Build the wikitext rows of the adjective table for one number.
-- NUM is the number key (callers in this file pass "sg", "du" or "pl"); it is
-- spliced into every placeholder name, giving {{{CASE_GENDER_NUM_STATE}}} with
-- CASE in inf/nom/acc/gen, GENDER in m/f and STATE in ind/def.
-- Returns a string that starts with a "|-" row separator and ends with a
-- newline: one sub-header row (state labels, once per gender) followed by one
-- row per case.
function generate_adj_num(num)
	local states = {
		{ key = "ind", label = "Indefinite" },
		{ key = "def", label = "Definite" },
	}
	local cases = {
		{ key = "inf", label = "Informal" },
		{ key = "nom", label = "Nominative" },
		{ key = "acc", label = "Accusative" },
		{ key = "gen", label = "Genitive" },
	}
	local genders = { "m", "f" }

	local rows = { "|-\n" }

	-- Sub-header: the pair of state labels is repeated under each gender.
	for _ = 1, #genders do
		for _, state in ipairs(states) do
			rows[#rows + 1] = '! style="background: #CDCDCD;" | ' .. state.label .. "\n"
		end
	end

	-- One row per case: a label cell, then masculine cells, then feminine cells.
	for _, case in ipairs(cases) do
		rows[#rows + 1] = '|-\n! style="background: #EFEFEF;" | ' .. case.label .. "\n"
		for _, gender in ipairs(genders) do
			for _, state in ipairs(states) do
				rows[#rows + 1] = "| {{{" .. case.key .. "_" .. gender .. "_"
					.. num .. "_" .. state.key .. "}}}\n"
			end
		end
	end

	return table.concat(rows)
end
-- Assemble the wikitext template for the adjective declension table and hand
-- it, together with DATA, to make_table; returns make_table's result.
-- Sections are considered in the fixed order singular, dual, plural, and a
-- section is emitted only when contains(data.numbers, NUM) is true.
-- Singular and plural headings span three rows and include a row with the
-- {{{m_NUM_type}}} / {{{f_NUM_type}}} placeholders; the dual heading spans two
-- rows and has no type row. Body rows come from generate_adj_num.
function make_adj_table(data)
	local cell = '! style="background: #CDCDCD;" '
	-- Masculine/Feminine heading cells, each spanning the two state columns.
	local gender_row = cell .. "colspan=2 | Masculine\n"
		.. cell .. "colspan=2 | Feminine\n"

	local sections = {
		{ num = "sg", label = "Singular", has_type_row = true },
		{ num = "du", label = "Dual", has_type_row = false },
		{ num = "pl", label = "Plural", has_type_row = true },
	}

	local pieces = {
		'<div class="NavFrame">\n',
		'<div class="NavHead">Declension of {{{pos}}} {{{lemma}}}</div>\n',
		'<div class="NavContent">\n',
		'{| class="inflection-table" style="border-width: 1px; border-collapse: collapse; background:#F9F9F9; text-align:center; width:100%;"\n',
	}

	for _, section in ipairs(sections) do
		local num = section.num
		if contains(data.numbers, num) then
			local heading
			if section.has_type_row then
				heading = "|-\n"
					.. cell .. "rowspan=3 | " .. section.label .. "\n"
					.. gender_row
					.. "|-\n"
					.. cell .. "colspan=2 | {{{m_" .. num .. "_type}}}\n"
					.. cell .. "colspan=2 | {{{f_" .. num .. "_type}}}\n"
			else
				heading = "|-\n"
					.. cell .. "rowspan=2 | " .. section.label .. "\n"
					.. gender_row
			end
			pieces[#pieces + 1] = heading .. generate_adj_num(num)
		end
	end

	pieces[#pieces + 1] = "|}\n</div>\n</div>"
	return make_table(data, table.concat(pieces))
end
-- Return the `export` table as the value of this module chunk.
return export
-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet: