Jump to content

Module:User:Benwing2/ar-inflections

From Wiktionary, the free dictionary


local export = {}

--[=[

Authorship: Ben Wing <benwing2>

This module implements {{ar-verb form}}, which automatically generates the appropriate inflections of a non-lemma
verb form given the form and the conjugation spec for the verb (which is usually just the lemma and verb form; see
{{ar-conj}}/{{ar-verb}}).
]=]

local force_cat = false -- set to true for debugging

local m_links = require("Module:links")
local m_table = require("Module:table")
local m_form_of = require("Module:form of")
local m_string_utilities = require("Module:string utilities")
local accel_module = "Module:accel"
local headword_module = "Module:headword"
local ar_verb_module = "Module:ar-verb"
local ar_IPA_module = "Module:ar-IPA"
local IPA_module = "Module:IPA"
local parse_utilities_module = "Module:parse utilities"
local template_parser_module = "Module:template parser"
local utilities_module = "Module:utilities"
local lang = require("Module:languages").getByCode("ar")

local rsplit = m_string_utilities.split

local function track(page)
	require("Module:debug/track")("ar-inflections/" .. page)
	return true
end

local function split_on_comma(term)
	if not term then
		return nil
	end
	if term:find(",%s") then
		return require(parse_utilities_module).split_on_comma(term)
	else
		return rsplit(term, ",")
	end
end

local function get_pronun(form, tr)
	local pronun = require(ar_IPA_module).toIPA({ Arabic = form, tr = tr}, "noerror")
	if pronun == "" then
		return nil
	else
		return require(IPA_module).format_IPA(lang, "/" .. pronun .. "/")
	end
end

local function find_conjugations(lemma, verb_form, conjid, noconjid)
	local title = mw.title.new(lemma)
	if title then
		local content = title:getContent()
		if content then
			local arabic = require(utilities_module).get_section(content, "Arabic")
			if arabic then
				local conjs = {}
				local other_verb_forms = {}
				local ignored_ids = {}
				local saw_multiword_conj = 0
				local conjid_restrictions = split_on_comma(conjid)
				if conjid_restrictions then
					conjid_restrictions = m_table.listToSet(conjid_restrictions)
				end
				local noconjid_restrictions = split_on_comma(noconjid)
				if noconjid_restrictions then
					noconjid_restrictions = m_table.listToSet(noconjid_restrictions)
				end
				for tempname, args, template_invoc, _ in require(template_parser_module).findTemplates(arabic) do
					if tempname == "ar-conj" then
						local arg1 = args[1]
						if arg1 then
							arg1 = arg1:match("^<(.*)>$") or arg1
							if arg1:find("<") or arg1:find("%(%(") then
								mw.log(('find_conjugations("%s", "%s"): Skipping multiword conjugation: %s'):format(
									lemma, verb_form, template_invoc))
								saw_multiword_conj = saw_multiword_conj + 1
							else
								local arg1_verb_form = arg1:gsub("[/.-].*$", "")
								if arg1_verb_form == verb_form then
									local this_conjid = args.id
									local passes_id_restriction = true
									if this_conjid then
										if conjid_restrictions then
											passes_id_restriction = passes_id_restriction and
												conjid_restrictions[this_conjid]
										end
										if noconjid_restrictions then
											passes_id_restriction = passes_id_restriction and
												not noconjid_restrictions[this_conjid]
										end
									else
										passes_id_restriction = not conjid_restrictions
									end
									if passes_id_restriction then
										table.insert(conjs, {
											conj = ("%s<%s>"):format(lemma, arg1),
											gloss = args.t,
										})
									else
										m_table.insertIfNot(ignored_ids, this_conjid or "''no_ID''")
									end
								else
									m_table.insertIfNot(other_verb_forms, arg1_verb_form)
								end
							end
						end
					end
				end
				if conjs[1] then
					return conjs
				else
					local skipped_restrictions = {}
					if other_verb_forms[1] then
						table.insert(skipped_restrictions, ("found conjugation(s) for form%s %s"):format(
							#other_verb_forms > 1 and "s" or "", table.concat(other_verb_forms, ",")))
					end
					if saw_multiword_conj > 0 then
						table.insert(skipped_restrictions, ("skipped %s multiword conjugation%s"):format(
							saw_multiword_conj, saw_multiword_conj > 1 and "s" or ""))
					end
					if ignored_ids[1] then
						local id_restrictions = {}
						if conjid then
							table.insert(id_restrictions, ("conjid=%s"):format(conjid))
						end
						if noconjid then
							table.insert(id_restrictions, ("noconjid=%s"):format(noconjid))
						end
						table.insert(skipped_restrictions, ("ignored ID%s %s not passing ID restriction%s %s"):format(
							#ignored_ids > 1 and "'s" or "", table.concat(ignored_ids, ","),
							#id_restrictions > 1 and "s" or "",	table.concat(id_restrictions, " and ")))
					end
					skipped_restrictions = m_table.serialCommaJoin(skipped_restrictions, {punc = ";"})
					if #skipped_restrictions > 0 then
						skipped_restrictions = (" (but %s)"):format(skipped_restrictions)
					end
					return ("For Arabic lemma '%s', found Arabic section but couldn't find any verb conjugations when looking for form %s%s"):format(
						lemma, verb_form, skipped_restrictions)
				end
			else
				return ("For Arabic lemma '%s', page exists but has no Arabic section when looking for form %s"):format(
					lemma, verb_form)
			end
		else
			return ("For Arabic lemma '%s', couldn't fetch contents for page when looking for form %s; page may not exist"):format(
			lemma, verb_form)
		end
	else
		return ("Bad Arabic lemma '%s' when looking for form %s; couldn't create title object"):format(lemma, verb_form)
	end
end


local function generate_inflection_of(forms_and_tags, vforms, pagename, is_template_example)
	-- There are two approaches for combining tag sets and lemmas: Either we combine the lemmas first or the tag sets
	-- first. Combining the lemmas first means we look for all instances of a given tag set and combine all the lemmas
	-- with that tag set, and then, for each given set of lemmas, find all tag sets with that set of lemmas and run
	-- the multipart combination algorithm on them. Combining the tag sets first means that first, for each given
	-- lemma, we find all tag sets for that lemma and run the multipart combination algorithm on them, then for each
	-- resulting multipart tag set, we group all instances, and then finally group by set of lemmas. It's not clear
	-- which one results in fewer overall lines, so we do it both ways and see which one results in fewer lines.

	for _, form_and_tags in ipairs(forms_and_tags) do
		local by_lemma_sets_by_stage = {}
		local stages = { "lemmas-first", "tag-sets-first" }
		for _, stage in ipairs(stages) do
			if stage == "lemmas-first" then
				-- Within a given form, we have a collection of lemma+tag-set objects. Group first by tag set.
				local group_by_tag_set = {}
				for _, lemma_tag_set in ipairs(form_and_tags.lemmas_tag_sets) do
					m_table.insertIfNot(group_by_tag_set, {
						tag_set = lemma_tag_set.tag_set,
						lemmas = {lemma_tag_set.lemma},
					}, {
						key = function(obj) return obj.tag_set end,
						combine = function(obj1, obj2)
							for _, lemma in ipairs(obj2.lemmas) do
								m_table.insertIfNot(obj1.lemmas, lemma)
							end
						end,
					})
				end
				-- Then group further by lemma set.
				local group_by_lemma_set = {}
				for _, by_tag_set in ipairs(group_by_tag_set) do
					m_table.insertIfNot(group_by_lemma_set, {
						tag_sets = {by_tag_set.tag_set},
						lemmas = by_tag_set.lemmas,
					}, {
						key = function(obj) return obj.lemmas end,
						combine = function(obj1, obj2)
							for _, tag_set in ipairs(obj2.tag_sets) do
								m_table.insertIfNot(obj1.tag_sets, tag_set)
							end
						end,
					})
				end
				-- Finally, run the multipart combination algorithm.
				for _, by_lemma_set in ipairs(group_by_lemma_set) do
					for i, tag_set in ipairs(by_lemma_set.tag_sets) do
						by_lemma_set.tag_sets[i] = {tags = rsplit(tag_set, "|", true)}
					end
					if by_lemma_set.tag_sets[2] then -- more than one
						by_lemma_set.tag_sets = require(accel_module).combine_tag_sets_into_multipart(
							by_lemma_set.tag_sets, lang, "verb")
					end
				end
				by_lemma_sets_by_stage[stage] = group_by_lemma_set
			else
				-- First group by lemma.
				local group_by_lemma = {}
				for _, lemma_tag_set in ipairs(form_and_tags.lemmas_tag_sets) do
					m_table.insertIfNot(group_by_lemma, {
						lemma = lemma_tag_set.lemma,
						tag_sets = {lemma_tag_set.tag_set},
					}, {
						key = function(obj) return obj.lemma end,
						combine = function(obj1, obj2)
							for _, tag_set in ipairs(obj2.tag_sets) do
								m_table.insertIfNot(obj1.tag_sets, tag_set)
							end
						end,
					})
				end
				-- Then run the multipart combination algorithm.
				for _, by_lemma in ipairs(group_by_lemma) do
					for i, tag_set in ipairs(by_lemma.tag_sets) do
						by_lemma.tag_sets[i] = {tags = rsplit(tag_set, "|", true)}
					end
					if by_lemma.tag_sets[2] then -- more than one
						by_lemma.tag_sets = require(accel_module).combine_tag_sets_into_multipart(
							by_lemma.tag_sets, lang, "verb")
					end
				end
				-- Finally group further by tag-set set.
				local group_by_tag_set_set = {}
				for _, by_lemma in ipairs(group_by_lemma) do
					m_table.insertIfNot(group_by_tag_set_set, {
						tag_sets = by_lemma.tag_sets,
						lemmas = {by_lemma.lemma},
					}, {
						key = function(obj) return obj.tag_sets end,
						combine = function(obj1, obj2)
							for _, lemma in ipairs(obj2.lemmas) do
								m_table.insertIfNot(obj1.lemmas, lemma)
							end
						end,
					})
				end
				by_lemma_sets_by_stage[stage] = group_by_tag_set_set
			end
		end

		local lines_by_stage = {}
		for _, stage in ipairs(stages) do
			local by_lemma_sets = by_lemma_sets_by_stage[stage]
			local num_lines = 0
			for _, by_lemma_set in ipairs(by_lemma_sets) do
				num_lines = num_lines + #by_lemma_set.tag_sets
			end
			lines_by_stage[stage] = num_lines
		end

		local by_lemma_sets
		if lines_by_stage["lemmas-first"] <= lines_by_stage["tag-sets-first"] then
			by_lemma_sets = by_lemma_sets_by_stage["lemmas-first"]
		else
			by_lemma_sets = by_lemma_sets_by_stage["tag-sets-first"]
		end
		form_and_tags.by_lemma_sets = by_lemma_sets
	end

	local function get_verb_forms_as_inflections()
		local formatted_vforms = {}
		-- FIXME: Duplicated in [[Module:ar-headword]]
		for _, vform in ipairs(vforms) do
			table.insert(formatted_vforms,
				{ label = "[[Appendix:Arabic verbs#Form " .. vform .. "|form " .. vform .. "]]" })
		end
		return formatted_vforms
	end

	local parts = {}
	local function ins(part)
		table.insert(parts, part)
	end
	local function add_lang_to_lemmas(lemmas)
		-- add lang; if the lang is already assigned, this isn't a big deal due to idempotency
		for _, lemma in ipairs(lemmas) do
			lemma.lang = lang
		end
	end

	if forms_and_tags[2] then -- more than one
		ins(require(headword_module).full_headword {
			lang = lang,
			pos_category = "verb forms",
			heads = is_template_example and {"كتبت"} or nil,
			inflections = get_verb_forms_as_inflections(),
			pagename = pagename,
			translits = {"-"},
			force_cat_output = force_cat,
		})
		ins("\n")
		for _, form_and_tags in ipairs(forms_and_tags) do
			ins("\n# ")
			ins(m_links.full_link {
				lang = lang,
				term = form_and_tags.form,
				tr = form_and_tags.translit,
			})
			local pronun = get_pronun(form_and_tags.form, form_and_tags.translit)
			if pronun then
				ins(" " .. pronun)
			end
			ins(":")
			local by_lemma_sets = form_and_tags.by_lemma_sets
			if by_lemma_sets[2] then -- more than one
				for _, by_lemma_set in ipairs(by_lemma_sets) do
					add_lang_to_lemmas(by_lemma_set.lemmas)
					ins("\n## ")
					ins(m_form_of.tagged_inflections {
						lang = lang, tag_sets = by_lemma_set.tag_sets, lemmas = by_lemma_set.lemmas,
						lemma_face = "term", POS = "verb", indent = "###",
					})
				end
			else
				local by_lemma_set = by_lemma_sets[1]
				add_lang_to_lemmas(by_lemma_set.lemmas)
				ins(" ")
				ins(m_form_of.tagged_inflections {
					lang = lang, tag_sets = by_lemma_set.tag_sets, lemmas = by_lemma_set.lemmas,
					lemma_face = "term", POS = "verb",
				})
			end
		end
	else
		local form_and_tags = forms_and_tags[1]
		ins(require(headword_module).full_headword {
			lang = lang,
			pos_category = "verb forms",
			heads = {form_and_tags.form},
			translits = {form_and_tags.translit},
			inflections = get_verb_forms_as_inflections(),
			pagename = pagename,
			force_cat_output = force_cat,
		})
		local pronun = get_pronun(form_and_tags.form, form_and_tags.translit)
		if pronun then
			ins(" " .. pronun)
		end
		ins("\n")
		local by_lemma_sets = form_and_tags.by_lemma_sets
		for _, by_lemma_set in ipairs(by_lemma_sets) do
			add_lang_to_lemmas(by_lemma_set.lemmas)
			ins("\n# ")
			ins(m_form_of.tagged_inflections {
				lang = lang, tag_sets = by_lemma_set.tag_sets, lemmas = by_lemma_set.lemmas,
				lemma_face = "term", POS = "verb",
			})
		end
	end

	return table.concat(parts)
end

function export.verb_form(frame)
	local parargs = frame:getParent().args
	local params = {
		[1] = {required = true, list = true},
		["noautolinktext"] = {type = "boolean"},
		["noautolinkverb"] = {type = "boolean"},
		["pagename"] = {}, -- for testing/documentation pages
		["json"] = {type = "boolean"}, -- for bot use
		["slots"] = {}, -- restrict to only these slots
		["noslots"] = {}, -- restrict to all but these slots
		["t"] = {list = true, separate_no_index = true},
		["gloss"] = {list = true, separate_no_index = true, alias_of = "t"},
		["lit"] = {list = true, separate_no_index = true},
		["pos"] = {list = true, separate_no_index = true},
		["id"] = {list = true, separate_no_index = true},
		["conjid"] = {list = true},
		["noconjid"] = {list = true},
	}
	local m_verb_module = require(ar_verb_module)
	local args = require("Module:parameters").process(parargs, params)
	local argspecs = args[1]
	local forms_and_tags = {}
	local vforms = {}
	local categories = {}
	local err_msgs = {}
	local is_template_example = mw.title.getCurrentTitle().nsText == "Template" and
		mw.title.getCurrentTitle().subpageText == "ar-verb form"
	if not argspecs[1] and is_template_example then -- template invocation
		argspecs = {"كتب<I/a~u.pass>"}
	end

	local function process_argspec(argspec, gloss, i)
		args[1] = argspec
		local alternant_multiword_spec = m_verb_module.do_generate_forms(args, "ar-verb form")

		local non_lemma_form = alternant_multiword_spec.verb_form_of_form

		for _, vform in ipairs(alternant_multiword_spec.verb_forms) do
			m_table.insertIfNot(vforms, vform)
		end

		local lemmas = {}
		local lemma_slot = nil
		for _, slot in ipairs(m_verb_module.potential_lemma_slots) do
			if alternant_multiword_spec.forms[slot] then
				lemma_slot = slot
				for _, formobj in ipairs(alternant_multiword_spec.forms[slot]) do
					local lemmaval = m_links.remove_links(mw.ustring.toNFC(formobj.form))
					local lemma_translit = formobj.translit
					-- In case there is redundant manual translit, remove it so the comparisons are guaranteed to work
					-- out correctly in insertIfNot() comparisons. Otherwise it's possible we are comparing an object
					-- without manual translit to an object with redundant manual translit, in which case we will
					-- wrongly not find them equivalent.
					if lemma_translit and lemma_translit == lang:transliterate(lemmaval) then
						lemma_translit = nil
					end
					-- Ignore any footnotes. FIXME: We could convert them to qualifiers instead.
					table.insert(lemmas, {
						-- use term and tr for compatibility with tagged_inflections()
						term = lemmaval,
						tr = lemma_translit,
						gloss = gloss or args.t[i] or args.t.default,
						lit = args.lit[i] or args.lit.default,
						pos = args.pos[i] or args.pos.default,
						id = args.id[i] or args.id.default,
					})
				end 
				break
			end
		end 

		if not lemmas[1] then
			error(("Can't identify any lemmas when invoking {{ar-conj|%s}}"):format(args[1]))
		end

		local slot_restrictions = args.slots and m_table.listToSet(rsplit(args.slots, ",")) or nil
		local negated_slot_restrictions = args.noslots and m_table.listToSet(rsplit(args.noslots, ",")) or nil
		if slot_restrictions and slot_restrictions.lemma then
			slot_restrictions.lemma = nil
			slot_restrictions[lemma_slot] = true
		end
		if negated_slot_restrictions and negated_slot_restrictions.lemma then
			negated_slot_restrictions.lemma = nil
			negated_slot_restrictions[lemma_slot] = true
		end
		local slots_seen = {}

		local normally_skipped_slots = m_table.listToSet { "ap", "vp", "vn" }
		normally_skipped_slots[lemma_slot] = true
		if slot_restrictions then
			for slot, _ in pairs(slot_restrictions) do
				normally_skipped_slots[slot] = nil
			end
		end

		local matched = false
		for _, slot_accel in ipairs(alternant_multiword_spec.verb_slots) do
			local slot, accel = unpack(slot_accel)
			-- Skip "unsettable" (ancillary/internal) slots.
			if not m_verb_module.unsettable_slots_set[slot] and not normally_skipped_slots[slot] then
				local forms = alternant_multiword_spec.forms[slot]
				if forms then
					for _, formobj in ipairs(forms) do
						local form = m_links.remove_links(mw.ustring.toNFC(formobj.form))
						if non_lemma_form == lang:makeEntryName(form) then
							if (not slot_restrictions or slot_restrictions[slot]) and (
								not negated_slot_restrictions or not negated_slot_restrictions[slot]
							) then
								local translit = formobj.translit
								-- In case there is redundant manual translit, remove it; see above with lemmas for why
								-- we do this.
								if translit and translit == lang:transliterate(form) then
									translit = nil
								end
								for _, lemma in ipairs(lemmas) do
									local form_and_tags = {
										form = form,
										translit = translit,
										lemmas_tag_sets = {{
											lemma = lemma,
											tag_set = accel,
										}},
									}
									m_table.insertIfNot(forms_and_tags, form_and_tags, {
										key = function(formobj)
											return {
												form = formobj.form,
												translit = formobj.translit,
											}
										end,
										combine = function(obj, newobj)
											for _, lemma_tag_set in ipairs(newobj.lemmas_tag_sets) do
												m_table.insertIfNot(obj.lemmas_tag_sets, lemma_tag_set)
											end
										end,
									})
								end
								matched = true
							end
							if slot_restrictions or negated_slot_restrictions then
								slots_seen[slot] = true
							end
						end
					end
				end
			end
		end

		local function concat_lemmas()
			local formvals = {}
			for _, lemma in ipairs(lemmas) do
				table.insert(formvals, lemma.term)
			end
			return table.concat(formvals, "/")
		end

		if next(slots_seen) then
			local function get_slots_seen()
				local slots_seen_list = {}
				for slot, _ in pairs(slots_seen) do
					table.insert(slots_seen_list, slot)
				end
				table.sort(slots_seen_list)
				return slots_seen_list
			end

			local function check_against_slots_seen(restrictions, prefix)			
				for slot, _ in pairs(restrictions) do
					if not slots_seen[slot] then
						error(("%sslot restriction for slot '%s' had no effect (typo?) because it is not any of the slots matching form '%s' for verb '%s': possible values %s"):format(
							prefix, slot, non_lemma_form, concat_lemmas(), table.concat(get_slots_seen(), ",")))
					end
				end
			end

			if slot_restrictions then
				check_against_slots_seen(slot_restrictions, "")
			end
			if negated_slot_restrictions then
				check_against_slots_seen(negated_slot_restrictions, "negated ")
			end
		end
		if not matched then
			error(("'%s' is not any of the forms of the verb '%s'"):format(non_lemma_form, concat_lemmas()))
		end
	end

	for i, argspec in ipairs(argspecs) do
		if argspec:find("^%+") then
			local lemma, verb_form = argspec:match("^%+(.+)<(.-)>$")
			local conjs = find_conjugations(lemma, verb_form, args.conjid[i], args.noconjid[i])
			if type(conjs) == "string" then
				m_table.insertIfNot(err_msgs, conjs)
				m_table.insertIfNot(categories, "Arabic bad invocations of Template:ar-verb form")
			else
				for _, conj in ipairs(conjs) do
					process_argspec(conj.conj, conj.gloss, i)
				end
			end
		else
			process_argspec(argspec, nil, i)
		end
	end

	local parts = {}
	for _, msg in ipairs(err_msgs) do
		table.insert(parts, '<span style="font-weight: bold; color: #CC2200;">' .. msg .. "</span><br />")
	end
	if forms_and_tags[1] then
		table.insert(parts, generate_inflection_of(forms_and_tags, vforms, args.pagename, is_template_example))
	end
	local cat_text
	if categories[1] then
		cat_text = require(utilities_module).format_categories(categories, lang, nil, nil, force_cat)
	else
		cat_text = ""
	end
	return table.concat(parts) .. cat_text
end

return export