Module:User:Victar/headword

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of Victar, for their own experimentation. Items in this module may be added and removed at Victar's discretion; do not rely on this module's stability.


-- Table of functions this module returns to its callers.
local export = {}

-- Shared, read-only data for the headword module, loaded with mw.loadData.
local m_data = mw.loadData("Module:headword/data")

-- Lookup tables taken from the data module. As used further down:
--   isLemma / isNonLemma  are indexed by a part-of-speech category name,
--   notranslit            is indexed by a language code,
--   toBeTagged            is indexed by a script code.
local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
local notranslit = m_data.notranslit
local toBeTagged = m_data.toBeTagged

-- Declares the expected type of each field of the "data" table that is
-- passed to export.full_headword.
-- NOTE(review): nothing in the visible part of this file reads this table,
-- so it currently serves as documentation only. It also lists "script",
-- whereas the functions below read the script object from data.sc — confirm
-- which name is intended before relying on this table for validation.
local parameters = {
	lang = { type = "object" },
	script = { type = "object" },
	heads = { type = "table" },
	translits = { type = "table" },
	transcripts = { type = "table" },
	inflections = { type = "table" },
	genders = { type = "table" },
	categories = { type = "table" },
	pos_category = { type = "string" },
	sort_key = { type = "string" },
	id = { type = "string" },
}

-- Reports whether `text` contains at least one character belonging to the
-- script identified by `script_code`.
--
-- Returns true or false for string arguments (false as well when the script
-- is unknown or defines no character set). If either argument is not a
-- string, a message is logged and nil is returned.
local function test_script(text, script_code)
	if type(text) ~= "string" or type(script_code) ~= "string" then
		mw.log("Parameters to test_script were incorrect.")
		return nil
	end
	
	local script = require("Module:scripts").getByCode(script_code)
	local charset = script and script:getCharacters()
	if not charset then
		return false
	end
	
	-- Strip everything that is not alphanumeric before testing.
	local stripped = mw.ustring.gsub(text, "%W", "")
	
	return mw.ustring.find(stripped, "[" .. charset .. "]") ~= nil
end


-- Normalises the `data` table in place so that the formatting functions can
-- rely on its shape. In order, it:
--   * coerces data.heads and data.translits into tables (tracking the pages
--     where a non-table value was supplied),
--   * builds a default headword from the current page title and substitutes
--     it for every head that is the empty string,
--   * fills in data.sc with findBestScript when no script was given,
--   * generates, validates and optionally links the transliterations,
--   * raises an error if data.id is present but not a string.
-- Returns nothing; all results are written back into `data`.
local function preprocess(data)
	--[=[
	[[Special:WhatLinksHere/Wiktionary:Tracking/headword/heads-not-table]]
	[[Special:WhatLinksHere/Wiktionary:Tracking/headword/translits-not-table]]
	]=]
	-- A non-table value (including nil) is wrapped in a table; only non-nil
	-- values are tracked.
	if type(data.heads) ~= "table" then
		if data.heads then
			require("Module:debug").track("headword/heads-not-table")
		end
		
		data.heads = { data.heads }
	end
	
	if type(data.translits) ~= "table" then
		if data.translits then
			require("Module:debug").track("headword/translits-not-table")
		end
		
		data.translits = { data.translits }
	end
	
	-- Guarantee at least one head; "" is the placeholder that is replaced
	-- by the default headword further down.
	if not data.heads or #data.heads == 0 then
		data.heads = {""}
	end
	
	local title = mw.title.getCurrentTitle()
	
	-- Determine if term is reconstructed
	local is_reconstructed = data.lang:getType() == "reconstructed"
		or title.nsText == "Reconstruction"
	
	-- Create a default headword.
	local subpagename = title.subpageText
	local pagename = title.text
	local default_head
	if is_reconstructed then
		-- Remove the "<language name>/" part from the page name.
		default_head = require("Module:utilities").plain_gsub(pagename, data.lang:getCanonicalName() .. "/", "")
	else
		default_head = subpagename
	end
	
	-- Add links to multi-word page names when appropriate
	if data.lang:getCode() ~= "zh" then
		local spacingPunctuation = "([%s%p]+)"
		--[[ 	Variable containing anything that is
				not a punctuation character found inside of words.
				Used to exclude characters from the above regex.	]]
		local notWordPunc = "([^-־׳״'.·*]+)"
		local contains_words = false
		
		-- The page name counts as multi-word as soon as one run of
		-- spacing/punctuation contains a character that is not in the
		-- word-internal punctuation set above.
		for possibleWordBreak in mw.ustring.gmatch(default_head, spacingPunctuation) do
			if mw.ustring.find(possibleWordBreak, notWordPunc) then
				contains_words = true
				break
			end
		end
		
		if (not is_reconstructed) and contains_words then
			-- Called once per spacing/punctuation run: closes the current
			-- link before each real word break and opens a new one after it.
			local function workaround_to_exclude_chars(s)
				return mw.ustring.gsub(s, notWordPunc, "]]%1[[")
			end
			
			default_head = "[["
				.. mw.ustring.gsub(
					default_head,
					spacingPunctuation,
					workaround_to_exclude_chars
					)
				.. "]]"
			--[=[
			use this when workaround is no longer needed:
			default_head = "[["
				.. mw.ustring.gsub(default_head, WORDBREAKCHARS, "]]%1[[")
				.. "]]"
			
			Remove any empty links, which could have been created above
			at the beginning or end of the string.
			]=]
			default_head = mw.ustring.gsub(default_head, "%[%[%]%]", "")
		end
	end
	
	-- Reconstructed terms are displayed with a leading asterisk.
	if is_reconstructed then
		default_head = "*" .. default_head
	end
	
	-- If a head is the empty string "", then replace it with the default
	for i, head in ipairs(data.heads) do
		if head == "" then
			head = default_head
		else
			-- An explicit head identical to the default is redundant; this is
			-- only categorised for English.
			if head == default_head and data.lang:getCanonicalName() == "English" then
				table.insert(data.categories, data.lang:getCanonicalName() .. " terms with redundant head parameter")
			end			
		end
		data.heads[i] = head
	end
	
	--[[	Try to detect the script if it was not provided
			We use the first headword for this, and assume
			that all of them have the same script
			This *should* always be true, right?		]]
	if not data.sc then
		data.sc = require("Module:scripts").findBestScript(data.heads[1], data.lang)
	end
	
	-- Make transliterations
	for i, head in ipairs(data.heads) do
		local translit = data.translits[i]
		
		-- Try to generate a transliteration if necessary
		-- Generate it if the script is not Latn or similar, and if no transliteration was provided
		if translit == "-" then
			-- "-" explicitly suppresses the transliteration for this head.
			translit = nil
		-- NOTE(review): data.sc is dereferenced in the second clause below
		-- before the `not data.sc` guard in the third clause is evaluated, so
		-- that guard cannot protect against a nil script.
		elseif not translit
			and not (
				data.sc:getCode():find("Latn", nil, true)
				or data.sc:getCode() == "Latinx"
				or data.sc:getCode() == "None"
				)
			and (not data.sc or data.sc:getCode() ~= "Imag") then
			
			-- Links are stripped from the head before it is transliterated.
			translit = data.lang:transliterate(require("Module:links").remove_links(head), data.sc)
			
			-- There is still no transliteration?
			-- Add the entry to a cleanup category.
			if not translit and not notranslit[data.lang:getCode()] then
				translit = "<small>transliteration needed</small>"
				table.insert(data.categories, data.lang:getCanonicalName() .. " terms needing transliteration")
			end
		end
		
		-- Link to the transliteration entry for languages that require this
		if translit and data.lang:link_tr() then
			translit = require("Module:links").full_link{
				term = translit,
				lang = data.lang,
				sc = require("Module:scripts").getByCode("Latn"),
				tr = "-"
				}
		end
		
		data.translits[i] = translit
	end
	
	-- A non-string id (other than nil/false) is rejected.
	if data.id and type(data.id) ~= "string" then
		error("The id in the data table should be a string.")
	end
end


-- Renders the headword(s) followed by their transliterations, if any.
--
-- Each entry of data.heads is turned into language links (when it contains
-- "[[" and the script is not "Imag") and wrapped by tag_text; only the first
-- head receives data.id. When at least one transliteration is present, every
-- head gets one ("?" standing in for a missing one) and the list is appended
-- in parentheses, preceded by a bullet link when the language's
-- "Wiktionary:<name> transliteration" page exists.
-- data.heads and data.translits are overwritten with the formatted strings.
-- Returns the resulting wikitext string.
local function format_headword(data)
	local links = require("Module:links")
	local script_utils = require("Module:script utilities")
	local separator = " ''or'' "
	
	-- Track how many heads were given (a table length is always truthy in
	-- Lua, so only the presence of the two fields matters here).
	if data.heads and data.lang then
		local head_count = "headword/heads/" .. #data.heads
		require("Module:debug").track{
			head_count,
			head_count .. "/" .. data.lang:getCode()
		}
	end
	
	-- Becomes true as soon as any head has a transliteration; the first
	-- one may be missing while a later one is present.
	local any_translit = false
	
	for index, headword in ipairs(data.heads) do
		any_translit = any_translit or (data.translits[index] and true or false)
		
		-- Turn embedded wikilinks into language links.
		if headword:find("[[", nil, true) and not (data.sc and data.sc:getCode() == "Imag") then
			headword = links.language_link({term = headword, lang = data.lang}, false)
		end
		
		-- Only the first head carries the id.
		local anchor_id
		if index == 1 then
			anchor_id = data.id
		end
		
		data.heads[index] = script_utils.tag_text(headword, data.lang, data.sc, "head", nil, anchor_id)
	end
	
	local formatted_heads = table.concat(data.heads, separator)
	
	if not any_translit then
		return formatted_heads
	end
	
	-- One transliteration per head, with "?" for the missing ones.
	for index in ipairs(data.heads) do
		data.translits[index] = script_utils.tag_translit(
			data.translits[index] or "?",
			data.lang:getCode(),
			"head"
		)
	end
	
	local suffix = " (" .. table.concat(data.translits, separator) .. ")"
	
	local about_page = mw.title.new(data.lang:getCanonicalName() .. " transliteration", "Wiktionary")
	if about_page then
		-- Reading .exists may raise, hence the protected call.
		local ok, page_exists = pcall(function () return about_page.exists end)
		if ok and page_exists then
			suffix = " [[Wiktionary:" .. data.lang:getCanonicalName() .. " transliteration|•]]" .. suffix
		end
	end
	
	return formatted_heads .. suffix
end


-- Renders the gender/number list that follows the headword.
-- Returns "" when data.genders is missing or empty; otherwise a
-- non-breaking space followed by the output of format_list.
local function format_genders(data)
	local genders = data.genders
	if not genders or #genders == 0 then
		return ""
	end
	
	local formatter = require("Module:gender and number")
	return "&nbsp;" .. formatter.format_list(genders, data.lang)
end


-- Formats one inflection group (e.g. all plural forms) of a headword line.
--
-- data:  the headword data table; lang, sc, inflections and force_cat_output
--        are read from it.
-- parts: a table whose array part holds the forms (each a string or a table
--        with term/alt/lang/sc/id/genders/translit/accel/qualifiers/
--        hypothetical/nolink fields) and whose named fields label, sc,
--        accel, enable_auto_translit and request apply to the whole group.
--
-- Every array entry of `parts` is replaced in place by its formatted link.
-- Returns the italicised label followed by the forms joined with "or". If
-- there are no forms but parts.request is set, a "[please provide]" notice
-- with a cleanup category is returned instead.
local function format_inflection_parts(data, parts)
	local m_links = require("Module:links")
	
	for key, part in ipairs(parts) do
		-- A bare string is shorthand for a part with only a term.
		if type(part) ~= "table" then
			part = {term = part}
		end
		
		local qualifiers = ""
		
		if part.qualifiers and #part.qualifiers > 0 then
			qualifiers = require("Module:qualifier").format_qualifier(part.qualifiers) .. " "
			
			-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/qualifier]]
			require("Module:debug").track("headword/qualifier")
		end
		
		local partaccel = part.accel
		local face = part.hypothetical and "hypothetical" or "bold"
		-- Hypothetical or nolink forms are displayed as plain alt text.
		local nolink = part.hypothetical or part.nolink
		
		-- Convert the term into a full link
		-- Don't show a transliteration here, the consensus seems to be not to
		-- show them in headword lines to avoid clutter.
		part = m_links.full_link(
			{
				term = not nolink and part.term or nil,
				alt = part.alt or (nolink and part.term or nil),
				lang = part.lang or data.lang,
				-- The headword's script is only reused when the part does
				-- not name its own language.
				sc = part.sc or parts.sc or (not part.lang and data.sc),
				id = part.id,
				genders = part.genders,
				-- "-" suppresses the transliteration unless automatic
				-- transliteration was enabled for this group or for all
				-- inflections.
				tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil),
				accel = parts.accel or partaccel,
			},
			face,
			false
			)
		
		part = qualifiers .. part
		
		parts[key] = part
	end
	
	local parts_output = ""
	
	if #parts > 0 then
		parts_output = " " .. table.concat(parts, " ''or'' ")
	elseif parts.request then
		-- Fix: the language must come from data.lang. The bare name `lang`
		-- used here before was an undefined global, so nil was passed.
		parts_output = " <small>[please provide]</small>"
			.. require("Module:utilities").format_categories(
				{data.lang:getCanonicalName() .. " entries needing inflection"},
				data.lang,
				nil,
				nil,
				data.force_cat_output,
				data.sc
				)
	end
	
	return "''" .. parts.label .. "''" .. parts_output
end

-- Renders every inflection group after the headword.
-- Each entry of data.inflections is replaced in place by its formatted
-- string; the result is the comma-separated list in parentheses, or "" when
-- there are no inflections.
local function format_inflections(data)
	local inflections = data.inflections
	if not inflections or #inflections == 0 then
		return ""
	end
	
	for index, group in ipairs(inflections) do
		inflections[index] = format_inflection_parts(data, group)
	end
	
	return " (" .. table.concat(inflections, ", ") .. ")"
end

-- Produces the headword line itself: headword(s), genders, inflections and
-- the tracking categories raised here.
--
-- Side effects on `data`: the part-of-speech category and the lemma /
-- non-lemma category are inserted at the front of data.categories, and
-- preprocess(data) is run.
-- Raises an error when a reconstructed or appendix-constructed language is
-- used on a page in the main namespace.
local function show_headword_line(data)
	-- Check the namespace against the language type
	if mw.title.getCurrentTitle().nsText == "" then
		local lang_type = data.lang:getType()
		if lang_type == "reconstructed" then
			error("Entries for this language must be placed in the Reconstruction: namespace.")
		elseif lang_type == "appendix-constructed" then
			error("Entries for this language must be placed in the Appendix: namespace.")
		end
	end
	
	local tracking_categories = {}
	local lang_name = data.lang:getCanonicalName()
	local pos = data.pos_category
	
	local full_pos_category = lang_name .. " " .. pos
	if full_pos_category ~= "Translingual Han characters" then
		table.insert(data.categories, 1, full_pos_category)
	end
	
	-- The part of speech with a leading "reconstructed " / "mutated " removed.
	local unreconstructed = (pos:gsub("^reconstructed ", ""))
	local unmutated = (pos:gsub("^mutated ", ""))
	
	if isLemma[pos] or isLemma[unreconstructed] then
		table.insert(data.categories, 1, lang_name .. " lemmas")
	elseif isNonLemma[pos]
		or isNonLemma[unreconstructed]
		or isLemma[unmutated]
		or isNonLemma[unmutated] then
		-- Mutated forms count as non-lemma even when the base POS is a lemma.
		table.insert(data.categories, 1, lang_name .. " non-lemma forms")
	else
		-- Unknown part of speech: tag it for cleanup.
		--[=[
		[[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
		]=]
		table.insert(tracking_categories, "head tracking/unrecognized pos")
		require("Module:debug").track{
			"headword/unrecognized pos",
			"headword/unrecognized pos/lang/" .. data.lang:getCode(),
			"headword/unrecognized pos/pos/" .. pos
		}
	end
	
	preprocess(data)
	
	local links = require("Module:links")
	
	-- Track pages where some head does not link back to the page itself.
	if data.lang:getType() ~= "reconstructed" then
		for _, head in ipairs(data.heads) do
			local target = links.getLinkPage(links.remove_links(head), data.lang)
			if mw.title.getCurrentTitle().prefixedText ~= target then
				--[=[
				[[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]
				]=]
				require("Module:debug").track{
					"headword/pagename spelling mismatch",
					"headword/pagename spelling mismatch/" .. data.lang:getCode()
				}
				break
			end
		end
	end
	
	-- Build the pieces in display order, then join them.
	local headword_text = format_headword(data)
	local gender_text = format_genders(data)
	local inflection_text = format_inflections(data)
	local category_text = require("Module:utilities").format_categories(
		tracking_categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
	)
	
	return headword_text .. gender_text .. inflection_text .. category_text
end

-- Entry point: generates the complete headword line for an entry,
-- including its categories.
--
-- data is a table with (as read below) lang, sc, heads, categories,
-- pos_category, sort_key and force_cat_output, plus whatever
-- show_headword_line consumes. It is modified in place: data.sc is detected
-- when missing, data.categories is created when missing and extended, and
-- data.pos_category is taken from the first category when not supplied.
--
-- Side effect: may set the page's DISPLAYTITLE so the title is script-tagged.
--
-- Raises an error when `data` is itself a language object, when data.lang
-- is not a language object, or when no part-of-speech category can be found.
-- Returns the wikitext for the headword line followed by its categories.
function export.full_headword(data)
	-- This check has to come first: a language object has no .lang field, so
	-- the data.lang validation below would otherwise always fire instead and
	-- hide this more specific message.
	if data.getCanonicalName then
		error('The "data" variable supplied to "full_headword" should not be a language object.')
	end
	
	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
		error("In data, the first argument to full_headword, data.lang should be a language object.")
	end
	
	local tracking_categories = {}
	
	local title = mw.title.getCurrentTitle()
	local pagename = title.text
	local fullPagename = title.fullText
	local namespace = title.nsText
	local subpagename = title.subpageText
	local lang_name = data.lang:getCanonicalName()
	
	-- Categories are appended unconditionally further down (and by
	-- show_headword_line), so make sure the list exists.
	if data.categories == nil then
		data.categories = {}
	end
	
	if not data.sc then
		data.sc = require("Module:scripts").findBestScript(data.heads and data.heads[1] ~= "" and data.heads[1] or pagename, data.lang)
	else
		-- Track uses of sc parameter
		local best = require("Module:scripts").findBestScript(pagename, data.lang)
		require("Module:debug").track("headword/sc")
		
		if data.sc:getCode() == best:getCode() then
			require("Module:debug").track("headword/sc/redundant")
			require("Module:debug").track("headword/sc/redundant/" .. data.sc:getCode())
		else
			require("Module:debug").track("headword/sc/needed")
			require("Module:debug").track("headword/sc/needed/" .. data.sc:getCode())
		end
	end
	
	-- Script-tags the topmost header.
	local sc_code = data.sc:getCode()
	local displayTitle
	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
	-- The parentheses only spell out the precedence of the original
	-- condition: the Jpan test applies in every namespace.
	if (namespace == "" and toBeTagged[sc_code])
		or (sc_code == "Jpan" and (test_script(pagename, "Hira") or test_script(pagename, "Kana"))) then
		displayTitle = '<span class="' .. sc_code .. '">' .. pagename .. '</span>'
	elseif namespace == "Reconstruction" then
		-- `matched` is declared local; it used to leak into the global table.
		local matched
		displayTitle, matched = mw.ustring.gsub(
			fullPagename,
			"^(Reconstruction:[^/]+/)(.+)$",
			function(before, term)
				return before ..
					require("Module:script utilities").tag_text(
						term,
						data.lang,
						data.sc
					)
			end
		)
		
		-- Title did not have the "Reconstruction:<language>/<term>" shape.
		if matched == 0 then
			displayTitle = nil
		end
	end
	
	if displayTitle then
		local frame = mw.getCurrentFrame()
		frame:callParserFunction(
			"DISPLAYTITLE",
			displayTitle
		)
	end
	
	if data.force_cat_output then
		--[=[
		[[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]
		]=]
		require("Module:debug").track("headword/force cat output")
	end
	
	-- Were any categories specified?
	if #data.categories > 0 then
		-- The language name is escaped because names such as
		-- "Proto-Indo-European" contain "-", which is a quantifier in Lua
		-- patterns and would otherwise prevent the literal name from matching.
		local lang_pattern = "^" .. require("Module:string").pattern_escape(lang_name)
		
		for _, cat in ipairs(data.categories) do
			-- Does the category begin with the language name? If not, tag it with a tracking category.
			if not mw.ustring.find(cat, lang_pattern) then
				mw.log(cat, lang_name)
				table.insert(tracking_categories, "head tracking/no lang category")
				
				--[=[
				[[Special:WhatLinksHere/Wiktionary:Tracking/head tracking/no lang category]]
				]=]
				require("Module:debug").track{
					"headword/no lang category",
					"headword/no lang category/lang/" .. data.lang:getCode()
				}
			end
		end
		
		-- Without an explicit part of speech, the first category supplies it
		-- (minus the language name) and is removed from the list.
		if not data.pos_category
			and mw.ustring.find(data.categories[1], lang_pattern) then
			data.pos_category = (mw.ustring.gsub(data.categories[1], lang_pattern .. " ", ""))
			table.remove(data.categories, 1)
		end
	end
	
	if not data.pos_category then
		error(
			'No valid part-of-speech categories were found in the list '
			.. 'of categories passed to the function "full_headword". '
			.. 'The part-of-speech category should consist of a language\'s '
			.. 'canonical name plus a part of speech.'
			)
	end
	
	-- Categorise for unusual characters
	local standard = data.lang:getStandardCharacters()
	
	if standard then
		-- Single-character titles and "Unsupported titles/" pages are skipped.
		if mw.ustring.len(subpagename) ~= 1 and not mw.ustring.match(pagename, "^Unsupported titles/") then
			for character in mw.ustring.gmatch(subpagename, "([^" .. standard .. "])") do
				-- Prefer the uppercase form unless it is itself a standard
				-- character.
				local upper = mw.ustring.upper(character)
				if not mw.ustring.find(upper, "[" .. standard .. "]") then
					character = upper
				end
				table.insert(
					data.categories,
					lang_name .. " terms spelled with " .. character
				)
			end
		end
	end
	
	-- Categorise for palindromes
	if namespace ~= "Reconstruction"
		and require('Module:palindromes').is_palindrome(
			subpagename, data.lang, data.sc
			) then
		table.insert(data.categories, lang_name .. " palindromes")
	end
	
	-- show_headword_line runs first and may add to data.categories before
	-- they are formatted.
	return
		show_headword_line(data) ..
		require("Module:utilities").format_categories(
			data.categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
			) ..
		require("Module:utilities").format_categories(
			tracking_categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
			)
end

return export