Module:User:Victar/headword
Jump to navigation
Jump to search
- This module sandbox lacks a documentation subpage. You may create it.
- Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace
This is a private module sandbox of Victar, for their own experimentation. Items in this module may be added and removed at Victar's discretion; do not rely on this module's stability.
-- Table of functions returned to callers at the end of the module.
local export = {}
-- Shared read-only data for headword handling, loaded once via mw.loadData.
local m_data = mw.loadData("Module:headword/data")
-- Looked up by part-of-speech category in show_headword_line to decide
-- between the "lemmas" and "non-lemma forms" categories.
local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
-- Looked up by language code in preprocess: languages listed here do not get
-- the "terms needing transliteration" cleanup category.
local notranslit = m_data.notranslit
-- Looked up by script code in full_headword when deciding whether to wrap the
-- displayed page title in a script-class span.
local toBeTagged = m_data.toBeTagged
-- Expected type of each field of the `data` table.
-- NOTE(review): this table is not referenced anywhere else in this file, so
-- it appears to be documentation only (nothing here enforces it) -- confirm.
local parameters = {
lang = { type = "object" },
script = { type = "object" },
heads = { type = "table" },
translits = { type = "table" },
transcripts = { type = "table" },
inflections = { type = "table" },
genders = { type = "table" },
categories = { type = "table" },
pos_category = { type = "string" },
sort_key = { type = "string" },
id = { type = "string" },
}
-- Reports whether `text` contains at least one character belonging to the
-- script identified by `script_code`.
--
-- Returns true/false for string arguments (false when the script is unknown
-- or exposes no character set), and nil, after logging a message, when either
-- argument is not a string.
local function test_script(text, script_code)
    if type(text) ~= "string" or type(script_code) ~= "string" then
        mw.log("Parameters to test_script were incorrect.")
        return nil
    end

    local script = require("Module:scripts").getByCode(script_code)
    local charset = script and script:getCharacters()
    if not charset then
        return false
    end

    -- Drop everything matching %W before testing against the character set.
    local stripped = mw.ustring.gsub(text, "%W", "")
    return mw.ustring.find(stripped, "[" .. charset .. "]") ~= nil
end
-- Normalises `data` in place before it is formatted:
--   * wraps non-table `heads` / `translits` values in tables,
--   * replaces empty-string heads with a default head derived from the page title,
--   * detects the script from the first head when `data.sc` is unset,
--   * fills `data.translits[i]` for every head (generated, linked, or nil),
--   * raises an error if `data.id` is set but is not a string.
-- May append cleanup categories to `data.categories`. Returns nothing.
local function preprocess(data)
--[=[
[[Special:WhatLinksHere/Wiktionary:Tracking/headword/heads-not-table]]
[[Special:WhatLinksHere/Wiktionary:Tracking/headword/translits-not-table]]
]=]
-- A non-table `heads` value is wrapped in a table; a non-nil one is tracked first.
if type(data.heads) ~= "table" then
if data.heads then
require("Module:debug").track("headword/heads-not-table")
end
data.heads = { data.heads }
end
-- Same normalisation for `translits`.
if type(data.translits) ~= "table" then
if data.translits then
require("Module:debug").track("headword/translits-not-table")
end
data.translits = { data.translits }
end
-- An empty list of heads becomes one empty head, which the loop further
-- down replaces with the default head.
if not data.heads or #data.heads == 0 then
data.heads = {""}
end
local title = mw.title.getCurrentTitle()
-- Determine if term is reconstructed
local is_reconstructed = data.lang:getType() == "reconstructed"
or title.nsText == "Reconstruction"
-- Create a default headword.
local subpagename = title.subpageText
local pagename = title.text
local default_head
if is_reconstructed then
-- Remove the "<canonical language name>/" prefix from the page name.
default_head = require("Module:utilities").plain_gsub(pagename, data.lang:getCanonicalName() .. "/", "")
else
default_head = subpagename
end
-- Add links to multi-word page names when appropriate
-- (skipped entirely when the language code is "zh").
if data.lang:getCode() ~= "zh" then
-- Runs of whitespace and/or punctuation: the candidate word breaks.
local spacingPunctuation = "([%s%p]+)"
--[[ Variable containing anything that is
not a punctuation character found inside of words.
Used to exclude characters from the above pattern. ]]
local notWordPunc = "([^-־׳״'.·*]+)"
-- True as soon as one candidate word break contains a character that is
-- not in the word-internal punctuation set above.
local contains_words = false
for possibleWordBreak in mw.ustring.gmatch(default_head, spacingPunctuation) do
if mw.ustring.find(possibleWordBreak, notWordPunc) then
contains_words = true
break
end
end
if (not is_reconstructed) and contains_words then
-- Within one run of spacing/punctuation, close the link before and
-- reopen it after every stretch that is not word-internal punctuation.
local function workaround_to_exclude_chars(s)
return mw.ustring.gsub(s, notWordPunc, "]]%1[[")
end
default_head = "[["
.. mw.ustring.gsub(
default_head,
spacingPunctuation,
workaround_to_exclude_chars
)
.. "]]"
--[=[
use this when workaround is no longer needed:
default_head = "[["
.. mw.ustring.gsub(default_head, WORDBREAKCHARS, "]]%1[[")
.. "]]"
]=]
-- Remove any empty links, which could have been created above
-- at the beginning or end of the string.
default_head = mw.ustring.gsub(default_head, "%[%[%]%]", "")
end
end
-- Reconstructed terms are displayed with a leading asterisk.
if is_reconstructed then
default_head = "*" .. default_head
end
-- If a head is the empty string "", then replace it with the default
for i, head in ipairs(data.heads) do
if head == "" then
head = default_head
else
-- An explicit head equal to the default is redundant; this is only
-- categorised when the language's canonical name is "English".
if head == default_head and data.lang:getCanonicalName() == "English" then
table.insert(data.categories, data.lang:getCanonicalName() .. " terms with redundant head parameter")
end
end
data.heads[i] = head
end
--[[ Try to detect the script if it was not provided
We use the first headword for this, and assume
that all of them have the same script
This *should* always be true, right? ]]
if not data.sc then
data.sc = require("Module:scripts").findBestScript(data.heads[1], data.lang)
end
-- Make transliterations
for i, head in ipairs(data.heads) do
local translit = data.translits[i]
-- Try to generate a transliteration if necessary
-- Generate it if the script is not Latn or similar, and if no transliteration was provided
-- A transliteration of "-" means "show none" and is never auto-generated.
if translit == "-" then
translit = nil
-- NOTE(review): the `not data.sc` test in the last clause below runs only
-- after data.sc:getCode() has already been called in the clause above it,
-- so it cannot be what protects against a nil script here.
elseif not translit
and not (
data.sc:getCode():find("Latn", nil, true)
or data.sc:getCode() == "Latinx"
or data.sc:getCode() == "None"
)
and (not data.sc or data.sc:getCode() ~= "Imag") then
-- Links are removed from the head before it is transliterated.
translit = data.lang:transliterate(require("Module:links").remove_links(head), data.sc)
-- There is still no transliteration?
-- Add the entry to a cleanup category.
if not translit and not notranslit[data.lang:getCode()] then
translit = "<small>transliteration needed</small>"
table.insert(data.categories, data.lang:getCanonicalName() .. " terms needing transliteration")
end
end
-- Link to the transliteration entry for languages that require this
-- (tr = "-" is passed so the link is built without a transliteration of its own).
if translit and data.lang:link_tr() then
translit = require("Module:links").full_link{
term = translit,
lang = data.lang,
sc = require("Module:scripts").getByCode("Latn"),
tr = "-"
}
end
data.translits[i] = translit
end
-- `id` is optional, but must be a string when given.
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
end
end
-- Renders the head(s) in data.heads, followed by their transliterations
-- when at least one head has one.
--
-- data.heads and data.translits are overwritten in place with their
-- formatted versions. Returns the heads joined with " ''or'' ", optionally
-- followed by a link to the language's transliteration page (if that page
-- exists) and the parenthesised transliterations.
local function format_headword(data)
    local links = require("Module:links")
    local script_utils = require("Module:script utilities")

    local head_count = data.heads and #data.heads
    if head_count and data.lang then
        require("Module:debug").track{
            "headword/heads/" .. head_count,
            "headword/heads/" .. head_count .. "/" .. data.lang:getCode()
        }
    end

    -- Heads containing link brackets are processed as links, except when the
    -- script code is "Imag".
    local function wants_link_processing(text)
        if not text:find("[[", nil, true) then
            return false
        end
        return not data.sc or data.sc:getCode() ~= "Imag"
    end

    -- Any slot may be the one holding a transliteration (the first can be
    -- nil while a later one is set), so every index is checked.
    local any_translit = false
    for index, text in ipairs(data.heads) do
        if not any_translit and data.translits[index] then
            any_translit = true
        end

        if wants_link_processing(text) then
            text = links.language_link({term = text, lang = data.lang}, false)
        end

        -- Only the first head receives the id.
        if index ~= 1 then
            text = script_utils.tag_text(text, data.lang, data.sc, "head", nil)
        else
            text = script_utils.tag_text(text, data.lang, data.sc, "head", nil, data.id)
        end
        data.heads[index] = text
    end

    local joined_heads_separator = " ''or'' "
    local translit_suffix = ""

    if any_translit then
        local lang_code_for = data.lang
        for index in ipairs(data.heads) do
            -- Heads without a transliteration are shown as "?".
            local shown = data.translits[index] or "?"
            data.translits[index] = script_utils.tag_translit(shown, lang_code_for:getCode(), "head")
        end
        translit_suffix = " (" .. table.concat(data.translits, joined_heads_separator) .. ")"

        local about_page = mw.title.new(data.lang:getCanonicalName() .. " transliteration", "Wiktionary")
        if about_page then
            -- Reading `exists` is wrapped in pcall so a failure there is
            -- treated the same as "page does not exist".
            local ok, page_exists = pcall(function () return about_page.exists end)
            if ok and page_exists then
                translit_suffix = " [[Wiktionary:" .. data.lang:getCanonicalName() .. " transliteration|•]]" .. translit_suffix
            end
        end
    end

    return table.concat(data.heads, joined_heads_separator) .. translit_suffix
end
-- Renders data.genders as a leading space followed by the formatted gender
-- list, or returns the empty string when there are no genders.
local function format_genders(data)
    local genders = data.genders
    if not genders or #genders == 0 then
        return ""
    end
    local gender_module = require("Module:gender and number")
    return " " .. gender_module.format_list(genders, data.lang)
end
-- Formats one inflection group: an italicised label followed by its forms.
--
-- data:  the headword data table; lang, sc, inflections and force_cat_output
--        are read from it.
-- parts: one inflection group. Array items are the forms (either a plain
--        term string or a table with fields such as term, alt, lang, sc, id,
--        genders, translit, accel, qualifiers, hypothetical, nolink). Named
--        fields used here: label (required), accel, sc,
--        enable_auto_translit, request.
--
-- The array items of `parts` are replaced in place by their formatted text.
-- Returns "''label''" followed by the forms joined with " ''or'' ". When
-- there are no forms and parts.request is set, a "[please provide]" notice
-- and the "<language> entries needing inflection" category are emitted
-- instead.
local function format_inflection_parts(data, parts)
    local m_links = require("Module:links")

    for key, part in ipairs(parts) do
        -- A bare string is shorthand for { term = <string> }.
        if type(part) ~= "table" then
            part = {term = part}
        end

        local qualifiers = ""
        if part.qualifiers and #part.qualifiers > 0 then
            qualifiers = require("Module:qualifier").format_qualifier(part.qualifiers) .. " "
            -- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/qualifier]]
            require("Module:debug").track("headword/qualifier")
        end

        local partaccel = part.accel
        local face = part.hypothetical and "hypothetical" or "bold"
        -- Hypothetical and nolink forms are displayed (via `alt`) but not linked.
        local nolink = part.hypothetical or part.nolink

        -- Convert the term into a full link.
        -- Don't show a transliteration here, the consensus seems to be not to
        -- show them in headword lines to avoid clutter; tr = "-" is therefore
        -- passed unless a transliteration was given or auto-transliteration
        -- is enabled on the group or on data.inflections.
        part = m_links.full_link(
            {
                term = not nolink and part.term or nil,
                alt = part.alt or (nolink and part.term or nil),
                lang = part.lang or data.lang,
                sc = part.sc or parts.sc or (not part.lang and data.sc),
                id = part.id,
                genders = part.genders,
                tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil),
                -- A group-level accel takes precedence over the form's own.
                accel = parts.accel or partaccel,
            },
            face,
            false
        )

        parts[key] = qualifiers .. part
    end

    local parts_output = ""
    if #parts > 0 then
        parts_output = " " .. table.concat(parts, " ''or'' ")
    elseif parts.request then
        -- Fixed: this call used to pass the undefined global `lang` (always
        -- nil) instead of the language object carried in `data`.
        parts_output = " <small>[please provide]</small>"
            .. require("Module:utilities").format_categories(
                {data.lang:getCanonicalName() .. " entries needing inflection"},
                data.lang,
                nil,
                nil,
                data.force_cat_output,
                data.sc
            )
    end

    return "''" .. parts.label .. "''" .. parts_output
end
-- Renders every inflection group that follows the headword.
-- Each entry of data.inflections is replaced in place by its formatted text;
-- the result is the groups joined with ", " inside parentheses (with a
-- leading space), or the empty string when there are no inflections.
local function format_inflections(data)
    local groups = data.inflections
    if not groups or #groups == 0 then
        return ""
    end

    for index, group in ipairs(groups) do
        groups[index] = format_inflection_parts(data, group)
    end

    return " (" .. table.concat(groups, ", ") .. ")"
end
-- Builds the visible headword line (heads, genders, inflections) and adds
-- the part-of-speech and lemma / non-lemma categories to data.categories.
--
-- Raises an error when a reconstructed or appendix-constructed language is
-- used on a page whose namespace text is empty. Returns the formatted line
-- followed by the output of any tracking categories gathered here.
local function show_headword_line(data)
    -- Check the namespace against the language type
    if mw.title.getCurrentTitle().nsText == "" then
        local lang_type = data.lang:getType()
        if lang_type == "reconstructed" then
            error("Entries for this language must be placed in the Reconstruction: namespace.")
        end
        if lang_type == "appendix-constructed" then
            error("Entries for this language must be placed in the Appendix: namespace.")
        end
    end

    local tracking = {}
    local lang_name = data.lang:getCanonicalName()

    local full_pos_category = lang_name .. " " .. data.pos_category
    if full_pos_category ~= "Translingual Han characters" then
        table.insert(data.categories, 1, full_pos_category)
    end

    -- The part of speech is also looked up with a leading "reconstructed "
    -- or "mutated " removed.
    local pos = data.pos_category
    local unreconstructed = pos:gsub("^reconstructed ", "")
    local unmutated = pos:gsub("^mutated ", "")

    local is_lemma = isLemma[pos] or isLemma[unreconstructed]
    local is_non_lemma = not is_lemma and (
        isNonLemma[pos]
        or isNonLemma[unreconstructed]
        or isLemma[unmutated]
        or isNonLemma[unmutated]
    )

    if is_lemma then
        table.insert(data.categories, 1, lang_name .. " lemmas")
    elseif is_non_lemma then
        table.insert(data.categories, 1, lang_name .. " non-lemma forms")
    else
        -- Neither list knows this part of speech: tag it for tracking.
        --[=[
        [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
        ]=]
        table.insert(tracking, "head tracking/unrecognized pos")
        require("Module:debug").track{
            "headword/unrecognized pos",
            "headword/unrecognized pos/lang/" .. data.lang:getCode(),
            "headword/unrecognized pos/pos/" .. data.pos_category
        }
    end

    preprocess(data)

    local links = require("Module:links")
    if data.lang:getType() ~= "reconstructed" then
        for _, head in ipairs(data.heads) do
            local target = links.getLinkPage(links.remove_links(head), data.lang)
            if mw.title.getCurrentTitle().prefixedText ~= target then
                --[=[
                [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]
                ]=]
                require("Module:debug").track{
                    "headword/pagename spelling mismatch",
                    "headword/pagename spelling mismatch/" .. data.lang:getCode()
                }
                -- One mismatch is enough; track at most once per call.
                break
            end
        end
    end

    -- Format the pieces in display order, then join them.
    local headword_text = format_headword(data)
    local genders_text = format_genders(data)
    local inflections_text = format_inflections(data)
    local tracking_text = require("Module:utilities").format_categories(
        tracking, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
    )
    return headword_text .. genders_text .. inflections_text .. tracking_text
end
-- Entry point: produces the complete headword line for the current page,
-- including categories, and sets the page's display title where needed.
--
-- data fields read here: lang (required language object), sc, heads,
-- categories, pos_category, sort_key, force_cat_output. When pos_category is
-- absent it is derived from the first category if that category begins with
-- the language's canonical name (that category is then removed from the
-- list). data.sc and data.categories are filled in when missing, and
-- categories for unusual characters and palindromes are appended.
--
-- Raises an error when `data` is itself a language object, when data.lang is
-- not a language object, or when no part-of-speech category can be found.
-- Returns the headword line followed by the formatted categories.
function export.full_headword(data)
    -- This must be checked before data.lang: otherwise a language object
    -- passed as `data` would trip the data.lang check first and this
    -- message could never be shown.
    if data.getCanonicalName then
        error('The "data" variable supplied to "full_headword" should not be a language object.')
    end

    if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
        error("In data, the first argument to full_headword, data.lang should be a language object.")
    end

    local tracking_categories = {}

    local title = mw.title.getCurrentTitle()
    local pagename = title.text
    local fullPagename = title.fullText
    local namespace = title.nsText
    local subpagename = title.subpageText

    if not data.sc then
        -- Detect the script from the first non-empty head, else the page name.
        data.sc = require("Module:scripts").findBestScript(
            data.heads and data.heads[1] ~= "" and data.heads[1] or pagename,
            data.lang
        )
    else
        -- Track uses of sc parameter
        local best = require("Module:scripts").findBestScript(pagename, data.lang)
        require("Module:debug").track("headword/sc")

        if data.sc:getCode() == best:getCode() then
            require("Module:debug").track("headword/sc/redundant")
            require("Module:debug").track("headword/sc/redundant/" .. data.sc:getCode())
        else
            require("Module:debug").track("headword/sc/needed")
            require("Module:debug").track("headword/sc/needed/" .. data.sc:getCode())
        end
    end

    -- Script-tags the topmost header.
    local displayTitle
    -- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
    -- The parentheses only make the existing and/or precedence explicit.
    if (namespace == "" and data.sc and toBeTagged[data.sc:getCode()])
        or (data.sc:getCode() == "Jpan" and (test_script(pagename, "Hira") or test_script(pagename, "Kana"))) then
        displayTitle = '<span class="' .. data.sc:getCode() .. '">' .. pagename .. '</span>'
    elseif namespace == "Reconstruction" then
        -- `matched` is declared local; it used to leak into the global table.
        local matched
        displayTitle, matched = mw.ustring.gsub(
            fullPagename,
            "^(Reconstruction:[^/]+/)(.+)$",
            function(before, term)
                return before ..
                    require("Module:script utilities").tag_text(
                        term,
                        data.lang,
                        data.sc
                    )
            end
        )

        -- Title did not have the "Reconstruction:<x>/<term>" shape.
        if matched == 0 then
            displayTitle = nil
        end
    end

    if displayTitle then
        local frame = mw.getCurrentFrame()
        frame:callParserFunction(
            "DISPLAYTITLE",
            displayTitle
        )
    end

    if data.force_cat_output then
        --[=[
        [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]
        ]=]
        require("Module:debug").track("headword/force cat output")
    end

    -- Categories are appended below (and by the headword line builder), so a
    -- missing list is treated as empty rather than left nil.
    data.categories = data.categories or {}

    -- Were any categories specified?
    if #data.categories > 0 then
        -- Escaped so that pattern-magic characters in a language name, such
        -- as the hyphens in "Proto-Indo-European", are matched literally.
        local lang_name = require("Module:string").pattern_escape(data.lang:getCanonicalName())
        for _, cat in ipairs(data.categories) do
            -- Does the category begin with the language name? If not, tag it with a tracking category.
            if not mw.ustring.find(cat, "^" .. lang_name) then
                mw.log(cat, data.lang:getCanonicalName())
                table.insert(tracking_categories, "head tracking/no lang category")

                --[=[
                [[Special:WhatLinksHere/Wiktionary:Tracking/head tracking/no lang category]]
                ]=]
                require("Module:debug").track{
                    "headword/no lang category",
                    "headword/no lang category/lang/" .. data.lang:getCode()
                }
            end
        end

        -- Derive the part of speech from the first category when it was not
        -- given explicitly. These two patterns previously used the unescaped
        -- language name and so never matched names containing "-".
        if not data.pos_category
            and mw.ustring.find(data.categories[1], "^" .. lang_name)
        then
            data.pos_category = mw.ustring.gsub(data.categories[1], "^" .. lang_name .. " ", "")
            table.remove(data.categories, 1)
        end
    end

    if not data.pos_category then
        error(
            'No valid part-of-speech categories were found in the list '
            .. 'of categories passed to the function "full_headword". '
            .. 'The part-of-speech category should consist of a language\'s '
            .. 'canonical name plus a part of speech.'
        )
    end

    -- Categorise for unusual characters
    local standard = data.lang:getStandardCharacters()

    if standard then
        -- Single-character titles and "Unsupported titles/" pages are skipped.
        if mw.ustring.len(subpagename) ~= 1 and not mw.ustring.match(pagename, "^Unsupported titles/") then
            for character in mw.ustring.gmatch(subpagename, "([^" .. standard .. "])") do
                -- Use the uppercase form in the category name when the
                -- uppercase form is itself not a standard character.
                local upper = mw.ustring.upper(character)
                if not mw.ustring.find(upper, "[" .. standard .. "]") then
                    character = upper
                end
                table.insert(
                    data.categories,
                    data.lang:getCanonicalName() .. " terms spelled with " .. character
                )
            end
        end
    end

    -- Categorise for palindromes
    if namespace ~= "Reconstruction"
        and require('Module:palindromes').is_palindrome(
            subpagename, data.lang, data.sc
        ) then
        table.insert(data.categories, data.lang:getCanonicalName() .. " palindromes")
    end

    return
        show_headword_line(data) ..
        require("Module:utilities").format_categories(
            data.categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
        ) ..
        require("Module:utilities").format_categories(
            tracking_categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
        )
end
return export