Jump to content

Module:grc-utilities

From Wiktionary, the free dictionary

This module contains four functions, three of which are called by other modules.

standardDiacritics takes spacing or nonstandard diacritics and converts them to standard combining diacritics. This function is used by pronunciationOrder.

reorderDiacritics takes the diacritics, removes them from the letter (mw.ustring.toNFD), and reorders them so that macrons or breves are first; diaeresis or breathing mark is second; acute, grave, or circumflex is third; and iota subscript is last. Aside from the iota subscript part, this is the only order in which the diacritics can display correctly, as explained elsewhere. This function is used by Module:typing-aids and {{chars}}.

  • ά̓̆νερ (α◌́◌̓◌̆νερ) → ᾰ̓́νερ (α◌̆◌̓◌́νερ)

pronunciationOrder does the same thing, except that it puts the macron or breve and the iota subscript last and recombines the diacritics (mw.ustring.toNFC) after reordering them. The diaeresis or breathing mark and the accent mark will recombine, while the macron or breve remains uncombined (the function converts it to its spacing form). This function is used by Module:grc-pronunciation and {{grc-IPA}}.

Module:grc-utilities/data holds the diacritic definitions and substitutions that are used by this module.

Tokenization

The function tokenize breaks the text into meaningful units of a single consonant or monophthong letter, or diphthong, with any diacritics, as shown below. This function is used by Module:grc-translit and Module:grc-accent, and by the sandbox module Module:grc-pronunciation/sandbox.

The first argument is the word to be tokenized. The second is a boolean: if true, the function will group εω together as a diphthong, for instance in πόλεως (póleōs), genitive of πόλῐς (pólĭs, city state).

word tokens
ἡμεῖς ἡ, μ, εῖ, ς
οἷαι οἷ, αι
ἀναῡ̈τέω ἀ, ν, α, ῡ̈, τ, έ, ω
δαΐφρων δ, α, ΐ, φ, ρ, ω, ν
τούτῳ τ, ού, τ, ῳ
ὑϊκός ὑ, ϊ, κ, ό, ς
ἡ Ἑλήνη ἡ, , Ἑ, λ, ή, ν, η
νηῦς ν, ηῦ, ς
υἱός υἱ, ό, ς
ὄργυιᾰ ὄ, ρ, γ, υι, ᾰ
οὐ δοκεῖν ἀλλ’ εἶναι ἀγαθὸν οὐ, , δ, ο, κ, εῖ, ν, , ἀ, λ, λ, ’, , εἶ, ν, αι, , ἀ, γ, α, θ, ὸ, ν

Testcases

 

local export = {}

local m_data = mw.loadData("Module:grc-utilities/data")
local m_string_utils = require("Module:string utilities")

local concat = table.concat
local full_link = require("Module:links").full_link
local gsub = m_string_utils.gsub
local match = m_string_utils.match
local insert = table.insert
local sparseConcat = require("Module:table").sparseConcat
local standard_diacritics -- defined below
local tag_text = require("Module:script utilities").tag_text
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

local lang = require("Module:languages").getByCode("grc")
local sc = require("Module:scripts").getByCode("Polyt")

-- Local aliases for the contents of the data module (`m_data`).
-- `groups` is indexed 1 to 4; its entries are used as patterns by
-- pronunciationOrder.
local groups = m_data.groups
-- Replacement table passed to gsub by canonicalize.
local canonical = m_data.canonical
-- Maps a diacritic to a numeric position; read by reorderDiacriticSequence.
local diacritic_order = m_data.diacritic_order
-- Replacement table passed to gsub by standardDiacritics.
local diacritical_conversions = m_data.diacritical_conversions
-- Individual diacritic strings, keyed by name.
local diacritics = m_data.diacritics
-- Pattern fragment used by pronunciationOrder to find runs of diacritics.
local diacritic = m_data.diacritic
local macron = diacritics.macron
local breve = diacritics.breve
local spacing_macron = diacritics.spacing_macron
local spacing_breve = diacritics.spacing_breve
local rough = diacritics.rough
local smooth = diacritics.smooth
local diaeresis = diacritics.diaeresis
local acute = diacritics.acute
local grave = diacritics.grave
-- Note that the field in the data module is named `circum`.
local circumflex = diacritics.circum
local subscript = diacritics.subscript
-- Pattern fragment used by reorderDiacritics to find runs of diacritics.
local combining_diacritic = m_data.combining_diacritic

-- Lua pattern for one UTF-8 character: any byte followed by zero or more
-- continuation bytes (\128-\191).
local UTF8_char = ".[\128-\191]*"
-- Lua pattern for a two-byte sequence whose lead byte is \206 or \207.
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ

-- Maps a single character to the table describing its properties; filled in
-- by add_info.
local info = {}
-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space.
local vowel_t = { vowel = true }
-- iota_t and upsilon_t have the same contents but are distinct tables, so
-- make_tokens can tell them apart with an identity comparison.
local iota_t = { vowel = true, offglide = true }
local upsilon_t = { vowel = true, offglide = true }
-- These don't need any contents.
local breathy_cons_t = {}
-- local consonant_t = {}
local diacritic_t = { diacritic = true }
-- Needed for equality comparisons.
local breathing_t = { diacritic = true }

-- Associate the property table `t` with every character in `characters`.
-- `characters` is either a string, which is split into UTF-8 characters,
-- or an array of single-character strings.
local function add_info(characters, t)
	local list = characters
	if type(characters) == "string" then
		-- Split the string so both input kinds share one registration loop.
		list = {}
		for character in characters:gmatch(UTF8_char) do
			list[#list + 1] = character
		end
	end
	for _, character in ipairs(list) do
		info[character] = t
	end
end

-- Register the property table of every recognized character. Each entry is
-- { characters, property table } and is handed to add_info in the order listed.
for _, class in ipairs{
	{ { macron, breve, diaeresis, acute, grave, circumflex, subscript }, diacritic_t },
	{ { rough, smooth }, breathing_t },
	{ "ΑΕΗΟΩαεηοω", vowel_t },
	{ "Ιι", iota_t },
	{ "Υυ", upsilon_t },
	{ "ϜϝΡρ", breathy_cons_t },
} do
	add_info(class[1], class[2])
end

-- Shared, empty property table returned for any character that was never
-- registered in `info`, so that lookups such as info[c].vowel never index nil.
local not_recognized = {}
setmetatable(info, {
	__index = function()
		return not_recognized
	end,
})

-- Call `func` once for each UTF-8 character of `str`, in order, passing the
-- character as the only argument. Return values of `func` are ignored.
local function forEach(str, func)
	local next_char = str:gmatch(UTF8_char)
	local char = next_char()
	while char do
		func(char)
		char = next_char()
	end
end

-- Passes `term` to tag_text together with the module-level `lang` (language
-- code "grc") and `sc` (script code "Polyt") objects.
-- `face` is forwarded to tag_text unchanged.
-- Returns whatever tag_text returns.
function export.tag(term, face)
	return tag_text(term, lang, sc, face)
end

-- Builds a link to `term` with full_link, using the module-level `lang`
-- (language code "grc") and `sc` (script code "Polyt") objects.
-- `alt` and `tr` are placed in the data table given to full_link under the
-- keys of the same name; `face` is forwarded as its second argument.
-- Returns whatever full_link returns.
function export.link(term, face, alt, tr)
	return full_link({ term = term, alt = alt, lang = lang, sc = sc, tr = tr }, face)
end

-- Convert spacing to combining diacritics, and nonstandard to standard
-- polytonic Greek. The text is decomposed (NFD), every UTF-8 character that
-- has an entry in `diacritical_conversions` is replaced by that entry, and
-- the result is decomposed again.
function export.standardDiacritics(text)
	local decomposed = toNFD(text)
	-- Only the first result of gsub (the string) is kept; the count is dropped.
	local converted = decomposed:gsub(UTF8_char, diacritical_conversions)
	return toNFD(converted)
end
-- Fill in the forward declaration made at the top of the module.
standard_diacritics = export.standardDiacritics

-- Convert variant letter forms to the canonical form, and decompose.
-- The text is first passed through standard_diacritics, then composed,
-- because the characters in `canonical` are in form NFC.
function export.canonicalize(text)
	local composed = toNFC(standard_diacritics(text))
	-- First pass: a character followed by a combining grave (for ϗ̀).
	local grave_pass = composed:gsub(UTF8_char .. grave, canonical)
	-- Second pass: every remaining single character.
	local canonical_text = grave_pass:gsub(UTF8_char, canonical)
	-- Decompose on return.
	return toNFD(canonical_text)
end

--[=[	This function arranges a run of diacritics by the position that
		`diacritic_order` assigns to each of them:
			1. macron or breve
			2. breathings or diaeresis
			3. acute, circumflex, or grave
			4. iota subscript
		Used by [[Module:typing-aids]].
		
		If two diacritics in the run belong in the same position, no error is
		raised (the code that would raise one is commented out below).
		Instead the second diacritic is inserted at that position and the
		occurrence is recorded through the tracking module.
]=]
local function reorderDiacriticSequence(diacritics)
	local slots = {}
	for mark in diacritics:gmatch(UTF8_char) do
		local position = diacritic_order[mark]
		if slots[position] then
			-- The position is already taken.
			-- Place breve after macron.
			if mark == breve then
				position = position + 1
			end
			-- The following might have odd results when there
			-- are three or more diacritics.
			insert(slots, position, mark)
			-- [[Special:WhatLinksHere/Wiktionary:Tracking/grc-utils/too many diacritics]]
			require("Module:debug").track("grc-utils/too many diacritics")
			--[[
			local m_templates = require("Module:grc-utilities/templates")
			error("There are two diacritics, " ..
					m_templates.addDottedCircle(slots[position]) .. " and " ..
					m_templates.addDottedCircle(mark) ..
					" that belong in the same position. There should be only one."
			)
			--]]
		else
			slots[position] = mark
		end
	end
	return sparseConcat(slots)
end

-- Decompose `text` (NFD) and pass every run of two or more combining
-- diacritics through reorderDiacriticSequence. Returns only the resulting
-- string, not the substitution count.
function export.reorderDiacritics(text)
	local decomposed = toNFD(text)
	local diacritic_run = combining_diacritic .. combining_diacritic .. "+"
	local reordered = gsub(decomposed, diacritic_run, reorderDiacriticSequence)
	return reordered
end

--[=[
		This breaks a word into meaningful "tokens", which are
		individual letters or diphthongs with their diacritics.
		Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].

		`text` is read one UTF-8 character at a time; export.tokenize passes
		it in decomposed (NFD) form, so diacritics arrive as separate
		characters after their letter.
		Returns an array of token strings. Unexpected diacritic placement is
		reported with mw.log rather than raised as an error.
--]=]
local function make_tokens(text)
	local tokens, prev_info = {}, {}
	-- vowel_count is the number of vowel letters collected so far for a
	-- possible diphthong. It is reset to 0 by a diacritic, by a non-vowel,
	-- and once a two-vowel group has been completed.
	local token_i, vowel_count = 1, 0
	local prev
	for character in text:gmatch(UTF8_char) do
		local curr_info = info[character]
		-- Split vowels between tokens if not a diphthong.
		if curr_info.vowel then
			vowel_count = vowel_count + 1
			if prev and (not (vowel_count == 2 and curr_info.offglide and prev_info.vowel)
					-- υυ → υ, υ
					-- ιυ → ι, υ
					or prev_info.offglide and curr_info == upsilon_t or curr_info == prev_info) then
				token_i = token_i + 1
				if prev_info.vowel then
					vowel_count = 1
				end
			elseif vowel_count == 2 then
				vowel_count = 0
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		elseif curr_info.diacritic then
			vowel_count = 0
			-- A diacritic always stays in the current token.
			tokens[token_i] = (tokens[token_i] or "") .. character
			if prev_info.diacritic or prev_info.vowel then
				if character == diaeresis then
					-- Split the diphthong in the current token if a diaeresis was found:
					-- the first letter, then the second letter plus any diacritics.
					local previous_vowel, vowel_with_diaeresis = tokens[token_i]:match("^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
					if previous_vowel then
						tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
						token_i = token_i + 1
					else
						-- The vowel preceding the vowel with the diaeresis will already be
						-- placed in the previous token if it has a diacritic:
						-- Περικλῆῐ̈ → Π ε ρ ι κ λ ῆ ῐ̈
						--[[
						mw.log('Diaeresis was found in ' .. text .. ', but the previous token ' ..
							require("Module:Unicode data").add_dotted_circle(tokens[token_i]) ..
							' couldn’t be split because it does not consist of two Basic Greek characters followed by other characters.')
						--]]
					end
				end
			elseif prev_info == breathy_cons_t then
				if curr_info ~= breathing_t then
					mw.log(("The character %s in %s should not have the accent %s on it."):format(
						prev, text, require("Module:grc-utilities/templates").addDottedCircle(character)))
				end
			elseif prev then
				mw.log("The character " .. prev .. " cannot have a diacritic on it.")
			else
				-- The text begins with a diacritic, so `prev` is still nil.
				-- Concatenating nil would raise an error instead of logging,
				-- so this case gets its own message.
				mw.log("The text " .. text .. " begins with a diacritic that has no preceding character.")
			end
		else
			-- Any other character gets a token of its own.
			vowel_count = 0
			if prev then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		end
		prev = character
		prev_info = curr_info
	end
	return tokens
end

-- Token arrays already computed, keyed by the decomposed (NFD) text.
local cache = {}

-- Return the array of tokens for `text`, as produced by make_tokens on the
-- decomposed (NFD) form of the text. The array is stored in `cache` and the
-- same table is returned on every later call with the same text.
function export.tokenize(text)
	text = toNFD(text)
	local tokens = cache[text]
	if not tokens then
		tokens = make_tokens(text)
		cache[text] = tokens
	end
	return tokens
end

--[=[	Places diacritics in the following order:
			1. breathings or diaeresis
			2. acute, circumflex, or grave
			3. macron or breve
			4. iota subscript
		Used by [[Module:grc-pronunciation]].

		The reordering is only done when the text matches groups[1] (the
		macron-or-breve group); the combining macron and breve are then
		replaced by their spacing forms. The result is always composed
		(NFC) before it is returned.		]=]
function export.pronunciationOrder(text)
	text = standard_diacritics(text)
	if not match(text, groups[1]) then
		return toNFC(text)
	end

	-- Rebuild one run of diacritics from at most one match per group.
	local function reorder(sequence)
		-- Put breathing and diaeresis first, then accents, then macron or breve
		local breathing_or_diaeresis = match(sequence, groups[2]) or ""
		local accent = match(sequence, groups[3]) or ""
		local length_mark = match(sequence, groups[1]) or ""
		local iota_subscript = match(sequence, groups[4]) or ""
		return concat{ breathing_or_diaeresis, accent, length_mark, iota_subscript }
	end

	text = gsub(text, diacritic .. diacritic .. "+", reorder)
	text = gsub(text, macron, spacing_macron) -- combining to spacing macron
	text = gsub(text, breve, spacing_breve) -- combining to spacing breve
	return toNFC(text)
end

return export