Module:User:Erutuon/he-translit-omit-nonconsonantal
-- Sort of a reimplementation of [[Module:he-translit]].
-- This version omits matres lectionis and other letters that don't have their
-- consonantal value; a mater that marks vowel length is reflected only in the
-- length of the transliterated vowel.
-- Remaining issues:
-- * some shvas are undecided (?)
-- * cantillation marks and meteg are all interpreted as stress marks
-- (some mark secondary stress, some don't mark stress)
-- * qamats isn't distinguished correctly in all cases
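-- Example (forms and intended outputs taken from the comments further down;
-- untested sketch of expected behavior):
-- export.transliterate("בָּנָיו") is meant to give "bānāw", and
-- export.transliterate("מִצְוֹת") is meant to give "miṣwōṯ".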
local export = {}
local Array = require "Module:array"
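-- Debugging helper: returns a comma-separated list of the Unicode character
-- names of the code points in text, lower-cased and with the "HEBREW LETTER",
-- "HEBREW POINT", etc. prefixes stripped (accents keep "ACCENT " so they stay
-- recognizable).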
local function show_code_point_names(text)
if not text then return "" end
local names = Array()
for cp in mw.ustring.gcodepoint(text) do
-- Remove HEBREW LETTER, HEBREW POINT, etc.
local name = require "Module:Unicode data".lookup_name(cp)
:gsub(
"^HEBREW (%w+) ",
function(type)
if type == "ACCENT" then return "ACCENT " else return "" end
end)
:lower()
names:insert(name)
end
return names:concat ", "
end
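-- Debugging helper: formats tokens i through j as code point names separated
-- by " | ".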
local function show_tokens(tokens, i, j)
return table.concat(Array(tokens):map(show_code_point_names), " | ", i, j)
end
export.show_tokens = show_tokens
local U = mw.ustring.char
local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local ulen = mw.ustring.len
local umatch = mw.ustring.match
local usub = mw.ustring.sub
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)
local macron_above = U(0x0304)
local macron_below = U(0x0331)
local macron = "[" .. macron_above .. macron_below .. "]"
local acute = U(0x0301)
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
local vowel_letters = alef .. he .. waw .. yod
local shin_sin = 'ש'
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam
local schwa = 'ə'
local vowel_map = {
[sheva] = '',
[hataf_segol] = 'ĕ',
[hataf_patah] = 'ă',
[hataf_qamats] = 'ŏ',
[hiriq] = 'i',
[tsere] = 'ē',
[segol] = 'e',
[patah] = 'a',
[qamats] = 'ā',
[qamats_qatan] = 'o',
[qubuts] = 'u',
[holam] = 'ō',
[holam_male] = 'ō',
[holam_haser_for_waw] = 'ō',
[shuruq] = 'ū',
}
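-- Single-code-point vowel signs, concatenated for use in character classes.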
local vowel_diacritics = Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end):concat()
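-- Short vowel values used when the following consonant is doubled
-- (see the is_double check in export.transliterate below).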
local short_vowel_map = {
[holam] = 'o',
[holam_male] = 'o',
[holam_haser_for_waw] = 'o',
[shuruq] = 'u',
}
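-- Long vowel values used when the vowel is written plene, i.e. followed by a
-- silent mater lectionis.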
local plene_map = {
[hiriq] = 'ī',
[tsere] = 'ē',
[qamats] = 'ā',
-- [qamats_qatan] = 'o', -- if plene, then misspelling?
}
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local lamed = 'ל'
local mem = 'מ'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav
local het = 'ח'
local ayin = 'ע'
local letter_map = {
[alef] = 'ʔ',
[bet] = 'b' .. macron_below,
[gimel] = 'g' .. macron_above,
[dalet] = 'd' .. macron_below,
[he] = 'h',
[waw] = 'w',
['ז'] = 'z',
[het] = 'ḥ',
['ט'] = 'ṭ',
[yod] = 'y',
[kaf] = 'k' .. macron_below,
[kaf_final] = 'k' .. macron_below,
[lamed] = 'l',
[mem] = 'm',
['ם'] = 'm',
['נ'] = 'n',
['ן'] = 'n',
['ס'] = 's',
[ayin] = 'ʕ',
[pe] = 'p' .. macron_above,
[pe_final] = 'p' .. macron_above,
['צ'] = 'ṣ',
['ץ'] = 'ṣ',
['ק'] = 'q',
['ר'] = 'r',
[tav] = 't' .. macron_below,
}
local shin_sin_map = {
[shin_dot] = "š",
[sin_dot] = "ś",
}
local letters = shin_sin .. Array.keys(letter_map):filter(function(letter) return ulen(letter) == 1 end):concat()
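-- Hebrew punctuation: maqaf becomes a hyphen, sof pasuq a period.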
local punctuation_map = {
["־"] = "-",
["׃"] = ".",
}
-- First and last code point called "HEBREW ACCENT ...".
local first_accent_cp, last_accent_cp = 0x0591, 0x05AE
local meteg_cp = 0x05BD
local meteg = U(meteg_cp)
local combining_grapheme_joiner_cp = 0x034F
local cgj = U(combining_grapheme_joiner_cp)
local accents = { meteg }
for cp = first_accent_cp, last_accent_cp do
table.insert(accents, U(cp))
end
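-- Desired relative order of diacritics on a single letter; the index into this
-- list is used as the sort key in export.normalize below.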
local diacritic_order = {
{shin_dot, sin_dot},
{dagesh_mappiq},
Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end),
accents,
{cgj},
}
local accent_pattern = U(first_accent_cp) .. "-" .. U(last_accent_cp) .. meteg
local diacritic_pattern = "["
.. shin_dot .. sin_dot
.. dagesh_mappiq
.. vowel_diacritics
.. accent_pattern
.. cgj
.. "]"
local diacritics_pattern = diacritic_pattern .. diacritic_pattern .. "+"
local diacritic_order_map = {}
for i, diacritics in ipairs(diacritic_order) do
for _, diacritic in ipairs(diacritics) do
diacritic_order_map[diacritic] = i
end
end
local function is_accent(token)
if not token then
return false
end
local cp = mw.ustring.codepoint(token)
return first_accent_cp <= cp and cp <= last_accent_cp
or cp == combining_grapheme_joiner_cp
end
-- Fix illogical order of diacritics in Unicode normalization.
-- The default order:
-- letter, vowel points, dagesh or mappiq, accent, shin or sin dot.
-- The desired order:
-- letter, shin or sin dot, dagesh or mappiq, first vowel point, accent,
-- maybe second vowel point if first vowel point is sheva or hiriq.
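-- For example, a shin carrying a dagesh, a qamats, and the shin dot arrives
-- (per the default order above) as shin + qamats + dagesh + shin dot and is
-- reordered to shin + shin dot + dagesh + qamats.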
function export.normalize(text)
text = ugsub(
text,
diacritics_pattern,
function(diacritics)
local diacritics_list = mw.text.split(diacritics, "")
table.sort(
diacritics_list,
function(a, b)
return (diacritic_order_map[a] or 0) < (diacritic_order_map[b] or 0)
end)
-- For now remove combining grapheme joiners... though this might be wrong.
while diacritics_list[#diacritics_list] == cgj do
table.remove(diacritics_list)
end
-- If there are two vowels, put hiriq or sheva after other vowels.
-- If there is also an accent, put it after the first vowel.
-- Assume Unicode normalization:
-- sheva before hiriq before patah before either qamats.
-- This code works for the combinations that appear in the testcases.
-- יְרוּשָׁלִַם, יְרוּשָׁלְַמָה
local i = 0
local first_vowel
repeat
i = i + 1
first_vowel = diacritics_list[i]
until not first_vowel or vowel_diacritics:find(first_vowel)
if first_vowel then
local second_vowel = diacritics_list[i + 1]
if second_vowel then
if first_vowel == hiriq or first_vowel == sheva then
diacritics_list[i], diacritics_list[i + 1] = diacritics_list[i + 1], diacritics_list[i]
end
if is_accent(diacritics_list[i + 2]) then
diacritics_list[i + 1], diacritics_list[i + 2] = diacritics_list[i + 2], diacritics_list[i + 1]
end
end
end
return table.concat(diacritics_list)
end)
return text
end
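-- Tries each pattern in patterns starting from code_point_pos; for the first
-- one that matches exactly at that position, returns its first capture and the
-- end position of the match.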
local function match_alt_one(text, code_point_pos, patterns)
for _, pattern in ipairs(patterns) do
local start_pos, end_pos, capture = ufind(text, pattern, code_point_pos)
if start_pos == code_point_pos then
-- Return first capture (if any) and end of match
return capture, end_pos
end
end
end
local token_patterns = {
"(" .. holam_male .. ")",
"([" .. letters .. waw .. "][" .. shin_dot .. sin_dot .. "]?" .. dagesh_mappiq .. "?)",
"(.)",
}
local function next_token(text, code_point_pos)
return match_alt_one(text, code_point_pos, token_patterns)
end
-- Validate shin dot and sin dot?
local function tokenize(text)
local pos = 1
local tokens = {}
while true do
local token, next_pos = next_token(text, pos)
if not next_pos then
break
end
pos = next_pos + 1
table.insert(tokens, token)
end
return tokens
end
export.tokenize = tokenize
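-- True for the matres lectionis (alef, he, waw, yod), which may be silent.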
local function may_be_silent(token)
return token ~= nil and vowel_letters:find(token, 1, true) ~= nil
end
-- Indicates that a token might be a vowel.
-- Use only after determining that it is not a consonant.
local function is_vowel(token)
return token == holam_male or token == shuruq or (token ~= nil and vowel_diacritics:find(token, 1, true) ~= nil)
end
local function is_preceded_by_unchangeable_vowel(tokens, i)
local token1, token2 = tokens[i - 2], tokens[i - 1]
return token2 == shuruq -- Don't check that this is waw with dagesh.
or token2 == holam_male
or token2 == yod and (token1 == hiriq or token1 == tsere or token1 == segol)
end
local function is_short_vowel(token)
return token == patah or token == segol or token == hiriq or token == qubuts
end
local function is_open_vowel(token)
return token == patah or token == qamats
end
local function has_dagesh(token)
return token ~= nil and token:find(dagesh_mappiq, 1, true) ~= nil
end
local function is_waw(token)
return token ~= nil and token:find(waw, 1, true) == 1
end
local function is_he(token)
return token ~= nil and token:find(he, 1, true) == 1
end
local function is_hataf(token)
return token == hataf_segol or token == hataf_patah or token == hataf_qamats
end
local function get_letter(token)
-- assert(ufind(token, "[" .. letters .. "]") == 1)
if token ~= nil then
return usub(token, 1, 1)
end
end
local function is_guttural(token)
local letter = get_letter(token)
return letter == alef or letter == he or letter == het or letter == ayin
end
local function is_bgdkpt(token)
return token ~= nil and ufind(token, "^[" .. bgdkpt .. "]") == 1
end
-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module needs
-- to account for them.
-- This list is from [[w:Bidirectional control character]].
local bidirectional_control_characters =
U(0x061C) .. U(0x200E) .. U(0x200F) .. U(0x202A) .. "-" .. U(0x202E)
.. U(0x2066) .. "-" .. U(0x2069)
local word_boundary_character = "^[%s%p" .. bidirectional_control_characters .. "]$"
local function is_word_boundary(token)
return token == nil or ufind(token, word_boundary_character) ~= nil
end
local function get_dot(token)
return token and umatch(token, "[" .. shin_dot .. sin_dot .. "]")
end
local function is_followed_by_vowel(tokens, i)
repeat
i = i + 1
until not is_accent(tokens[i])
return is_vowel(tokens[i])
end
local function is_preceded_by_vowel(tokens, i)
repeat
i = i - 1
until not (may_be_silent(tokens[i]) or is_accent(tokens[i]))
return is_vowel(tokens[i])
end
local function get_previous_vowel_pos(tokens, i)
while true do
i = i - 1
local token = tokens[i]
if is_vowel(token) then
return i
elseif is_word_boundary(token) then
return nil
end
end
end
local function get_previous_vowel(tokens, i)
local pos = get_previous_vowel_pos(tokens, i)
if pos then return tokens[pos] end
end
local function get_previous_neighboring_vowel(tokens, i)
while true do
i = i - 1
local token = tokens[i]
if is_vowel(token) then
return token
elseif not is_accent(token) then
return nil
end
end
end
-- Defined below.
local is_consonant
local function skip_before_accent(tokens, i)
repeat
i = i - 1
until not is_accent(tokens[i])
return i
end
local function is_preceded_by_consonant(tokens, i)
return is_consonant(tokens, skip_before_accent(tokens, i))
end
local function makes_furtive_patah(token)
local pos = ufind(token, "[" .. ayin .. het .. he .. "]")
return pos == 1 and (token ~= he or has_dagesh(token))
end
-- Decides whether one of the possibly silent letters is silent,
-- except for some cases of waw (holam male, shuruq).
local function is_silent(tokens, i)
local prev_token, next_token = tokens[skip_before_accent(tokens, i)], tokens[i + 1]
-- special case for יִשָּׂשכָר yiśśāḵār
if tokens[i] == shin_sin and not is_vowel(next_token) then
return true
elseif may_be_silent(tokens[i]) then
if tokens[i] == alef then
-- Alef is pronounced when
-- 1. initial
-- 2. both preceded and followed by written vowels.
return not (is_followed_by_vowel(tokens, i)
and (is_preceded_by_vowel(tokens, i)
or is_word_boundary(prev_token)))
elseif tokens[i] == yod then
return not is_followed_by_vowel(tokens, i)
and (prev_token == hiriq or prev_token == tsere or prev_token == segol
or not is_word_boundary(next_token)) -- בָּנָיו bānāw vs. בָּנַי bānay
elseif tokens[i] == waw then
-- holam + waw is probably incorrect
return prev_token == holam
or not (is_vowel(tokens[i + 1]) or is_word_boundary(tokens[i + 1]))
else
return not is_followed_by_vowel(tokens, i)
end
else
return false
end
end
-- Indicates that a token may be a consonant.
-- Declared as local above.
function is_consonant(tokens, i)
local token = tokens[i]
if is_waw(token) then
return token == waw
or (token == shuruq and not (is_preceded_by_consonant(tokens, i) or is_word_boundary(tokens[i - 1])))
else
return token ~= nil and ufind(token, "[" .. letters .. "]", 1) == 1
end
end
-- Don't double he.
-- Don't double bgdkpt after sheva or at beginning of word.
local function is_double(tokens, i)
local token = tokens[i]
return token ~= nil
and has_dagesh(token)
and not is_he(token)
and not (is_bgdkpt(token) and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1])))
end
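-- True if the letter at position i is preceded by a one-letter prefix
-- (such as mi-, wə-, lə-, ha-); used by the kol special case in
-- export.transliterate below.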
local function is_preceded_by_prefix(tokens, i)
local consonant, vowel = tokens[i - 2], tokens[i - 1]
local letter = get_letter(consonant)
local letter_is_shin = (letter == shin_sin and get_dot(consonant) == shin_dot)
local next_cons_has_dagesh = has_dagesh(tokens[i])
return (vowel == hiriq and letter == mem and next_cons_has_dagesh)
or (vowel == sheva and (
letter == bet or letter == dalet or letter == waw
or letter == kaf or letter == lamed
)
) or (vowel == patah and next_cons_has_dagesh and (
letter == bet or letter == he or letter == kaf or letter == lamed
or letter_is_shin -- very archaic, says [[Module:he-translit]]
)
) or (vowel == segol and next_cons_has_dagesh and letter_is_shin)
end
local function is_in_last_syllable(tokens, i)
while true do
local token = tokens[i + 1]
if is_word_boundary(token)
-- A sequence of consonant + sheva + consonant (+ optional sheva) contains no vowel:
-- וַיֵּבְךְּ wayyēḇk, וַיַּרְא wayyar
or token == sheva and (
is_consonant(tokens, i + 2)
and (tokens[i + 3] == sheva or is_word_boundary(tokens[i + 3]))
) then
return true
elseif is_vowel(token) then
return false
end
i = i + 1
end
end
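-- Heuristics for deciding whether a sheva is vocal (transliterated ə) or
-- silent (omitted).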
local function is_pronounced_sheva(tokens, i)
local previous_vowel = get_previous_vowel(tokens, i)
if tokens[i - 2] == meteg then
return true
-- ignore יְרוּשָׁלְָמָה yərūšālāymā, יְרוּשָׁלְַמָה yərūšālaymā
elseif
is_word_boundary(tokens[i + 1])
or (tokens[i + 1] == alef and is_word_boundary(tokens[i + 2]))
or has_dagesh(tokens[i + 1]) -- check for bgdkpt?
then
return false
elseif
-- after another sheva
previous_vowel == sheva
-- after initial consonant unless following consonant has dagesh
or previous_vowel == nil
-- between identical consonants
or (get_letter(tokens[i - 1]) == get_letter(tokens[i + 1])
and not is_silent(tokens, i + 1))
or is_preceded_by_unchangeable_vowel(tokens, i - 1)
or is_double(tokens, i - 1)
then
return true
elseif is_short_vowel(previous_vowel)
or is_guttural(tokens[i - 1]) then
return false
else -- Leave this catch-all case to make it clear what the default is.
return false
end
end
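-- Main entry point: normalizes and tokenizes the text, then walks the token
-- list, emitting a transliteration (possibly empty) for each token.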
function export.transliterate(text)
local tokens = export.tokenize(export.normalize(text))
local transliteration = {}
local function add_tr(val)
assert(type(val) == "string")
table.insert(transliteration, val)
end
-- Use a manually incremented loop so we can skip
-- furtive patah and matres lectionis tokens.
local i = 1
while true do
local token = tokens[i]
if not token then
break
end
-- This catches silent letters after a consonant;
-- silent letters after a vowel are handled below.
if is_silent(tokens, i) then
add_tr("")
elseif is_consonant(tokens, i) then
local letter = get_letter(token)
local tr = assert(letter_map[letter] or shin_sin_map[get_dot(token)] or letter == shin_sin and shin_sin_map[sin_dot], token)
if has_dagesh(token) then
tr = ugsub(tr, macron, "")
if is_double(tokens, i) then
tr = tr .. tr
end
end
-- Transcribe furtive patah before its consonant and skip it.
if makes_furtive_patah(token) and tokens[i + 1] == patah and is_word_boundary(tokens[i + 2]) then
local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
if not (previous_vowel_pos and is_accent(tokens[previous_vowel_pos + 1])) then
add_tr(acute)
end
add_tr(vowel_map[patah])
i = i + 1
end
add_tr(tr)
elseif is_vowel(token) then
-- Genuine waw holam. Handle the waw and leave the holam to the next
-- bit of code.
-- מִצְוֹת miṣwōṯ
if token == holam_male and tokens[i - 1] == sheva then
add_tr(letter_map[waw])
end
local has_accent = is_accent(tokens[i + 1])
local next_i = i + 1
if has_accent then
next_i = i + 2
end
-- Handle sheva.
if tokens[i] == sheva then
-- implicit ktiv/qre from [[Module:he-translit/testcases]]:
-- יְרוּשָׁלְָמָה yərūšālāymā, יְרוּשָׁלְַמָה yərūšālaymā
if is_open_vowel(get_previous_neighboring_vowel(tokens, i)) then
local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
if not (previous_vowel_pos and is_accent(tokens[previous_vowel_pos + 1])) then
add_tr(acute)
end
add_tr("y")
elseif is_pronounced_sheva(tokens, i) then
add_tr(schwa)
else
add_tr("")
end
-- implicit ktiv/qre from [[Module:he-translit/testcases]]:
-- יְרוּשָׁלִַם yərūšālayim, יְרוּשָׁלִָם yərūšālāyim
elseif token == hiriq and is_open_vowel(get_previous_neighboring_vowel(tokens, i)) then
local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
if not (previous_vowel_pos and is_accent(tokens[previous_vowel_pos + 1])) then
add_tr(acute)
end
add_tr("yi")
-- qamats in a possibly closed syllable, as long as the following two
-- consonants are not identical, in which case the sheva between them
-- must be pronounced, putting the qamats in an open syllable
elseif token == qamats
and not has_accent
and (
(tokens[next_i + 1] == sheva and not is_pronounced_sheva(tokens, next_i + 1))
or is_double(tokens, next_i)
or (is_guttural(tokens[next_i]) and is_hataf(tokens[next_i + 1]))
-- כָּל kol, on its own and with prefixes
or ((get_letter(tokens[i - 1]) == kaf and get_letter(tokens[next_i]) == lamed)
and (is_word_boundary(tokens[next_i + 1])
and (
is_word_boundary(tokens[i - 2])
or is_preceded_by_prefix(tokens, i - 1)
)
)
)
) then
add_tr(vowel_map[qamats_qatan])
else
local vowel = token
local start_i = i
local i_after_silent_letters = next_i - 1
while is_silent(tokens, i_after_silent_letters + 1) do
i_after_silent_letters = i_after_silent_letters + 1
end
if i_after_silent_letters > start_i or token == shuruq or token == holam_male then
if is_double(tokens, i_after_silent_letters + 1) then
add_tr(short_vowel_map[vowel] or vowel_map[vowel])
else
add_tr(plene_map[vowel] or vowel_map[vowel])
end
else
add_tr(vowel_map[token])
end
i = i_after_silent_letters
end
-- This is not completely correct because not all accents indicate stress.
-- I haven't sorted out their functions though.
if has_accent and not is_in_last_syllable(tokens, i) then
add_tr(acute)
end
else
if not (is_accent(token) or token == meteg) then
add_tr(punctuation_map[token] or token)
end
end
i = i + 1
end
return table.concat(transliteration)
end
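-- Wrapper for invocation from wikitext: transliterates the first unnamed
-- argument passed to the #invoke.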
function export.tr_t(frame)
return export.transliterate(frame.args[1])
end
return export