Module:User:Erutuon/he-translit-superscript

This module sandbox lacks a documentation subpage. You may create it.
Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace
This is a private module sandbox of Erutuon, for his own experimentation. Items in this module may be added and removed at Erutuon's discretion; do not rely on this module's stability.
-- Experimenting with a weird semi-phonemic transcription of Hebrew orthography
-- based on the Tiberian pronunciation, as described in this paper:
-- https://www.jstor.org/stable/27909449

--[[
This transliteration departs from Tiberian phonemes as described in
https://www.openbookpublishers.com/product/951
and https://www.tiberianhebrew.com/
in order to more faithfully represent the orthography:
* ב and ו distinguished (both /v/ in most cases, formerly distinguished as /b/ and /w/).
* שׂ and ס distinguished (both /s/, formerly distinguished as /ɬ/ and /s/).
* Superscript letters (matres lectionis and quiescent) don't represent
  consonant phonemes unless we postulate synchronic phonological rules
  to delete them. And apparently they also don't consistently correspond to
  Tiberian phonemic vowel length! There is a vowel length distinction
  inferred for Tiberian Hebrew, but it is maybe indicated with cantillation,
  not sure. See also a different analysis:
  https://www.academia.edu/37684591/The_vocalic_phonemes_of_Tiberian_Hebrew
]]

-- Table of public functions returned at the end of the module.
local export = {}

-- Project helper module; used below via Array.keys(...):filter(...):concat().
local Array = require "Module:array"

-- Short aliases for the Unicode-aware string functions of mw.ustring.
-- Positions and lengths handled by these are in code points, not bytes.
local U = mw.ustring.char
local ufind  = mw.ustring.find
local ugsub  = mw.ustring.gsub
local ulen   = mw.ustring.len
local umatch = mw.ustring.match
local usub   = mw.ustring.sub

-- Hebrew points (combining marks), each built from its code point.
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
-- One code point serves as both dagesh and mappiq.
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)

-- Combining macrons attached to some Latin letters in letter_map.
local macron_above = U(0x0304)
local macron_below = U(0x0331)
-- Character class matching either macron; used to strip them from a
-- transliterated letter when its token carries a dagesh.
local macron = "[" .. macron_above .. macron_below .. "]"

-- The four letters that may be written without being pronounced as consonants.
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
-- Concatenation searched with a plain string find in may_be_silent.
local vowel_letters = alef .. he .. waw .. yod
-- Base letter shared by shin and sin; the following dot tells them apart.
local shin_sin = 'ש'

-- Two-code-point sequences in which waw carries a point.
-- waw + dagesh is ambiguous: it is either shuruq or a doubled waw.
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam

-- Superscript symbols emitted for vowel letters that are treated as silent.
local superscript_map = {
	[alef] = "ˀ",
	[yod] = "ʸ",
	[he] = "ʰ",
	[waw] = "ʷ",
}

-- Transliteration of each vowel point, plus the two waw-based sequences
-- (holam male and shuruq), which add the superscript for waw.
local vowel_map = {
	[sheva] = '', -- ə
	[hataf_segol] = 'ĕ',
	[hataf_patah] = 'ă',
	[hataf_qamats] = 'ŏ',
	[hiriq] = 'i',
	[tsere] = 'e',
	[segol] = 'ɛ', -- or æ
	[patah] = 'a',
	[qamats] = 'ɔ', -- or å
	[qamats_qatan] = 'ɔ', -- or å
	[qubuts] = 'u',
	[holam] = 'o',
	[holam_male] = 'o' .. superscript_map[waw],
	[holam_haser_for_waw] = 'o',
	[shuruq] = 'u' .. superscript_map[waw],
}

-- String of the keys of vowel_map that are one code point long, i.e. without
-- holam male and shuruq. Used inside a character class in normalize and
-- searched as a plain string in is_vowel.
-- NOTE(review): presumably Array.keys/filter/concat behave as their names
-- suggest; confirm against Module:array.
local vowel_diacritics = Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end):concat()

-- The begadkefat letters, including the final forms of kaf and pe.
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
-- Concatenation used inside a character class in is_bgdkpt.
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav

-- Letters that, together with he, are tested in makes_furtive_patah.
local het = 'ח'
local ayn = 'ע'

-- Transliteration of each letter when it has no dagesh. The begadkefat letters
-- carry a combining macron here; transliterate strips it when the token has a
-- dagesh. Shin/sin is absent on purpose: it is resolved through shin_sin_map.
local letter_map = {
	[alef] = 'ʔ',
	[bet] = 'b' .. macron_below,
	[gimel] = 'g' .. macron_above,
	[dalet] = 'd' .. macron_below,
	[he] = 'h',
	[waw] = 'w',
	['ז'] = 'z',
	[het] = 'ḥ',
	['ט'] = 'ṭ',
	[yod] = 'y',
	[kaf] = 'k' .. macron_below,
	[kaf_final] = 'k' .. macron_below,
	['ל'] = 'l',
	['מ'] = 'm',
	['ם'] = 'm',
	['נ'] = 'n',
	['ן'] = 'n',
	['ס'] = 's',
	[ayn] = 'ʕ',
	[pe] = 'p' .. macron_above,
	[pe_final] = 'p' .. macron_above,
	['צ'] = 'ṣ',
	['ץ'] = 'ṣ',
	['ק'] = 'q',
	['ר'] = 'r',
	[tav] = 't' .. macron_below,
}

-- Transliteration of the shin/sin letter, keyed by the dot that follows it.
local shin_sin_map = {
	[shin_dot] = "š",
	[sin_dot] = "ś",
}

-- String of the shin/sin letter followed by the one-code-point keys of
-- letter_map; used inside character classes when matching letter tokens.
-- NOTE(review): presumably Array.keys/filter/concat behave as their names
-- suggest; confirm against Module:array.
local letters = shin_sin .. Array.keys(letter_map):filter(function(letter) return ulen(letter) == 1 end):concat()

-- Hebrew punctuation (maqaf, sof pasuq) and the ASCII characters replacing it.
local punctuation_map = {
	["־"] = "-",
	["׃"] = ".",
}

-- Rearrange the points attached to each letter into a logical order, undoing
-- the order that Unicode normalization imposes on them.
function export.normalize(text)
	-- As noted in [[Module:he-translit]]:
	-- normalized order: consonant, vowel point, dagesh or mappiq, shin or sin dot;
	-- wanted order:     consonant, shin or sin dot, dagesh or mappiq, vowel point.
	local vowel_run = "([" .. vowel_diacritics .. "]*)"
	local dagesh_run = "(" .. dagesh_mappiq .. "*)"
	local dot_run = "([" .. shin_dot .. sin_dot .. "]*)"
	local reordered = ugsub(text, vowel_run .. dagesh_run .. dot_run, "%3%2%1")

	-- In a cluster of exactly two of these vowels that starts with hiriq and
	-- ends with a different vowel, move the hiriq to the end. Returning
	-- nothing leaves the cluster as it was.
	local function hiriq_last(cluster)
		if ulen(cluster) ~= 2 then
			return
		end
		local lead, trail = umatch(cluster, "^(.)(.)$")
		if lead ~= hiriq or trail == hiriq then
			return
		end
		return trail .. lead
	end

	local cluster_pattern = "[" .. hiriq .. patah .. qamats .. qamats_qatan .. "]+"
	-- Assign to a local first so that only the string (not gsub's replacement
	-- count) is returned.
	local result = ugsub(reordered, cluster_pattern, hiriq_last)
	return result
end

-- Try the patterns in order and accept the first one whose match begins
-- exactly at code_point_pos.
-- Returns that pattern's first capture (if it has one) and the position where
-- the match ends; returns nothing when no pattern matches at that position.
local function match_alt_one(text, code_point_pos, patterns)
	for _, candidate in ipairs(patterns) do
		local match_start, match_end, first_capture = ufind(text, candidate, code_point_pos)
		-- A match found further along in the text does not count.
		if match_start == code_point_pos then
			return first_capture, match_end
		end
	end
end

-- Token shapes, tried in this order by next_token. Each pattern has a single
-- capture holding the whole token.
local token_patterns = {
	-- waw followed by holam, taken as one token
	"(" .. holam_male .. ")",
	-- a letter with an optional shin or sin dot and an optional dagesh/mappiq
	"([" .. letters .. waw .. "][" .. shin_dot .. sin_dot .. "]?" .. dagesh_mappiq .. "?)",
	-- fallback: any single character
	"(.)",
}

-- Match one token of token_patterns starting at code_point_pos.
-- Passes through whatever match_alt_one returns: the token and the position
-- of its end, or nothing when no pattern matches at that position.
local function next_token(text, code_point_pos)
	return match_alt_one(text, code_point_pos, token_patterns)
end

-- Validate shin dot and sin dot?
-- Split text into a list of tokens by taking one token after another from the
-- start of the text until no further token can be matched.
local function tokenize(text)
	local tokens = {}
	local cursor = 1
	local token, token_end = next_token(text, cursor)
	while token_end do
		table.insert(tokens, token)
		-- The next token starts right after the end of this one.
		cursor = token_end + 1
		token, token_end = next_token(text, cursor)
	end
	return tokens
end

export.tokenize = tokenize

-- Reports whether a token could be a consonant, i.e. whether it is non-nil
-- and starts with one of the characters in `letters`.
local function is_consonant(token)
	if token == nil then
		return false
	end
	local first_letter_pos = ufind(token, "[" .. letters .. "]", 1)
	return first_letter_pos == 1
end

-- Reports whether a token is non-nil and occurs (as a plain substring) in
-- vowel_letters, i.e. is a bare alef, he, waw or yod.
local function may_be_silent(token)
	if token == nil then
		return false
	end
	local found_at = vowel_letters:find(token, 1, true)
	return found_at ~= nil
end

-- Reports whether a token is certainly a vowel: holam male or one of the
-- single vowel points.
-- Shuruq is left out on purpose, since waw + dagesh may also be a doubled waw.
local function is_vowel(token)
	if token == nil then
		return false
	end
	if token == holam_male then
		return true
	end
	return vowel_diacritics:find(token, 1, true) ~= nil
end

-- Reports whether the token contains the dagesh/mappiq point anywhere.
local function has_dagesh(token)
	local dagesh_pos = token:find(dagesh_mappiq, 1, true)
	return dagesh_pos ~= nil
end

-- Reports whether the token begins with the letter waw.
local function is_waw(token)
	-- Compare the leading bytes of the token with the bytes of waw.
	return token:sub(1, #waw) == waw
end

-- Reports whether the token begins with the letter he.
local function is_he(token)
	-- Compare the leading bytes of the token with the bytes of he.
	return token:sub(1, #he) == he
end

-- Reports whether the token begins with one of the begadkefat letters.
local function is_bgdkpt(token)
	local start_pos = ufind(token, "^[" .. bgdkpt .. "]")
	return start_pos == 1
end

-- Reports whether a token position marks a word boundary: either there is no
-- token (nil, i.e. outside the token list) or the token is a single
-- whitespace or punctuation character.
local function is_word_boundary(token)
	if token == nil then
		return true
	end
	return ufind(token, "^[%s%p]$") ~= nil
end

-- Return the first character of a token, which must be a letter.
-- Raises an assertion error if the token does not start with a character
-- from `letters`.
local function get_letter(token)
	local first_letter_pos = ufind(token, "[" .. letters .. "]")
	assert(first_letter_pos == 1)
	return usub(token, 1, 1)
end

-- Return the shin dot or sin dot found in the token, or nil if it has neither.
local function get_dot(token)
	local dot_class = "[" .. shin_dot .. sin_dot .. "]"
	return umatch(token, dot_class)
end

-- Reports whether the token right after position i is a vowel, counting
-- waw + dagesh (shuruq) as a vowel here.
local function is_followed_by_vowel(tokens, i)
	local following = tokens[i + 1]
	if following == shuruq then
		return true
	end
	return is_vowel(following)
end

-- Reports whether a vowel precedes position i, looking back past any run of
-- bare vowel letters (alef, he, waw, yod) in between. Waw + dagesh (shuruq)
-- counts as a vowel here.
local function is_preceded_by_vowel(tokens, i)
	local j = i
	-- Step back at least once, then keep going while the token may be silent.
	repeat
		j = j - 1
	until not may_be_silent(tokens[j])
	local candidate = tokens[j]
	return is_vowel(candidate) or candidate == shuruq
end

-- Reports whether a token is a letter that can take a furtive patah:
-- ayn, het, or he carrying a mappiq. A bare he does not qualify.
local function makes_furtive_patah(token)
	-- Only the start position is needed; ufind's second result is the end
	-- position of the match, not the matched letter.
	local pos = ufind(token, "[" .. ayn .. het .. he .. "]")
	if pos ~= 1 then
		return false
	end
	-- Any token other than the bare letter he qualifies. A he with mappiq is
	-- a different string from `he`, so it passes this test as well.
	return token ~= he
end

-- Decides whether the vowel letter at position i is silent.
-- Covers alef, yod and he; waw (holam male, shuruq and plain waw) must be
-- dealt with by the caller before this function is reached.
-- Returns false for any token that is not a bare vowel letter.
local function is_silent(tokens, i)
	local current = tokens[i]
	if not may_be_silent(current) then
		return false
	end
	if current == waw then
		error("Waw is supposed to be handled elsewhere; module must be broken.")
	end

	local vowel_follows = is_followed_by_vowel(tokens, i)
	local before, after = tokens[i - 1], tokens[i + 1]

	if current == alef then
		-- Alef is pronounced when it is followed by a written vowel and is
		-- either word-initial or also preceded by a written vowel.
		if not vowel_follows then
			return true
		end
		return not (is_preceded_by_vowel(tokens, i) or is_word_boundary(before))
	end

	if current == yod then
		-- ?
		if vowel_follows then
			return false
		end
		return before == hiriq or before == tsere or before == segol
			or not is_word_boundary(after) -- for בָּנָיו: bɔnɔʸw vs. בָּנַי bɔnay
	end

	-- Remaining case (he): silent unless a vowel follows.
	return not vowel_follows
end

-- Transliterate pointed Hebrew text.
-- The text is normalized and tokenized, then each token is handled in turn:
--   * waw: holam male, shuruq or doubled waw, superscript waw, or consonant w;
--   * silent vowel letters: their superscript symbol;
--   * other letters: their transliteration, with dagesh doubling, furtive
--     patah and a separator between identical letters split by a sheva;
--   * vowel points: their transliteration;
--   * anything else: mapped through punctuation_map or passed through as is.
-- @param text string of Hebrew text
-- @return the transliteration as a string
function export.transliterate(text)
	local tokens = export.tokenize(export.normalize(text))
	local transliteration = {}
	local function add_tr(val)
		assert(type(val) == "string")
		table.insert(transliteration, val)
	end
	-- Use a manually incremented loop so we can skip the furtive patah token.
	local i = 1
	while true do
		local token = tokens[i]
		if not token then
			break
		end
		if is_waw(token) then
			if token == holam_male then
				-- Directly after a sheva, treat the waw as a consonant
				-- followed by holam rather than as a vowel letter.
				if tokens[i - 1] == sheva then
					add_tr(letter_map[waw] .. vowel_map[holam])
				else
					add_tr(vowel_map[holam_male])
				end
			-- waw with dagesh, shuruq
			elseif has_dagesh(token) then
				if is_consonant(tokens[i - 1]) or is_word_boundary(tokens[i - 1]) then
					add_tr(vowel_map[shuruq])
				else
					add_tr("ww")
				end
			elseif
				is_preceded_by_vowel(tokens, i)
				-- final waw in בָּנָיו bɔnɔʸw pronounced
				and not (is_word_boundary(tokens[i + 1]) or is_followed_by_vowel(tokens, i))
			then
				add_tr(superscript_map[waw])
			else
				add_tr("w")
			end
		elseif is_silent(tokens, i) then
			add_tr(superscript_map[token])
		elseif may_be_silent(token) or is_consonant(token) then
			local letter = get_letter(token)
			-- Shin/sin is not in letter_map: use its dot, and fall back to
			-- the sin transliteration when the letter has no dot at all.
			local tr = assert(letter_map[letter] or shin_sin_map[get_dot(token)] or letter == shin_sin and shin_sin_map[sin_dot], token)
			if has_dagesh(token) then
				tr = ugsub(tr, macron, "")
				-- Don't double he.
				-- Don't double bgdkpt after sheva or at beginning of word.
				if not is_he(token) and not (is_bgdkpt(token) and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1]))) then
					tr = tr .. tr
				end
			end
			-- Transcribe furtive patah before its consonant and skip it.
			if makes_furtive_patah(token) and tokens[i + 1] == patah and is_word_boundary(tokens[i + 2]) then
				add_tr(vowel_map[patah])
				i = i + 1
			elseif
				tokens[i - 1] == sheva
				-- Only compare letters when a letter token really precedes the
				-- sheva: get_letter raises an error on nil (sheva at the very
				-- start of the text) and on non-letter tokens such as spaces.
				and is_consonant(tokens[i - 2])
				and get_letter(tokens[i - 2]) == get_letter(token)
			then
				-- Separate two identical letters that have a sheva between them.
				add_tr("'")
			end
			add_tr(tr)
		elseif is_vowel(token) then
			add_tr(vowel_map[token])
		else
			add_tr(punctuation_map[token] or token)
		end
		i = i + 1
	end
	return table.concat(transliteration)
end

-- Hand the table of public functions (normalize, tokenize, transliterate)
-- back to whoever requires this module.
return export