Module:User:Erutuon/he-translit-superscript
Appearance
- This module sandbox lacks a documentation subpage. You may create it.
- Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace
This is a private module sandbox of Erutuon, for his own experimentation. Items in this module may be added and removed at Erutuon's discretion; do not rely on this module's stability.
-- Experimenting with a weird semi-phonemic transcription of Hebrew orthography
-- based on the Tiberian pronunciation, as described in this paper:
-- https://www.jstor.org/stable/27909449
--[[
This transliteration departs from Tiberian phonemes as described in
https://www.openbookpublishers.com/product/951
and https://www.tiberianhebrew.com/
in order to more faithfully represent the orthography:
* ב and ו distinguished (both /v/ in most cases, formerly distinguished as /b/ and /w/).
* שׂ and ס distinguished (both /s/, formerly distinguished as /ɬ/ and /s/).
* Superscript letters (matres lectionis and quiescent) don't represent
consonant phonemes unless we postulate synchronic phonological rules
to delete them. And apparently they also don't consistently correspond to
Tiberian phonemic vowel length! There is a vowel length distinction
inferred for Tiberian Hebrew, but it is maybe indicated with cantillation,
not sure. See also a different analysis:
https://www.academia.edu/37684591/The_vocalic_phonemes_of_Tiberian_Hebrew
]]
-- Table of functions exported by this module (returned at the end of the file).
local export = {}
-- Array helper module; used below as Array.keys(t):filter(fn):concat().
local Array = require "Module:array"
-- Local shorthands for the mw.ustring functions used throughout this module.
local U = mw.ustring.char
local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local ulen = mw.ustring.len
local umatch = mw.ustring.match
local usub = mw.ustring.sub
-- Hebrew vowel points, each built from its code point.
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
-- Other combining marks: dagesh/mappiq and the two dots that go on shin_sin.
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)
-- Combining macrons appended to some Latin letters in letter_map.
local macron_above = U(0x0304)
local macron_below = U(0x0331)
-- Character class (as a pattern string) matching either macron.
local macron = "[" .. macron_above .. macron_below .. "]"
-- The four letters that have an entry in superscript_map.
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
-- Concatenation of the four letters above; searched by may_be_silent.
local vowel_letters = alef .. he .. waw .. yod
-- Base letter shared by shin and sin; the dot is a separate combining mark.
local shin_sin = 'ש'
-- Two-code-point sequences: waw + dagesh, and waw + holam.
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam
-- Superscript transliteration for each of the four letters in vowel_letters;
-- used by export.transliterate for letters judged silent, and inside
-- the vowel_map entries for holam_male and shuruq.
local superscript_map = {
[alef] = "ˀ",
[yod] = "ʸ",
[he] = "ʰ",
[waw] = "ʷ",
}
-- Maps each vowel point, plus the two waw-based sequences (holam_male and
-- shuruq), to its transliteration. Sheva maps to the empty string.
local vowel_map = {
[sheva] = '', -- ə
[hataf_segol] = 'ĕ',
[hataf_patah] = 'ă',
[hataf_qamats] = 'ŏ',
[hiriq] = 'i',
[tsere] = 'e',
[segol] = 'ɛ', -- or æ
[patah] = 'a',
[qamats] = 'ɔ', -- or å
[qamats_qatan] = 'ɔ', -- or å
[qubuts] = 'u',
[holam] = 'o',
[holam_male] = 'o' .. superscript_map[waw],
[holam_haser_for_waw] = 'o',
[shuruq] = 'u' .. superscript_map[waw],
}
-- String made of the vowel_map keys that are exactly one code point long
-- (so the two-code-point holam_male and shuruq keys are left out).
-- Used as a pattern character class in export.normalize and as the haystack
-- of a plain substring search in is_vowel.
local vowel_diacritics = Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end):concat()
-- Letters given a macron in letter_map (the macron is stripped in
-- export.transliterate when the letter carries a dagesh).
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
-- Concatenation of the letters above; used as a character class in is_bgdkpt.
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav
-- Together with he, these letters are tested in makes_furtive_patah.
local het = 'ח'
local ayn = 'ע'
-- Maps each Hebrew letter (final forms included) to its Latin transliteration.
-- shin_sin is deliberately absent: it is resolved through shin_sin_map.
-- The bgdkpt letters are listed with a combining macron, which
-- export.transliterate removes when the letter has a dagesh.
local letter_map = {
[alef] = 'ʔ',
[bet] = 'b' .. macron_below,
[gimel] = 'g' .. macron_above,
[dalet] = 'd' .. macron_below,
[he] = 'h',
[waw] = 'w',
['ז'] = 'z',
[het] = 'ḥ',
['ט'] = 'ṭ',
[yod] = 'y',
[kaf] = 'k' .. macron_below,
[kaf_final] = 'k' .. macron_below,
['ל'] = 'l',
['מ'] = 'm',
['ם'] = 'm',
['נ'] = 'n',
['ן'] = 'n',
['ס'] = 's',
[ayn] = 'ʕ',
[pe] = 'p' .. macron_above,
[pe_final] = 'p' .. macron_above,
['צ'] = 'ṣ',
['ץ'] = 'ṣ',
['ק'] = 'q',
['ר'] = 'r',
[tav] = 't' .. macron_below,
}
-- Transliteration of shin_sin, keyed by the dot it carries.
local shin_sin_map = {
[shin_dot] = "š",
[sin_dot] = "ś",
}
-- String of every letter the module recognizes: shin_sin plus the
-- one-code-point keys of letter_map. Used inside pattern character classes.
local letters = shin_sin .. Array.keys(letter_map):filter(function(letter) return ulen(letter) == 1 end):concat()
-- Replacements for Hebrew punctuation; export.transliterate passes any
-- other unrecognized token through unchanged.
local punctuation_map = {
["־"] = "-",
["׃"] = ".",
}
-- Reorders combining marks so that later stages see them in a logical order.
-- text: Hebrew string. Returns the reordered string.
function export.normalize(text)
	-- Per the comment in [[Module:he-translit]]:
	-- incoming order is consonant, vowel point, dagesh or mappiq, shin or sin dot;
	-- wanted order is consonant, shin or sin dot, dagesh or mappiq, vowel point.
	local mark_run = "([" .. vowel_diacritics .. ']*)(' .. dagesh_mappiq .. "*)([" .. shin_dot .. sin_dot .. "]*)"
	local reordered = ugsub(text, mark_run, "%3%2%1")

	-- For a run of exactly two of these vowels that starts with hiriq and
	-- continues with a different vowel, put the hiriq second.
	-- Any other run is left untouched (the callback returns nothing).
	local function hiriq_last(vowels)
		if ulen(vowels) ~= 2 then
			return
		end
		local first, second = umatch(vowels, "^(.)(.)$")
		if first ~= hiriq or second == hiriq then
			return
		end
		return second .. first
	end

	local vowel_run = "[" .. hiriq .. patah .. qamats .. qamats_qatan .. "]+"
	reordered = ugsub(reordered, vowel_run, hiriq_last)
	return reordered
end
-- Tries each pattern in turn and accepts the first whose match begins exactly
-- at code_point_pos.
-- Returns the first capture of that match (nil if the pattern has none) and
-- the position where the match ends; returns nothing if no pattern qualifies.
local function match_alt_one(text, code_point_pos, patterns)
	for index = 1, #patterns do
		local match_start, match_end, first_capture = ufind(text, patterns[index], code_point_pos)
		if match_start == code_point_pos then
			return first_capture, match_end
		end
	end
end
-- Patterns for one token, tried in this order by next_token.
local token_patterns
do
	local dot_class = "[" .. shin_dot .. sin_dot .. "]"
	token_patterns = {
		-- waw followed by holam
		"(" .. holam_male .. ")",
		-- a letter, then an optional shin/sin dot, then an optional dagesh/mappiq
		"([" .. letters .. waw .. "]" .. dot_class .. "?" .. dagesh_mappiq .. "?)",
		-- anything else: a single character
		"(.)",
	}
end
-- Returns the token starting at code_point_pos and the position of its last
-- character, as produced by match_alt_one.
local function next_token(text, code_point_pos)
	return match_alt_one(text, code_point_pos, token_patterns)
end
-- Splits text into a list of tokens by calling next_token repeatedly.
-- TODO from the original author: validate shin dot and sin dot?
local function tokenize(text)
	local tokens = {}
	local pos = 1
	local token, end_pos = next_token(text, pos)
	-- next_token yields no end position once nothing matches at pos.
	while end_pos do
		tokens[#tokens + 1] = token
		pos = end_pos + 1
		token, end_pos = next_token(text, pos)
	end
	return tokens
end
export.tokenize = tokenize
-- True when the token begins with one of the recognized letters, i.e. it may
-- be a consonant. A nil token gives false.
local function is_consonant(token)
	if token == nil then
		return false
	end
	local letter_pos = ufind(token, "[" .. letters .. "]", 1)
	return letter_pos == 1
end
-- True when the token occurs (as a plain substring) in vowel_letters.
-- A nil token gives false.
local function may_be_silent(token)
	if token == nil then
		return false
	end
	local found_at = vowel_letters:find(token, 1, true)
	return found_at ~= nil
end
-- True when the token is definitely a vowel: holam male or one of the
-- single vowel points. Shuruq is excluded because it could also be "ww".
-- A nil token gives false.
local function is_vowel(token)
	if token == nil then
		return false
	end
	if token == holam_male then
		return true
	end
	local found_at = vowel_diacritics:find(token, 1, true)
	return found_at ~= nil
end
-- True when the token contains the dagesh/mappiq mark anywhere.
local function has_dagesh(token)
	local mark_pos = token:find(dagesh_mappiq, 1, true)
	return mark_pos ~= nil
end
-- True when the token starts with waw (bare waw, shuruq or holam male).
local function is_waw(token)
	return token:sub(1, #waw) == waw
end
-- True when the token starts with he.
local function is_he(token)
	return token:sub(1, #he) == he
end
-- True when the token starts with one of the letters in bgdkpt.
local function is_bgdkpt(token)
	local letter_pos = ufind(token, "^[" .. bgdkpt .. "]")
	return letter_pos == 1
end
-- True at the edge of the token list (nil token) or when the token is a
-- single whitespace or punctuation character.
local function is_word_boundary(token)
	if token == nil then
		return true
	end
	local match_pos = ufind(token, "^[%s%p]$")
	return match_pos ~= nil
end
-- Returns the first character of the token, asserting that it is one of the
-- recognized letters.
local function get_letter(token)
	local letter_pos = ufind(token, "[" .. letters .. "]")
	assert(letter_pos == 1)
	return usub(token, 1, 1)
end
-- Returns the shin dot or sin dot found in the token, or nil if it has neither.
local function get_dot(token)
	local dot_class = "[" .. shin_dot .. sin_dot .. "]"
	return umatch(token, dot_class)
end
-- True when the token right after position i is a vowel or a shuruq.
local function is_followed_by_vowel(tokens, i)
	local following = tokens[i + 1]
	if is_vowel(following) then
		return true
	end
	return following == shuruq
end
-- True when, scanning backwards from position i and skipping any tokens that
-- may be silent letters, the first other token is a vowel or a shuruq.
local function is_preceded_by_vowel(tokens, i)
	local j = i
	repeat
		j = j - 1
	until not may_be_silent(tokens[j])
	local candidate = tokens[j]
	return is_vowel(candidate) or candidate == shuruq
end
-- True when the token's letter can take a furtive patah: the token starts
-- with ayn or het, or with he provided the token is not a bare he.
local function makes_furtive_patah(token)
	-- Only the start index is needed; ufind's second result is the end index,
	-- not the matched letter, so it is not bound to a name.
	local pos = ufind(token, "([" .. ayn .. het .. he .. "])")
	-- A token equal to bare he never contains a dagesh, so the has_dagesh
	-- test only documents intent: he qualifies when it carries a mappiq.
	return pos == 1 and (token ~= he or has_dagesh(token))
end
-- Decides whether the possibly-silent letter at position i is silent.
-- Waw is not handled here (holam male, shuruq and plain waw are dealt with by
-- the caller); reaching this function with a bare waw raises an error.
-- Tokens that are not alef, he, waw or yod are never silent.
local function is_silent(tokens, i)
	local token = tokens[i]
	if not may_be_silent(token) then
		return false
	end
	if token == waw then
		error("Waw is supposed to be handled elsewhere; module must be broken.")
	end

	local before, after = tokens[i - 1], tokens[i + 1]
	local vowel_follows = is_followed_by_vowel(tokens, i)

	if token == alef then
		-- Alef is pronounced only when a written vowel follows it and it is
		-- either word-initial or also preceded by a written vowel.
		if not vowel_follows then
			return true
		end
		return not (is_preceded_by_vowel(tokens, i) or is_word_boundary(before))
	end

	if token == yod then
		-- Yod before a vowel is always pronounced. Otherwise it is silent
		-- after hiriq, tsere or segol, or when it is not word-final
		-- (for בָּנָיו: bɔnɔʸw vs. בָּנַי bɔnay).
		if vowel_follows then
			return false
		end
		return before == hiriq or before == tsere or before == segol
			or not is_word_boundary(after)
	end

	-- Remaining case (he): silent unless a vowel follows.
	return not vowel_follows
end
-- Transliterates pointed Hebrew text.
-- The text is passed through export.normalize and export.tokenize, then each
-- token is transcribed according to its neighbours.
-- text: Hebrew string. Returns the transliteration as a string.
-- Raises an error if a letter token has no entry in letter_map/shin_sin_map.
function export.transliterate(text)
	local tokens = export.tokenize(export.normalize(text))
	local transliteration = {}
	local function add_tr(val)
		assert(type(val) == "string")
		table.insert(transliteration, val)
	end
	-- Use a manually incremented loop so we can skip the furtive patah token.
	local i = 1
	while true do
		local token = tokens[i]
		if not token then
			break
		end
		if is_waw(token) then
			if token == holam_male then
				-- After a sheva the waw is consonantal and carries the holam.
				if tokens[i - 1] == sheva then
					add_tr(letter_map[waw] .. vowel_map[holam])
				else
					add_tr(vowel_map[holam_male])
				end
			-- waw with dagesh, shuruq
			elseif has_dagesh(token) then
				if is_consonant(tokens[i - 1]) or is_word_boundary(tokens[i - 1]) then
					add_tr(vowel_map[shuruq])
				else
					add_tr("ww")
				end
			elseif
				is_preceded_by_vowel(tokens, i)
				-- final waw in בָּנָיו bɔnɔʸw pronounced
				and not (is_word_boundary(tokens[i + 1]) or is_followed_by_vowel(tokens, i))
			then
				add_tr(superscript_map[waw])
			else
				add_tr("w")
			end
		elseif is_silent(tokens, i) then
			add_tr(superscript_map[token])
		elseif may_be_silent(token) or is_consonant(token) then
			local letter = get_letter(token)
			-- shin_sin is not in letter_map: look it up by its dot, falling
			-- back to the sin transliteration when no dot is present.
			local tr = assert(letter_map[letter] or shin_sin_map[get_dot(token)] or letter == shin_sin and shin_sin_map[sin_dot], token)
			if has_dagesh(token) then
				tr = ugsub(tr, macron, "")
				-- Don't double he.
				-- Don't double bgdkpt after sheva or at beginning of word.
				if not is_he(token) and not (is_bgdkpt(token) and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1]))) then
					tr = tr .. tr
				end
			end
			-- Transcribe furtive patah before its consonant and skip it.
			if makes_furtive_patah(token) and tokens[i + 1] == patah and is_word_boundary(tokens[i + 2]) then
				add_tr(vowel_map[patah])
				i = i + 1
			elseif
				tokens[i - 1] == sheva
				-- Guard: the token before the sheva may be missing or not a
				-- letter (e.g. text starting with a sheva), in which case
				-- get_letter would raise instead of transliterating.
				and is_consonant(tokens[i - 2])
				and get_letter(tokens[i - 2]) == get_letter(token)
			then
				-- Same letter on both sides of a sheva: separate the two with
				-- an apostrophe (presumably so they are not read as a doubled
				-- consonant).
				add_tr("'")
			end
			add_tr(tr)
		elseif is_vowel(token) then
			add_tr(vowel_map[token])
		else
			add_tr(punctuation_map[token] or token)
		end
		i = i + 1
	end
	return table.concat(transliteration)
end
return export