Jump to content

Module:he-translit

From Wiktionary, the free dictionary

This module transliterates Hebrew-language text per WT:HE TR. It should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}; within a module, use Module:languages#Language:transliterate.

For testcases, see Module:he-translit/testcases.

Functions

[edit]
tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local export = {}

--Contributors: Malku H₂n̥rés, Sartma, Erutuon, Metaknowledge

local m_str_utils = require("Module:string utilities")

local gcodepoint = m_str_utils.gcodepoint
local match = m_str_utils.match
local s = m_str_utils.gsub
local U = m_str_utils.char

-- Fragment meant to be spliced inside a [...] character set: three single
-- code points followed by the ranges U+202A-U+202E and U+2066-U+2069.
local bidirectional_control_characters = table.concat({
	U(0x061C),
	U(0x200E),
	U(0x200F),
	U(0x202A) .. "-" .. U(0x202E),
	U(0x2066) .. "-" .. U(0x2069),
})
-- Frontier patterns for word edges. The boundary set is: whitespace, %z,
-- any bidirectional control character, and a literal hyphen.
-- word_end fires where a boundary character follows a non-boundary one;
-- word_start (negated set) fires where a non-boundary character follows a
-- boundary one.
local word_end = "%f[" .. "%s%z" .. bidirectional_control_characters .. "%-" .. "]"
local word_start = "%f[^" .. "%s%z" .. bidirectional_control_characters .. "%-" .. "]"
-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module needs
-- to account for them.
-- This list is from [[w:Bidirectional control character]].

-- V: pattern fragment for one transliterated vowel: a base letter from the
-- set, optionally followed by one combining mark from the second set, then an
-- optional further combining mark (presumably the acute used for stress; see
-- the ["́"] entry in table m).
-- NOTE(review): the first set appears to contain an invisible zero-width
-- character between "u" and "ā" (the same seems true of the set on the rule
-- "([+…āēīōūoE])" in table b) — confirm whether that is intentional.
local V = "[aɔɛeiăəou‌āēīōūêôáéíóúḗṓếố][̂̄̆]?́?" 
-- C: pattern fragment for one transliterated consonant. It includes the
-- single-letter stand-ins T, Z, C, D, K that table l introduces and expands
-- into digraphs at the very end.
local C = "[ʔḇḡḏhwzḥṭylsʕqrśšṯ'ḵmnfṣbdgptkjc″vḫẓġTZCDK]"

-- Per-character substitution table. export.BH applies it first, with
-- s(text, '.', c): each character that is a key here is replaced by its value.
-- Letters that have a distinct final form (kaf, mem, nun, pe, tsade) are not
-- listed here; they are handled by the first rules of table b.
-- Diacritics are turned into one-character placeholders (· ˊ ˇ ^ +), and shin
-- into ß, so that the ordered rules in b can match on them.
local c = { --direct translit
	--full char ie. C
    ["א"] = "ʔ",
    ["ב"] = "ḇ",
    ["ג"] = "ḡ",
    ["ד"] = "ḏ",
    ["ה"] = "h",
    ["ו"] = "w",
    ["ז"] = "z",
    ["ח"] = "ḥ",
    ["ט"] = "ṭ",
    ["י"] = "y",
    ["ל"] = "l",
    ["ס"] = "s",
    ["ע"] = "ʕ",
    ["ק"] = "q",
    ["ר"] = "r",
    ["ש"] = "ß", --placeholder: resolved to š/ś by the s(h)in dot rules in b, deleted if left unpointed
    ["ת"] = "ṯ",
	--miscellaneous:
	["׳"] = "'", --geresh
    ["־"] = "-", --hyphen
    ["׃"] = " .", --dot (the leading space is removed again by the last rule of b)
	["ׂ"] = "ˊ", --sin dot
	["ׁ"] = "ˇ", --shin dot
    ["ּ"] = "·", --dagesh
	["֫"] = "^", --oleh (stress placeholder, turned into an acute by the stress rules in b)
	["ֽ"] = "+", --meteg (kept by export.BH, stripped by export.BH_tr or by table l)
	--niqqud ie. V
	["ַ"] = "a",
	["ָ"] = "ɔ",
	["ֶ"] = "ɛ",
	["ֵ"] = "e",
	["ִ"] = "i",
	["ֳ"] = "ɔ̆",
	["ֲ"] = "ă",
	["ֱ"] = "ɛ̆",
	["ְ"] = "ə",
	["ֹ"] = "o",
	["ֻ"] = "u",
	["ׇ"] = "ɔ",
}

-- Ordered list of {pattern, replacement} pairs for Biblical Hebrew.
-- export.BH runs them one after another, from first to last, on the output of
-- the table c substitution. Order matters: several rules introduce temporary
-- upper-case placeholders (E, U, H, Ə) that later rules consume, and the
-- dagesh (·), s(h)in dot (ˊ ˇ), oleh (^) and meteg (+) placeholders from c are
-- matched throughout.
local b = { --BH
	--when different final form
	{"[כך]", "ḵ"},
	{"[מם]", "m"},
	{"[נן]", "n"},
	{"[פף]", "f"},
	{"[צץ]", "ṣ"},

	--normalise the order of the marks attached to a letter
	{"(" .. V .. ")(·?)(+?)(^?)([ˊˇ]?'?)", "%5%2%1%4%3"},  --order: s(h)in dot, geresh, dagesh, vowel (niqqud), oleh, meteg
	--bgdkft: fricative + dagesh > stop
	{"ḇ·", "b"},
	{"ḡ·", "g"},
	{"ḏ·", "d"},
	{"ṯ·", "t"},
	{"ḵ·", "k"},
	{"f·", "p"},
	--s(h)in dot
	{"ß(·?)ˇ", "š%1"},
	{"ß(·?)ˊ", "ś%1"},
	--vowel lengthenings
	--each lengthening is followed by rules that undo it when the y/w turns out
	--to carry its own vowel or a dagesh
	{"i([+^]?)y", "ī%1"}, --V > long / _{jw}{no V no dagesh}
	{"ī([+^]?" .. V .. ")", "iy%1"},
	{"ī·", "iy·"},
	{"e([+^]?)y", "ē%1"},
	{"ē([+^]?" .. V .. ")", "ey%1"},
	{"ɛ([+^]?)y", "E%1"},  --see E > ɛ̄ below
	{"E([+^]?" .. V .. ")", "ɛy%1"},
	{"(" .. C .. "·?)wo", "%1ō"},
	{"(" .. V .. "[+^]?)w·", "%1U"},  --U temporarily protects w· after a vowel from the next rule
	{"w·", "ū"},
	{"U", "w·"},
	{"(" .. C .. "·?)y·", "%1ī"},
	--h > circumflex / V_{no V no dagesh}
	--H temporarily marks an h that follows a vowel
	{"(" .. V .. "[+^]?)h", "%1H"},
	{"H(" .. V .. ")", "h%1"},
	{"H·", "h"},
	{"e([+^]?)H", "ê%1"},
	{"o([+^]?)H", "ô%1"},
	{"ɛ([+^]?)H", "ɛ̂%1"},
	{"ɔ([+^]?)H", "ɔ̂%1"},
	{"a([+^]?)H", "â%1"},

	{"(" .. V .. "[+^]?%s?)(.)·(%s?" .. V .. ")", "%1%2%2%3"},  --dagesh gemination
	{"[·ß]", ""},  --deletion of unpointed s(h)ins and useless dageshim
	--schwa: Ə means "kept"
	{"ə" .. word_end, ""},
	{"ə([ḇḡḏḵfṯ])", "Ə%1"},
	{"([+‌āēīōūoE])(" .. C .. ")ə", "%1%2Ə"},
	{"E", "ɛ̄"},  --see >E above
	{"(" .. C .. "ə?" .. C .. ")ə", "%1Ə"},
	{"(" .. C .. ")Ə(" .. C .. ")([Əə])", "%1ə%2Ə"},
	{word_start .. "([ūw]?a?" .. C .. ")ə", "%1Ə"},
	{"ə", ""},  --every schwa not marked as kept is dropped
	{"Ə", "ə"},  --kept schwas go back to their normal spelling

	{"([ʕhḥ])a(" .. word_end .. ")", "^a%1%2"},  --final /a/-guttural inversion
	--penultimate stress: segolates & -áyiC
	{"(" .. C .. "[eɛo])(%+?".. C .. "ɛ" .. C .. ")" .. word_end, "%1^%2"},
	{"(" .. C .. "a)(%+?".. C .. C .. "?a" .. C ..")" .. word_end, "%1^%2"},
	{"ayi(" .. C .. ")" .. word_end, "a^yi%1"},
	--stress marking
	--vowel + ^ placeholder > vowel with acute
	{"a^", "á"},
	{"e^", "é"},
	{"i^", "í"},
	{"o^", "ó"},
	{"u^", "ú"},
	{"ɛ^", "ɛ́"},
	{"ɔ^", "ɔ́"},
	{"ā^", "ā́"},
	{"ē^", "ḗ"},
	{"ī^", "ī́"},
	{"ō^", "ṓ"},
	{"ū^", "ū́"},
	{"ɛ̄^", "ɛ̄́"},
	{"ɔ̄^", "ɔ̄́"},
	{"ê^", "ế"},
	{"ô^", "ố"},
	{"ɛ̂^", "ɛ̂́"},
	{"ɔ̂^", "ɔ̂́"},

	{"ɔyw(" .. word_end .. ")", "ɔw%1"},  --irregular…
	{"(" .. V .. "[+^]?)([bdgptk])(" .. V .. ")", "%1%2%2%3"},  --dagesh bgdkft gemination
	{"f", "p̄"},  --bc p̄ are 2 chars
	{"%s%.", "."},  --quotes: " ." > "." (esthetics)
}

--MH
-- Per-character substitution table for Modern Hebrew. export.MH_tr applies it
-- with s(..., '.', m) to the output of export.BH, before the ordered rules in
-- table l.
local m = { --direct change
	["ḏ"] = "d",
	["ḡ"] = "g",
	["ś"] = "s",
	["״"] = "″", --gershayim
	["q"] = "k",
	["ī"] = "i",
	["ū"] = "u",
	["́"] = "^", --stress marking conversion below
}

-- Ordered list of {pattern, replacement} pairs for Modern Hebrew.
-- export.MH_tr runs them one after another, from first to last, after the
-- table m substitution. Order matters: the upper-case letters T, Z, C, D, K
-- are single-character stand-ins introduced early and only expanded into
-- their displayed digraphs by the final rules, and the ^ stress placeholder
-- is only turned back into an acute near the end.
local l = {
	--indirect
	{"p̄", "f"},
	{"[̂̆̄]", ""},
	{"ḥ'", "ḫ"},
	{"ṯ'", "T"},
	{"ṭ'", "ẓ"},
	{"g'", "j"},
	{"z'", "Z"},
	{"ṣ'", "C"},
	{"d'", "D"},
	{"[rʕ]'", "ġ"},
	{"(.)%1", "%1"},
	{"[ḇw]", "v"},
	{"[ḵḥ]", "K"},
	{"[ṯṭ]", "t"},
	{"'", ""},
	{"[ʔʕ]", "'"},
	--above: loss of vowel length, loss of gemination, turning n-grams into 1 char, MH mergers.

	--schwa
	--prefixes
	-- {word_start .. "([bvkKlšdm])ə", "%1e"},
	-- {"(u[bvkKlšdm])ə", "%1e"},
	--initial C clusters
	{word_start .. "([rnmly])ə", "%1e"},
	{word_start .. "(" .. C .. ")ə([h'])", "%1e%2"},
	--internal
	{"([ə+]" .. C .. ")ə", "%1e"},
	{"(" .. C .. C .. ")ə", "%1e"},
	{"[ə+]", ""}, --deletion of remaining schwa and metegim

	--put here not above to avoid e/ə confusion
	{"[āâă]", "a"},
	{"[ēêɛ]", "e"},
	{"[ōô]", "o"},
	{"[ḗế]", "é"},
	{"[ṓố]", "ó"},

	{"(" .. word_start .. "[^áéíóú^]-[aeiouɔ])(" .. C .. "?" .. C .. "?)" .. word_end, "%1^%2"},  --module-explicit default final stress...
	--same articulation > schwa insertion
	{"([bp])([bp])", "%1e%2"},
	{"([vf])([vf])", "%1e%2"},
	{"([dt])([dt])", "%1e%2"},
	{"([DTṣ])([DTṣ])", "%1e%2"},
	{"([zs])([zs])", "%1e%2"},
	{"([Zš])([Zš])", "%1e%2"},
	{"([jC])([jC])", "%1e%2"},
	{"([gk])([gk])", "%1e%2"},
	{"(K)(K)", "%1e%2"},
	{"(r)(r)", "%1e%2"},
	{"''", "'e'"},

	--a/o, including kol
	--ɔ becomes o in the listed contexts; every remaining ɔ becomes a
	{"ɔ(" .. C .. C .. ")", "o%1"},
	{"ɔ(" .. C .. ")" .. word_end, "o%1"},
	{"(" .. word_start .. "[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"([bvkKlšd][ea][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	-- {"(m[ei][kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"(" .. word_start .. "u[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"(ha[kK])ɔ(^l" .. word_end .. ")", "%1o%2"},
	{"ɔ", "a"},

	{"(" .. word_start .. C .. C .. "?" .. V .. ")^(" .. C .. "?" .. C .. "?" .. word_end .. ")", "%1%2"},  --…reader-implicit acute accent in monosyllabic
	--stress marking
	--vowel + ^ placeholder > vowel with acute
	{"a^", "á"},
	{"e^", "é"},
	{"i^", "í"},
	{"o^", "ó"},
	{"u^", "ú"},
	--glottal stops: kept when {CV}'V,
	--i.e. dropped at word start, before a consonant and at word end
	{"(" .. word_start .. ")'", "%1"},
	{"'(" .. C .. ")", "%1"},
	{"'(" .. word_end .. ")", "%1"},
	--fake digraphs
	--an apostrophe separates s/z/c/k from a following h
	{"([szck])h", "%1'h"},
	--one char > displaying
	{"ṣ", "ts"},
	{"š", "sh"},
	{"T", "t'"},
	{"Z", "zh"},
	{"C", "ch"},
	{"D", "d'"},
	{"K", "kh"},
}


-- Character class covering the whole Hebrew accent (cantillation) range,
-- U+0591-U+05AF, instead of a hand-picked list of individual marks, so that
-- no accent found in a quotation is left behind in the transliteration.
-- Oleh (U+05AB) also lies in this range, but table c has already turned it
-- into "^" by the time this class is applied, so stress marking is unaffected.
local cantillation_marks = "[" .. U(0x0591) .. "-" .. U(0x05AF) .. "]"

-- Biblical Hebrew transliteration that still contains the meteg placeholder
-- ("+"), which export.MH_tr needs; use export.BH_tr for the final BH output.
-- @param text  Hebrew-script string with niqqud.
-- @return      the transliterated string.
function export.BH(text)
	-- Per-character substitution first: letters, niqqud and the diacritics
	-- the rules rely on are replaced by their values in table c.
	text = s(text, '.', c)
	-- Remove cantillation marks so that it works for quotes too.
	text = s(text, cantillation_marks, "")
	-- Apply the ordered rewrite rules; each rule sees the previous one's output.
	for a = 1, #b do
		text = s(text, b[a][1], b[a][2])
	end
	return text
end

-- Final Biblical Hebrew transliteration: the output of export.BH with the
-- meteg placeholders ("+") stripped.
-- @param text  Hebrew-script string with niqqud.
-- @return      the transliterated string, plus whatever else s returns.
function export.BH_tr(text)
	-- "+" is a pattern quantifier, so it is escaped to make the literal match
	-- explicit rather than depending on how a leading quantifier is parsed.
	return s(export.BH(text), "%+", "") --metegim kept for MH
end

-- Modern Hebrew transliteration.
-- @param text  Hebrew-script string with niqqud (or an acronym with gershayim).
-- @return      the transliterated string; acronyms are returned in upper case.
function export.MH_tr(text)
	-- Start from export.BH (not BH_tr) so the metegim are still present, then
	-- apply the per-character table m.
	local result = s(export.BH(text), '.', m)

	-- An acronym is a string that has a gershayim and no vowel at all.
	local is_acronym = match(result, "″") and not match(result, V)
	if is_acronym then
		-- Without niqqud there is no dagesh, so read these letters as stops.
		result = s(result, "p̄", "p")
		result = s(result, "ḇ", "b")
		result = s(result, "ḵ", "k")
	end

	-- The ordered rules of l are applied in every case.
	for _, rule in ipairs(l) do
		result = s(result, rule[1], rule[2])
	end

	if is_acronym then
		result = mw.ustring.upper(result)
	end
	return result
end

-- Entry point used by the transliteration framework.
-- @param text  the string to transliterate.
-- @param lang  language code; "he" selects Modern Hebrew, "hbo" Biblical Hebrew.
-- @param sc    script code; when missing, it is looked up from the text through
--              Module:languages.
-- @return      the transliteration; nil when the script is not "Hebr" or the
--              text contains none of the niqqud, gershayim or maqaf characters
--              tested below. Nothing is returned for any other language code.
function export.tr(text, lang, sc)
	sc = sc or require("Module:languages").getByCode(lang):findBestScript(text):getCode()

	-- Only pointed Hebrew (or text with gershayim/maqaf) can be handled.
	local transliterable = sc == "Hebr" and match(text, "[ְֱֲֳִֵֶַָׇֹֻ״־]")
	if not transliterable then
		return nil
	end

	if lang == "he" then
		return export.MH_tr(text)
	end
	if lang == "hbo" then --though useless
		return export.BH_tr(text)
	end
end

-- Template-facing helper: transliterates the first argument of the frame both
-- ways and joins the results as "<Biblical>, <Modern>".
-- @param frame  frame object whose args[1] holds the Hebrew text.
-- @return       the two transliterations separated by ", ".
function export.tr_all(frame)
	local biblical = export.BH_tr(frame.args[1])
	local modern = export.MH_tr(frame.args[1])
	return biblical .. ", " .. modern
end

--Erutuon's code for code points below

--[[
local Array = require "Module:array"
local function show_code_point_names(text)
	if not text then return "" end
	local names = Array()
	for cp in gcodepoint(text) do
		-- Remove HEBREW LETTER, HEBREW POINT, etc.
		local name = require "Module:Unicode data".lookup_name(cp)
			:gsub(
				"^HEBREW (%w+) ",
				function(type)
					if type == "ACCENT" then return "ACCENT " else return "" end
				end)
			:lower()
		names:insert(name)
	end
	return names:concat ", "
end


local old_s = s
function s(...)
	local old = ...
	local new = old_s(...)
	if old ~= new then
		mw.log(show_code_point_names(old), show_code_point_names(new), ...)
	end
	return new
end
--]]

return export