-- Jump to content
--
-- Module:Hebr-common
--
-- From Wiktionary, the free dictionary


-- Table of public functions; it is returned at the end of this module.
local export = {}

-- Shared string helpers; only `char` and `sub` are used in this module.
local m_str_utils = require("Module:string utilities")

-- Forward declaration so that makeEntryName can call makeDisplayText as a
-- local; the value is assigned right after export.makeDisplayText is defined.
local makeDisplayText -- defined below
-- Unicode normalisation helpers from the Scribunto ustring library.
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- Used below to turn a code point number into a string
-- (presumably equivalent to mw.ustring.char -- confirm in Module:string utilities).
local u = m_str_utils.char
-- Unicode-aware gsub: patterns and position captures work on code points.
local ugsub = mw.ustring.gsub
-- Used below with code point positions captured by ugsub
-- (presumably a code-point-aware substring -- confirm in Module:string utilities).
local usub = m_str_utils.sub

-- Character class, for use in mw.ustring patterns, matching a single code
-- point that this module treats as a diacritic (combining marks, joiners,
-- Hebrew accents and points). Callers append "+" to match a whole run.
local DIACRITICS = table.concat({
	"[",
	u(0x0307), u(0x0308),
	u(0x034F), -- combining grapheme joiner
	u(0x200C), -- zero width non-joiner
	u(0x200D), -- zero width joiner
	u(0x0591), "-", u(0x05BD), -- range U+0591..U+05BD
	u(0x05BF),
	u(0x05C1), u(0x05C2),
	u(0x05C4), u(0x05C5),
	u(0x05C7),
	u(0xFB1E),
	"]",
})

-- Accent code points that makeDisplayText merges when they occur as a pair.
local GERESH = u(0x059C)
local GERSHAYIM = u(0x059E) -- double geresh
local MERCHA = u(0x05A5)
local MERCHA_KEFULA = u(0x05A6) -- double mercha
-- Two code points in sequence (U+05A0 then U+05A9), replaced by KARNE_PARAH.
local TELISHA = u(0x05A0) .. u(0x05A9)
local KARNE_PARAH = u(0x059F)

-- Vowel points.
local SHEVA = u(0x05B0)
local HOLAM = u(0x05B9)
local HOLAM_HASER_FOR_VAV = u(0x05BA)

-- Prefix used by makeSortKey for selected languages.
local WIDE_ALEF = u(0xFB21)

-- Replacement table handed to gsub by makeDisplayText and makeEntryName.
-- The first four entries replace ASCII punctuation with Hebrew characters.
local substitutes = {
	["'"] = "׳",
	['"'] = "״",
	["-"] = "־",
	["|"] = "׀",
}
-- SHEVA followed by a second vowel point is replaced by one code point.
-- U+05B8 and U+05C7 both map to U+05B3.
substitutes[SHEVA .. u(0x05B6)] = u(0x05B1)
substitutes[SHEVA .. u(0x05B7)] = u(0x05B2)
substitutes[SHEVA .. u(0x05B8)] = u(0x05B3)
substitutes[SHEVA .. u(0x05C7)] = u(0x05B3)

--- Normalises Hebrew-script text for display.
-- The text is decomposed (NFD), rewritten, and recomposed (NFC):
--   1. ASCII ' " - | are replaced via `substitutes`.
--   2. SHEVA followed by one of the vowel points listed in `substitutes`
--      is replaced by the single code point given there.
--   3. GERESH GERESH -> GERSHAYIM, MERCHA MERCHA -> MERCHA_KEFULA,
--      TELISHA pair -> KARNE_PARAH.
--   4. U+05BA in a diacritic run that does not directly follow vav -> HOLAM.
-- @param text string to normalise
-- @param lang not used by this function
-- @param sc not used by this function
-- @return the normalised string
function export.makeDisplayText(text, lang, sc)
	local decomposed = toNFD(text)

	-- Byte-level replacements; none of the Hebrew byte sequences contain
	-- pattern magic characters, so they can be used as patterns directly.
	decomposed = decomposed:gsub("['\"%-|]", substitutes)
	-- \214/\215 are the UTF-8 lead bytes 0xD6/0xD7; \182-\184 and \135 are the
	-- trail bytes of U+05B6..U+05B8 and U+05C7. Byte pairs with no entry in
	-- `substitutes` look up nil and are therefore left unchanged.
	decomposed = decomposed:gsub(SHEVA .. "[\214\215][\182-\184\135]", substitutes)
	decomposed = decomposed:gsub(GERESH .. GERESH, GERSHAYIM)
	decomposed = decomposed:gsub(MERCHA .. MERCHA, MERCHA_KEFULA)
	decomposed = decomposed:gsub(TELISHA, KARNE_PARAH)

	-- Holam haser for vav (U+05BA) can only be placed on vav; otherwise, replace with holam (U+05B9).
	if decomposed:find(HOLAM_HASER_FOR_VAV, nil, true) then
		-- The callback inspects the string as it was before this substitution.
		local source = decomposed
		decomposed = ugsub(source, "()(" .. DIACRITICS .. "+)", function(start, marks)
			-- Position of the character immediately before the diacritic run.
			local before = start - 1
			if usub(source, before, before) == "ו" then
				-- Run sits on a vav: returning nothing keeps the match as is.
				return
			end
			return (marks:gsub(HOLAM_HASER_FOR_VAV, HOLAM))
		end)
	end

	return toNFC(decomposed)
end
makeDisplayText = export.makeDisplayText

-- Full language codes whose entry names keep their diacritics.
local retain_diacritics = {
	yi = true,
	itk = true,
	lad = true,
	lsd = true,
}

--- Builds the entry (page) name for Hebrew-script text.
-- For languages listed in `retain_diacritics` this is the display form from
-- makeDisplayText. For all others, every run of DIACRITICS is removed from the
-- NFD form, ASCII ' " - | are replaced via `substitutes`, and the result is
-- returned in NFC.
-- @param text string to convert
-- @param lang language object; only lang:getFullCode() is used here
-- @param sc passed through to makeDisplayText, otherwise unused
-- @return the entry name
function export.makeEntryName(text, lang, sc)
	if not retain_diacritics[lang:getFullCode()] then
		local stripped = ugsub(toNFD(text), DIACRITICS .. "+", "")
		stripped = stripped:gsub("['\"%-|]", substitutes)
		return toNFC(stripped)
	end

	return makeDisplayText(text, lang, sc)
end

-- Replacement table used by makeSortKey. Each key is a single character
-- matched by the character class in makeSortKey; the value is what appears in
-- the sort key instead. Substitution is single-pass, so a value is never
-- substituted again: every value must already be in its final folded form.
local sortkey_substitutes = {
	-- ASCII punctuation -> Hebrew punctuation.
	["'"] = "׳",
	['"'] = "״",
	["-"] = "־",
	["|"] = "׀",
	-- Final letter forms -> base letters.
	["ך"] = "כ",
	["ם"] = "מ",
	["ן"] = "נ",
	["ף"] = "פ",
	["ץ"] = "צ",
	-- U+05EF..U+05F2: expanded to letter sequences.
	["ׯ"] = "ו" .. u(0xF000),
	["װ"] = "וו",
	["ױ"] = "וי",
	["ײ"] = "יי",
	-- Letterlike symbols U+2135..U+2138 -> plain letters.
	["ℵ"] = "א",
	["ℶ"] = "ב",
	["ℷ"] = "ג",
	["ℸ"] = "ד",
	-- Presentation forms U+FB20..U+FB29 and U+FB4F -> plain letters.
	["ﬠ"] = "ע",
	["ﬡ"] = "א",
	["ﬢ"] = "ד",
	["ﬣ"] = "ה",
	["ﬤ"] = "כ",
	["ﬥ"] = "ל",
	-- Wide final mem folds to the base letter, the same result as plain final
	-- mem above; mapping it to "ם" would leave a final form in the sort key.
	["ﬦ"] = "מ",
	["ﬧ"] = "ר",
	["ﬨ"] = "ת",
	["﬩"] = "+",
	["ﭏ"] = "אל",
}

-- Full language codes whose sort keys are prefixed with U+FB21 HEBREW LETTER
-- WIDE ALEF, so that they sort after Arabic script titles.
local sort_after_wide_alef = {
	ar = true,
	fa = true,
	ur = true,
	shi = true,
}

--- Builds the sort key for Hebrew-script text.
-- Every run of DIACRITICS is removed from the NFD form, the characters listed
-- in `sortkey_substitutes` are replaced, WIDE_ALEF is prepended for languages
-- in `sort_after_wide_alef`, and the result is returned in NFC.
-- @param text string to convert
-- @param lang language object; only lang:getFullCode() is used here
-- @param sc not used by this function
-- @return the sort key
function export.makeSortKey(text, lang, sc)
	local key = ugsub(toNFD(text), DIACRITICS .. "+", "")
	key = ugsub(key, "['\"%-|ךםןףץׯ-ײℵ-ℸﬠ-﬩ﭏ]", sortkey_substitutes)

	-- The prefix is added after substitution, so it is not itself replaced.
	local prefix = sort_after_wide_alef[lang:getFullCode()] and WIDE_ALEF or ""

	return toNFC(prefix .. key)
end

return export