Module:grc-translit/sandbox

The following documentation is located at Module:grc-translit/sandbox/documentation. Categories were auto-generated by Module:module categorization.

Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)

2 of 35 tests failed. (refresh)

test_links:
Text	Expected	Actual
ΛΌΓΟΣ	LÓGOS	LÓGOS
ΟἿΑΙ	HOÎAI	HoîAi
ῬΉΤΩΡ	RHḖTŌR	RhḖTŌR
λόγος	lógos	lógos
σφίγξ	sphínx	sphínx
ϝάναξ	wánax	wánax
οἷαι	hoîai	hoîai
ταῦρος	taûros	taûros
νηῦς	nēûs	nēûs
σῦς	sûs	sûs
ὗς	hûs	hûs
γυῖον	guîon	guîon
ἀναῡ̈τέω	anaṻtéō	anaṻtéō
δαΐφρων	daḯphrōn	daḯphrōn
τῶν	tôn	tôn
τοὶ	toì	toì
τῷ	tôi	tôi
τούτῳ	toútōi	toútōi
σοφίᾳ	sophíāi	sophíāi
μᾱ̆νός	mānós	mānós
ὁ	ho	ho
οἱ	hoi	hoi
εὕρισκε	heúriske	heúriske
ὑϊκός	huïkós	huïkós
πυρρός	purrhós	purrhós
ῥέω	rhéō	rhéō
σάἁμον	sáhamon	sáhamon
Ὀδυσσεύς	Odusseús	Odusseús
Εἵλως	Heílōs	Heílōs
ᾍδης	Hā́idēs	Hā́idēs
ἡ Ἑλήνη	hē Helḗnē	hē Helḗnē
𐠠𐠒𐠯𐠗	pi-lo-ti-mo	pi-lo-ti-mo
ἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;	ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?	ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?
τί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;	tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?	tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?
τούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.	toútōn phōnḗenta mén estin heptá; a e ē i o u ō.	toútōn phōnḗenta mén estin heptá; a e ē i o u ō.

local export = {}

local m_data = require('Module:grc-utilities/data')
local tokenize = require('Module:grc-utilities').tokenize

local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local U = mw.ustring.char
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper

-- Byte-level Lua pattern that matches one UTF-8 encoded character: either an
-- ASCII byte or a multi-byte lead byte (194-244), followed by any number of
-- continuation bytes (128-191).
-- Can't do range with null byte apparently.
local UTF8char = '[\1-\127\194-\244][\128-\191]*'

-- Diacritics
-- Diacritic strings keyed by name, taken from Module:grc-utilities/data.
-- NOTE(review): presumably these are single combining characters — confirm
-- against the data module.
local diacritics = m_data.named

-- Greek
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local diaeresis = diacritics.diaeresis
local smooth = diacritics.smooth
local rough = diacritics.rough
local macron = diacritics.macron
local breve = diacritics.breve
local subscript = diacritics.subscript

-- Latin
local hat = diacritics.Latin_circum

-- Pattern: a macron, an optional diaeresis, then the Latin circumflex.
-- Despite its name, export.tr uses it to detect a transliterated vowel that
-- carries both a macron and a circumflex, so that the macron can be removed.
local macron_diaeresis = macron .. diaeresis .. "?" .. hat
-- Byte pattern for alpha: α is the bytes 206 177 and Α is 206 145 in UTF-8,
-- so this works with the non-Unicode-aware string library.
-- equivalent to '[αΑ]'
local alpha = '\206[\177\145]'
-- Matches a token that starts with alpha and ends with the subscript diacritic
-- (the token ᾳ, per the comment in export.tr).
local a_subscript = '^' .. alpha .. '.*' .. subscript .. '$'
-- Set of tokens before which export.tr transliterates γ as <n>.
local is_velar = {
	['κ'] = true,
	['γ'] = true,
	['χ'] = true,
	['ξ'] = true,
}

-- Transliteration table: maps a single lowercase Greek letter or a diacritic
-- to its Latin replacement. export.tr passes it as the replacement argument of
-- gsub, so every UTF-8 character of a lowercased token is looked up here and
-- characters without an entry are left unchanged.
local tt = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e" .. macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
	["ω"] = "o" .. macron,

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",
	
	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",
	
	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- Breve and smooth breathing are dropped. Rough breathing is dropped here
	-- too; export.tr adds the corresponding "h" separately.
	[breve] = '',
	[smooth] = '',
	[rough] = '',
	-- The Greek circumflex becomes the Latin circumflex.
	[circumflex] = hat,
	-- The subscript diacritic is written out as a full "i".
	[subscript] = 'i',
}

-- Reports whether `s` contains at least one uppercase character, i.e. whether
-- lowercasing changes it. A missing (nil) neighbouring token counts as "no".
local function has_uppercase(s)
	return s ~= nil and s ~= ulower(s)
end

--[[
	Transliterates Ancient Greek text into the Latin alphabet.

	Parameters:
		text  the string to transliterate.
		lang  language code; only forwarded to Module:Cprt-translit.
		sc    script code; "Cprt" forwards the whole call to Module:Cprt-translit.

	Returns the transliteration as a single string.

	The text is split into tokens by Module:grc-utilities' tokenize; each token
	is lowercased, mapped character by character through `tt`, adjusted for
	context (γ before a velar, ρ after ρ, ᾳ, rough breathing, macron plus
	circumflex) and finally re-capitalized.

	Capitalization: a token that contains an uppercase letter normally gets only
	the first character of its transliteration capitalized (Ἑλήνη -> Helḗnē).
	When the token is entirely uppercase and belongs to an all-caps run -- it
	holds a second uppercase letter itself, or an adjacent token is uppercase --
	the whole transliteration is uppercased, so that digraphs and the "h" of a
	rough breathing follow suit (ῬΉΤΩΡ -> RHḖTŌR, ΟἿΑΙ -> HOÎAI).
]]
function export.tr(text, lang, sc)
	-- If the script is given as Cprt, then forward the transliteration to that module.
	-- This should not be necessary, as [[Module:translit-redirect]] redirects
	-- to this module only if script is polytonic.
	if sc == "Cprt" then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/grc-translit/Cprt]]
		require('Module:debug').track('grc-translit/Cprt')
		return require('Module:Cprt-translit').tr(text, lang, sc)
	end
	
	-- A lone rough-breathing sign stands for the sound /h/.
	if text == '῾' then
		return 'h'
	end
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	]]
	text = ugsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	-- This must come after the question-mark step above, or the new semicolons
	-- would be turned into question marks as well.
	text = text:gsub("·", ";")
	
	local tokens = tokenize(text)
	local first_char = '^' .. UTF8char
	
	-- Now read the tokens. ipairs (not pairs) is used because the pieces are
	-- joined in the order in which they are visited.
	local output = {}
	for i, token in ipairs(tokens) do
		local prev_token, next_token = tokens[i - 1], tokens[i + 1]
		
		-- Convert token to lowercase and substitute each character
		-- for its transliteration
		local translit = ulower(token):gsub(UTF8char, tt)
		
		if token == 'γ' and is_velar[next_token] then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif token == 'ρ' and prev_token == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif token:find(a_subscript) then
			-- add macron to ᾳ
			translit = translit:gsub('[Aa]', '%0' .. macron)
		end
		
		if token:find(rough) then
			if ufind(token, '^[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if ufind(translit, macron_diaeresis) then
			translit = translit:gsub(macron, '')
		end
		
		-- Restore capitalization.
		if has_uppercase(token) then
			-- Everything in the token after its first character.
			local rest = token:gsub(first_char, '')
			local all_caps = token == uupper(token)
				and (has_uppercase(rest)
					or has_uppercase(prev_token)
					or has_uppercase(next_token))
			if all_caps then
				-- Part of an all-caps word: uppercase digraphs and added "h" too.
				translit = uupper(translit)
			else
				-- Capitalize first character of transliteration.
				translit = translit:gsub(first_char, uupper)
			end
		end
		
		output[#output + 1] = translit
	end
	
	return table.concat(output)
end

return export