Module:bho-Kthi-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will transliterate Bhojpuri language text. It is also used to transliterate Angika, Magahi, Maithili, and Sadri. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:bho-Kthi-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Transliteration for Bhojpuri

local export = {}
local gsub = mw.ustring.gsub
local match = mw.ustring.match

local conv = {
	-- consonants
	['๐‘‚'] = 'k', ['๐‘‚Ž'] = 'kh', ['๐‘‚'] = 'g', ['๐‘‚'] = 'gh', ['๐‘‚‘'] = 'แน…', 
	['๐‘‚’'] = 'c', ['๐‘‚“'] = 'ch', ['๐‘‚”'] = 'j', ['๐‘‚•'] = 'jh', ['๐‘‚–'] = 'รฑ', 
	['๐‘‚—'] = 'แนญ', ['๐‘‚˜'] = 'แนญh', ['๐‘‚™'] = 'แธ', ['๐‘‚›'] = 'แธh', ['๐‘‚'] = 'แน‡',
	['๐‘‚ž'] = 't', ['๐‘‚Ÿ'] = 'th', ['๐‘‚ '] = 'd', ['๐‘‚ก'] = 'dh', ['๐‘‚ข'] = 'n',
	['๐‘‚ฃ'] = 'p', ['๐‘‚ค'] = 'ph', ['๐‘‚ฅ'] = 'b', ['๐‘‚ฆ'] = 'bh', ['๐‘‚ง'] = 'm', 
	['๐‘‚จ'] = 'y', ['๐‘‚ฉ'] = 'r', ['๐‘‚ช'] = 'l', ['๐‘‚ซ'] = 'v', ['๐‘‚ซ'] = 'v', ['เคณ'] = 'แธท',
	['๐‘‚ฌ'] = 'ล›', ['๐‘‚ญ'] = 'แนฃ', ['๐‘‚ฎ'] = 's', ['๐‘‚ฏ'] = 'h',
	['๐‘‚š'] = 'แน›', ['๐‘‚š'] = 'แน›', ['๐‘‚œ'] = 'แน›h', ['๐‘‚œ'] = 'แน›h',
	-- ['๐‘‚”๐‘‚น๐‘‚–'] = 'gy',

	-- vowel diacritics
	['๐‘‚ฑ'] = 'i', ['๐‘‚ณ'] = 'u', ['๐‘‚ต'] = 'e', ['๐‘‚ท'] = 'o', 
	['๐‘‚ฐ'] = 'ฤ', ['๐‘‚ฒ'] = 'ฤซ', ['๐‘‚ด'] = 'ลซ', 
	['๐‘‚ถ'] = 'ai', ['๐‘‚ธ'] = 'au',

	-- vowel signs
	['๐‘‚ƒ'] = 'a', ['๐‘‚…'] = 'i', ['๐‘‚‡'] = 'u', ['๐‘‚‰'] = 'e', ['๐‘‚‹'] = 'o',
	['๐‘‚„'] = 'ฤ', ['๐‘‚†'] = 'ฤซ', ['๐‘‚ˆ'] = 'ลซ',  
	['๐‘‚Š'] = 'ai', ['๐‘‚Œ'] = 'au', 
	
	['เฅ'] = 'om',
	
	-- chandrabindu
	['๐‘‚€'] = 'ฬƒ',
	
	-- anusvara
	['๐‘‚'] = 'แน',
	
	-- visarga
	['๐‘‚‚'] = 'แธฅ',
	
	-- virama
	['๐‘‚น'] = '',
	
	-- numerals
	['เฅฆ'] = '0', ['เฅง'] = '1', ['เฅจ'] = '2', ['เฅฉ'] = '3', ['เฅช'] = '4',
	['เฅซ'] = '5', ['เฅฌ'] = '6', ['เฅญ'] = '7', ['เฅฎ'] = '8', ['เฅฏ'] = '9',
	
	-- punctuation
	['๐‘ƒ€'] = '.', -- danda
	['๐‘ƒ'] = '.', -- double danda
	['+'] = '', -- compound separator
	
	-- abbreviation sign
	['๐‘‚ป'] = '.',
}

local nasal_assim = {
	['๐‘‚'] = '๐‘‚‘', ['๐‘‚Ž'] = '๐‘‚‘', ['๐‘‚'] = '๐‘‚‘', ['๐‘‚'] = '๐‘‚‘',
	['๐‘‚’'] = '๐‘‚–', ['๐‘‚“'] = '๐‘‚–', ['๐‘‚”'] = '๐‘‚–', ['๐‘‚•'] = '๐‘‚–',
	['๐‘‚—'] = '๐‘‚', ['๐‘‚˜'] = '๐‘‚', ['๐‘‚™'] = '๐‘‚', ['๐‘‚›'] = '๐‘‚',
	['๐‘‚ฃ'] = '๐‘‚ง', ['๐‘‚ค'] = '๐‘‚ง', ['๐‘‚ฅ'] = '๐‘‚ง', ['๐‘‚ฆ'] = '๐‘‚ง', ['๐‘‚ง'] = '๐‘‚ง',
}

local perm_cl = {
	['๐‘‚ง๐‘‚น๐‘‚ช'] = true, ['๐‘‚ซ๐‘‚น๐‘‚ช'] = true, ['๐‘‚ซ๐‘‚น๐‘‚ช'] = true, ['๐‘‚ข๐‘‚น๐‘‚ช'] = true,
	
}

local all_cons, special_cons = '๐‘‚๐‘‚Ž๐‘‚๐‘‚๐‘‚‘๐‘‚’๐‘‚“๐‘‚”๐‘‚•๐‘‚–๐‘‚—๐‘‚˜๐‘‚™๐‘‚š๐‘‚›๐‘‚œ๐‘‚ž๐‘‚Ÿ๐‘‚ ๐‘‚ก๐‘‚ฃ๐‘‚ค๐‘‚ฅ๐‘‚ฆ๐‘‚ฌ๐‘‚ญ๐‘‚ฎ๐‘‚จ๐‘‚ฉ๐‘‚ช๐‘‚ซ๐‘‚ฏ๐‘‚๐‘‚ข๐‘‚ง', '๐‘‚จ๐‘‚ฉ๐‘‚ช๐‘‚ฅ๐‘‚ซ๐‘‚ฏ๐‘‚ข๐‘‚ง'
local vowel, vowel_sign = 'a๐‘‚ฐ๐‘‚ฑ๐‘‚ฒ๐‘‚ณ๐‘‚ด๐‘‚ต๐‘‚ถ๐‘‚ท๐‘‚ธ', '๐‘‚ƒ๐‘‚„๐‘‚…๐‘‚†๐‘‚‡๐‘‚ˆ๐‘‚‰๐‘‚Š๐‘‚‹๐‘‚Œ'
local syncope_pattern = '([' .. vowel .. vowel_sign .. '])(๐‘‚บ?[' .. all_cons .. '])a(๐‘‚บ?[' .. gsub(all_cons, "๐‘‚จ", "") .. '])([๐‘‚๐‘‚€]?[' .. vowel .. vowel_sign .. '])'

local function rev_string(text)
	local result, length = {}, mw.ustring.len(text)
	for i = 1, length do
		table.insert(result, mw.ustring.sub(text, length - i + 1, length - i + 1))
	end
	return table.concat(result)
end

function export.tr(text, lang, sc)
	text = gsub(
		text,
		'([' .. all_cons .. ']๐‘‚บ?)([' .. vowel .. '๐‘‚น]?)',
		function(c, d)
			return c .. (d == "" and 'a' or d)
		end
	)

	for word in mw.ustring.gmatch(text, "[๐‘‚€-๐‘ƒa]+") do
		local orig_word = word
		
		word = rev_string(word)
		
		word = gsub(
			word,
			'^a(๐‘‚บ?)([' .. all_cons .. '])(.)(.?)',
			function(opt, first, second, third)
				local a = ""
				if match(first, '[' .. special_cons .. ']')
					and match(second, '๐‘‚น')
					and not perm_cl[first..second..third]
					or match(first .. second, '๐‘‚จ[๐‘‚ฒ๐‘‚ต๐‘‚ถ]') then
						a = "a"
				end
				
				return a .. opt .. first .. second .. third
			end
		)
		
		while match(word, syncope_pattern) do
			word = gsub(word, syncope_pattern, '%1%2%3%4')
		end
		
		word = gsub(
			word,
			'(.?)๐‘‚(.)',
			function(succ, prev)
				local mid = nasal_assim[succ] or "n"
				if succ..prev == "a" then
					mid = "๐‘‚บ๐‘‚ง" 
				elseif succ == "" and match(prev, '[' .. vowel .. ']') then
					mid = "ฬƒ" 
				end
				return succ .. mid .. prev
			end
		)
		
		text = gsub(text, orig_word, rev_string(word))
	end
	text = gsub(text, '.๐‘‚บ?', conv)
	text = gsub(text, 'a([iu])ฬƒ', 'aอ %1')
    text = gsub(text, '๐‘‚”๐‘‚น๐‘‚–', conv)
    text = gsub(text, '%*', 'a')

	return mw.ustring.toNFC(text)
end

return export