Jump to content

Module:pa-Arab-translit/sandbox

From Wiktionary, the free dictionary

Issues

[edit]
  • Bari ye isn't working with anything other than an alif
  • Middle y paired with zer should return "īy[..]"
  • Middle y stand-alone or paired with a jazm should return "ey[..]"
  • دُھواں should return "dhūāṉ", not "dhūvāṉ"
  • Hamza should be transliterated as ['], in contrast to the ain [ʻ]

-- Module setup: U builds a string from code points, gsub is the
-- Unicode-aware substitution used throughout, export is the module table.
local U = require("Module:string/char")
local gsub = mw.ustring.gsub
local export = {}

-- Individual combining marks and control characters, named by their
-- Urdu/Punjabi terms (Unicode names given for reference).
local fatHataan = U(0x64B) -- U+064B ARABIC FATHATAN
local zabar = U(0x64E) -- U+064E ARABIC FATHA
local zer = U(0x650) -- U+0650 ARABIC KASRA
local pesh = U(0x64F) -- U+064F ARABIC DAMMA
local zwnj = U(0x200C) -- Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654 ARABIC HAMZA ABOVE
local tashdid = U(0x651) -- U+0651 ARABIC SHADDA; also called shadda
local jazm = "ْ" -- presumably U+0652 ARABIC SUKUN, written as a literal; TODO confirm
local he = "ہ"
local ghunna = U(0x658) -- U+0658 ARABIC MARK NOON GHUNNA
local dagger_alif = U(0x670) -- U+0670 ARABIC LETTER SUPERSCRIPT ALEF

-- Character sets that are spliced into "[...]" pattern classes below.
-- The three consonant sets differ only in membership:
--   consonants  includes ؤ and ئ but not ی or و
--   consonantS  has neither ؤ nor ئ (not referenced in the code shown here)
--   consonantS2 additionally includes the semivowels ی and و
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤڷہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگڷلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇڷمنݨوؤہھئٹڈڑ" 
local semivowel = "یو"
local vowels = "āایئےۓوؤ" -- not referenced in the code shown here
local indvowels = "آایےوؤ"
local hes = "ہح" -- not referenced in the code shown here
local diacritics = "َُِّْٰ"
-- presumably the three short-vowel marks (zabar, zer, pesh); TODO confirm
local ZZP = "َُِ"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark

-- Letters used by the has_diacritics rules to build the left/right
-- consonant classes.
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکڷگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- whitespace, apostrophe and double quote, as pattern-class content
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. "]"
-- Single-character lookup table: export.tr passes it to gsub(text, '.', mapping)
-- so every character that is a key here is replaced by its value, and every
-- other character is left untouched.
-- Not all letters here are used by Urdu.
-- Each key must appear only once; the Urdu full stop "۔" used to be listed
-- both among the letters and among the punctuation (with the same value).
local mapping = {
	-- letters
	["آ"] = 'ā', ["ب"] = 'b', ["ٻ"] = 'ḇ', ["پ"] = 'p', ["ت"] = 't', ["ٹ"] = 'ṭ', ["ث"] = 's̱',
	["ج"] = 'j', ["ڄ"] = 'ǰ', ["چ"] = 'c', ["ح"] = 'ḥ', ["خ"] = 'x',
	["د"] = 'd', ["ڈ"] = 'ḍ', ["ݙ"] = 'ḏ', ["ذ"] = 'ẕ', ["ر"] = 'r', ['ڑ'] = "ṛ", ["ز"] = 'z', ["ژ"] = 'ž',
	["س"] = 's', ["ش"] = 'ś', ["ص"] = 'ṣ', ["ض"] = 'ẓ',
	["ط"] = 't̤', ["ظ"] = 'z̤', ["ع"] = 'ʻ', ["غ"] = 'ġ', ["ف"] = 'f', ["ق"] = 'q',
	["ک"] = 'k', ["گ"] = 'g', ["ڳ"] = 'g̈', ["ݨ"] = 'ṇ', ["ࣇ"] = 'ḷ',
	["ل"] = 'l', ["م"] = 'm', ["ن"] = 'n', ["و"] = 'v', ["ہ"] = 'h', ["ی"] = 'y', ["ں"] = 'ṉ',

	["ھ"] = "h",
	["أ"] = '',

	-- diacritics
	["٘"] = 'ṉ',
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- numerals
	["۱"] = "1", ["۲"] = "2", ["۳"] = "3", ["۴"] = "4", ["۵"] = "5",
	["۶"] = "6", ["۷"] = "7", ["۸"] = "8", ["۹"] = "9", ["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["۔"] = ".", -- period
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = '“', -- quotation mark
	["»"] = '”', -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousand

	-- letters/marks rendered as a hyphenated "ye"/"yi" syllable
	["ۓ"] = "-ye",
	[highhmz] = "-yi",
}

-- Pattern-class content (already escaped with % where needed): punctuation
-- that export.tr treats as a word boundary, and the Eastern Arabic digits.
local punctuation = "%-:%(%)%[%]*&٫؛؟،ـ«\".\'!»٪؉۔"
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name in the substitution rules.
local ain = 'ع'
local alif = 'ا'
local ye = 'ی'
local ye2 = 'ئ'
local ye3 = "ے"
local vao = "و"
local aspirate = 'ھ'
-- highhmz (U+0654) is already declared with the other diacritics near the
-- top of the file; it is intentionally not redeclared here.
local aiu = "āīūآ"
local n_exceptions = "[^" .. aiu .. "]" -- for nasalization exceptions

-- Ordered list of {pattern, replacement} pairs consumed by has_diacritics().
-- The rules are applied one after another with gsub; has_diacritics() then
-- reports whether anything is left. Order matters: several rules state
-- explicitly that they must run before or after others.
-- NOTE(review): in several patterns below the "?" sits inside the bracket
-- class (e.g. "[" .. ZZP .. diacritics .. "?]"), where it is a literal member
-- of the set rather than an "optional" quantifier -- confirm this is intended.
local has_diacritics_subs = {
	-- NOTE(review): the original comment here read "remove arabic ye (ruins
	-- conversions)", but the rules delete lam-lam-he sequences and "ۃ".
	{"لل" ..  he , ""},
	{"لل" .. tashdid ..  he , ""},
	{"لل" .. tashdid .. dagger_alif ..  he , ""},
	{"ۃ" , ""},
	-- aspirated consonants should count as 1 consonant not two
	{"([" .. consonants .. "][".. ZZP .. diacritics .. "?])" ..  aspirate , "%1"},
	{"([" .. consonants .. "])" ..  aspirate , "%1"},
	{ aspirate , ""},
	-- remove punctuation and tashdid
	{"[" .. punctuation .. tashdid .. highhmz .. zwnj .. numbers .. "]", ""},
	-- noon gunna and silent consonants can be removed
	-- NOTE(review): the first and third string literals on the next line contain
	-- ".." and spaces INSIDE the quotes (".. [" and "?] .. (["), so the pattern
	-- requires literal spaces and arbitrary characters around the class. This
	-- looks like misplaced quotes around a concatenation -- confirm the intent.
	{ ".. [".. ZZP .. indvowels .. diacritics .. "?] .. ([" .. consonantS2 .. "])" .. "([".. ghunna .. jazm .."])" .. "([" .. consonantS2 .. "])"  , ""},
	{ "([" .. consonants .. "])" .. ghunna , ""},
	{ "([" .. consonantS2 .. "])" .. jazm , ""},
	{ "([" .. consonantS2 .. "])" .. "یٰ" , ""},
	-- must go before removing final consonants
	{"[".. ZZP .. diacritics .. "]" .. alif , alif },
	{fatHataan , "" },
	{ "([" .. consonantS2 .. "])" .. "[" .. ZZP .. diacritics .. indvowels .. "?]" .. "([ںۓۂۂ])", "" },
	{ "([ںۓۂۂ])", "" },
	{ "([" .. ye .. alif .. "])" .. dagger_alif, alif},
	{ dagger_alif .. ye , alif},
	{ alif .. "[".. ZZP .. diacritics .. "]" , ""},
	{ "[".. ZZP .. diacritics .. "]" .. alif , alif},
	{ dagger_alif .. "([" .. ye .. alif .. "])", alif},
	-- Remove consonants at end of word or utterance, so that we're OK with
	-- words lacking iʿrāb (must go before removing other consonants).
	-- If you want to catch places without iʿrāb, comment out the next line.
	{"[" .. lconsonants .. "]$", ""},
	-- closed consonants
	{"([" .. consonantS2 .. "])[" .. indvowels .. ZZP .. "]", ""},
	-- remove consonants (or alif) when followed by diacritics
	-- must go after removing tashdid
	-- do not remove the diacritics yet because we need them to handle
	-- long-vowel sequences of diacritic + pseudo-consonant
	{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "])", "%1"},
	-- the following rules must go after removing consonants w/diacritics
	-- (the original comment gave a reason but was cut off mid-sentence)
	{"([" .. rconsonants .. "])([".. ZZP .. diacritics .. "?][" .. indvowels .. "?])([" .. consonantS2 .. "])", ""},
	{"[" .. indvowels .. "]([" .. rconsonants .. "])", ""},
	{"[".. ZZP .. diacritics .. "]([" .. lconsonants .. "])", ""},
	{"([" .. consonants .. "])[" .. indvowels .. ZZP .. diacritics .. "]", ""},
	{"([" .. rconsonants .. "])(" .. space_like_class .. ")", ""},
	{"[" .. lconsonants .. "]" .. zabar .. "[".. ye .. ye3 .. vao .. "]", ""},
	-- remove vao (vaw) when it directly follows a consonant, and ؤ
	{ "[" .. lconsonants .. "]" .. vao, ""},
	{"ؤ" .. pesh , ""},
	{"ؤ", ""},
	-- remove ye
	{ "[" .. lconsonants .. "]" .. ye, ""},
	{ye3, ""},
	{"([" .. consonants .. "][" .. ZZP .. "])" .. he,""},
	-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
	{"[" .. fatHataan .. zabar .. "][" .. alif .. ye .. "]", ""},
	-- remove diacritics and independent vowels
	{"[" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "]", ""},
	{ "[" .. indvowels .. "]" , ""},
	{ "[".. semivowel .."]" .. "[" .. indvowels .. "]" , ""},
	-- remove numbers, hamzatu l-waṣl, alif madda
	{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
	-- finally drop all whitespace
	{"%s", ""},
}

-- Runs every rule in has_diacritics_subs over `text` (after stripping
-- left-to-right / right-to-left marks) and returns true when nothing is
-- left, i.e. when the rules were able to account for every character.
-- Presence of an LRM/RLM is reported through Module:debug tracking.
-- NOTE(review): the tracking key still says "ur-translit" although this
-- module is pa-Arab-translit -- confirm whether that is deliberate.
local function has_diacritics(text)
	local remaining, marks_removed = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if marks_removed > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		-- extra parentheses discard gsub's second result (the match count)
		remaining = (gsub(remaining, rule[1], rule[2]))
	end
	return remaining == ""
end

-- Transliterates `text` into Latin script.
--
-- The function is a long, strictly ordered pipeline of substitutions:
--   1. mark word boundaries with "#",
--   2. rewrite whole-word exceptions and normalise some characters,
--   3. expand tashdid (gemination) and reorder marks around the aspirate,
--   4. resolve alif, vao and ye into vowels or semivowels using the
--      neighbouring diacritics,
--   5. handle word-final he and izafa,
--   6. drop the "#" markers and map every remaining character via `mapping`,
--   7. apply final Latin-side clean-ups and NFC-normalise.
-- Because later rules depend on the output of earlier ones, statements must
-- not be reordered.
--
-- @param text  string to transliterate
-- @param lang  accepted but not used in this function
-- @param sc    accepted but not used in this function
-- @return      the transliterated string, NFC-normalised
function export.tr(text, lang, sc)
	
	--define the "end" of a word
	-- protect literal "#" first, since "#" is used as the boundary marker
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "\n" , "#".."\n" .. "#")
	text = gsub(text, "(["..punctuation.."])" , "#".."%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, zwnj, "#"..zwnj.."#")
	-- hashtags now mark the beginning and end of a word
	
	--exceptions
	-- whole words (delimited by "#") that get a fixed reading
	text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
	text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
	text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
	text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
	text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
	text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")
	text = gsub(text, "ن٘", "ṉ")
	
	--character reformatting
	--to make an exceptions for a word, put hashtags on both sides
	text = gsub(text, "ۂ", he .. highhmz)
	text = gsub(text, highhmz, "#"..highhmz.."#")
	--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
	text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
	text = gsub(text, 'ٰ', "ā")
	text = gsub(text, 'ا' .. fatHataan, "an")
	text = gsub(text, 'لا', "ﻻ")
	-- normalise "ة" to "ۃ"; before a short vowel or jazm it becomes "ت",
	-- otherwise it becomes he
	text = gsub(text, "ة" 	, "ۃ")
	text = gsub(text, "ۃ" .. "([" .. ZZP .. jazm .. "])", "ت%1")
	text = gsub(text, "ۃ" , he)
	
	-- Tashdeed
	-- a letter carrying tashdid is written twice
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid, "%1%1")
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid .. '([' .. ZZP .. '])', "%1%1%2")
	-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZP .. '])' .. tashdid, "%1%1%2")
	-- move a short vowel from before the aspirate to after it
	text = gsub(text, '([' .. ZZP .. '])' .. aspirate, aspirate.."%1") 
	-- NOTE(review): the pattern on the next line has no capture group, yet the
	-- replacement uses "%1"; a dagger alif has presumably already been turned
	-- into "ā" by the character-reformatting step above, so this may never
	-- match -- confirm.
	text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
	-- geminated ye / vao are emitted directly as Latin "yy" / "vv"
	text = gsub(text, ye .. '([' .. ZZP .. '])' .. tashdid, "yy%1")
	text = gsub(text,  vao .. '([' .. ZZP .. '])' .. tashdid, "vv%1")
	text = gsub(text, ye .. tashdid .. '([' .. ZZP .. '])', "yy%1")
	text = gsub(text, vao .. tashdid .. '([' .. ZZP .. '])', "vv%1")
	

    -- alif directly after a consonant letter (no diacritic between) is "ā"
    text = gsub(text, "(["..consonantS2.."])" .. alif, "%1ā") 
    --alifs paired to a consonant are a vowel
    text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
    text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
    text = gsub(text, "(["..consonantS2.."])" .. "آ", "%1'ā") 
    	text = gsub(text, pesh .. vao .. zabar .. alif , "ūā" )
    text = gsub(text, zabar .. alif, "ā")
    -- after any other diacritic the alif itself is dropped
    text = gsub(text, "(["..diacritics.."])" .. alif, "%1")
    text = gsub(text, "(["..ZZP.."])" .. alif, "%1")
    --alifs not paired to a consonant are a glottal stop (not shown currently)
    text = gsub(text, alif.."(["..diacritics.."])".. "(["..consonantS2.."])", "%1%2")
    text = gsub(text, alif..ye.."#", "ī")
    text = gsub(text, alif..ye, "e")
    text = gsub(text, alif..ye3, "e")
    text = gsub(text, alif..zabar..ye3, "ai")
    text = gsub(text, alif..vao, "o")
    text = gsub(text, alif..zer..ye, "ī")
    text = gsub(text, alif..pesh..vao, "ū")
    text = gsub(text, alif.."(["..diacritics.."])", "%1")
    
    
    -- convert semi vowels
    -- vao / ye that carry their own diacritic, or precede "ā", ye3 or another
    -- semivowel, are consonantal (v / y); in a run of three semivowels the
    -- first one is read as the vowel o / e
    text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
    text = gsub(text, ye.. "(["..diacritics..ZZP.."])", "y%1")
    text = gsub(text, ye .. "ā", "yā")
    text = gsub(text, vao.. "ā", "vā")
    text = gsub(text, ye .. "(["..zabar.."]?)" .. ye3, "y%1"..ye3.."")
    text = gsub(text, vao .. "(["..zabar.."]?)" .. ye3, "v%1"..ye3.."")
    text = gsub(text, ye .. "(["..semivowel.."])(["..semivowel.."])", "e%1%2")
    text = gsub(text, vao .. "(["..semivowel.."])(["..semivowel.."])", "o%1%2")
    text = gsub(text, ye .. "(["..semivowel.."])", "y%1")
    text = gsub(text, vao .. "(["..semivowel.."])", "v%1")
    
    -- conversions for vaav/vaw/vao
    text = gsub(text, pesh.. vao, "ū")
    text = gsub(text, zabar .. vao, "au")
    text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
    text = gsub(text, "(["..diacritics..ZZP.."])" .. vao, "%1v")
    -- conversions for ye
    text = gsub(text, zer.. ye, "ī")
    text = gsub(text, ye .. "#", "ī#")
    text = gsub(text, zabar.. ye, "ai")
    text = gsub(text, zabar.. ye3, "ai")
    text = gsub(text, ye .. "(["..diacritics..ZZP.."])", "y%1")
    text = gsub(text, "(["..diacritics..ZZP.."])" .. ye , "%1y")
    
    -- final he and izafa/ezafe
    -- a word-final zer is written as a hyphenated "-i" / "-yi" suffix;
    -- a word-final he after a short vowel is dropped
    text = gsub(text, "e" .. zer .. "#", "e-yi#")
    text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
    text = gsub(text, "y" .. zer .. "#", "-yi#")
    text = gsub(text, zer .. "#", "-i#")
    text = gsub(text, "(["..ZZP.."])" .. he .. "#" .. zwnj, "%1-")
    text = gsub(text, "(["..ZZP.."])" .. he .. "#", "%1#")
    text = gsub(text, zabar .. he .. "#", "a#")
    
    -- get rid of hashtags (not needed)
    -- and restore the literal "#" characters protected at the start
    text = gsub(text, "#", "")
    text = gsub(text, "HASHTAG", "#")
    -- these two calls use the byte-oriented string.gsub, not the
    -- mw.ustring.gsub alias used everywhere else in this function
    text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
    -- convert all characters
    -- each character that is a key of `mapping` is replaced by its value
    text = gsub(text, '.', mapping)
    
    -- vowel fixes
	
	-- alif
	-- Final corrections
	-- collapse doubled letters / vowel sequences produced by the rules above
	text = gsub(text, "hh", "h")
	text = gsub(text, "lll", "ll")
	text = gsub(text, "āa", "ā")
	text = gsub(text, "aaa", "ā")
	text = gsub(text, "āā", "ā")
	text = gsub(text, "aa", "ā")
	
	--now get rid of the zero consonants
	text = gsub(text, "ئ", "")
	text = gsub(text, "u" .. "ؤ" , "u")
	text = gsub(text, "ؤ" .. "u" .. "$", "ū")  -- ؤُ is rendered 'ū' word-finally, short 'u' otherwise
	text = gsub(text, "ؤ" .. "u" .. "([ ,.;?!-])", "ū%1")
	text = gsub(text, "ؤ" .. "u" , "u")
	text = gsub(text, "ؤ", "o")
	
	text = mw.ustring.toNFC(text)
	
	return text
end

return export