Module:pa-Arab-translit/sandbox
Jump to navigation
Jump to search
- The following documentation is located at Module:pa-Arab-translit/sandbox/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)
Issues
- Bari ye (ے) is only handled correctly after an alif; after any other letter it does not work.
- A medial y paired with zer should return "īy[..]".
- A medial y that stands alone or is paired with a jazm should return "ey[..]".
- دُھواں should return "dhūāṉ", not "dhūvāṉ".
- Hamza should be transliterated as ['], in contrast to ain, which is [ʻ].
-- U is called below with code points to build the corresponding characters.
local U = require("Module:string/char")
-- Unicode-aware gsub from Scribunto; used for almost every substitution in this file.
local gsub = mw.ustring.gsub
-- Module table; returned at the end of the file.
local export = {}
-- Individual combining marks, named so they can be spliced into patterns.
local fatHataan = U(0x64B) -- U+064B
local zabar = U(0x64E) -- U+064E
local zer = U(0x650) -- U+0650
local pesh = U(0x64F) -- U+064F
local zwnj = U(0x200C) -- Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654, hamza above
local tashdid = U(0x651) -- also called shadda
local jazm = "ْ"
local he = "ہ"
local ghunna = U(0x658)
local dagger_alif = U(0x670)
-- Letter sets used inside [...] pattern classes. The three consonant sets
-- differ in which extra letters they contain; consonantS2 is the only one of
-- them that also lists ی and و.
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤڷہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگڷلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇڷمنݨوؤہھئٹڈڑ"
local semivowel = "یو"
local vowels = "āایئےۓوؤ"
local indvowels = "آایےوؤ"
local hes = "ہح"
-- Combining marks treated as diacritics; ZZP is the short-vowel subset
-- (presumably named after zabar, zer and pesh).
local diacritics = "َُِّْٰ"
local ZZP = "َُِ"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکڷگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- Contents of a pattern class matching whitespace, apostrophe and double quote.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. "]"
-- Single-character transliteration table. export.tr applies it with
-- gsub(text, '.', mapping) after all contextual rules have run; a character
-- with no entry here is left in the output unchanged.
-- Each key appears exactly once (the full stop ۔ is listed under punctuation).
-- not all letters here are used by urdu
local mapping = {
	-- letters
	["آ"] = 'ā', ["ب"] = 'b', ["ٻ"] = 'ḇ', ["پ"] = 'p', ["ت"] = 't', ["ٹ"] = 'ṭ', ["ث"] = 's̱',
	["ج"] = 'j', ["ڄ"] = 'ǰ', ["چ"] = 'c', ["ح"] = 'ḥ', ["خ"] = 'x',
	["د"] = 'd', ["ڈ"] = 'ḍ', ["ݙ"] = 'ḏ', ["ذ"] = 'ẕ', ["ر"] = 'r', ['ڑ'] = "ṛ", ["ز"] = 'z', ["ژ"] = 'ž',
	["س"] = 's', ["ش"] = 'ś', ["ص"] = 'ṣ', ["ض"] = 'ẓ',
	["ط"] = 't̤', ["ظ"] = 'z̤', ["ع"] = 'ʻ', ["غ"] = 'ġ', ["ف"] = 'f', ["ق"] = 'q',
	["ک"] = 'k', ["گ"] = 'g', ["ڳ"] = 'g̈', ["ݨ"] = 'ṇ', ["ࣇ"] = 'ḷ',
	["ل"] = 'l', ["م"] = 'm', ["ن"] = 'n', ["و"] = 'v', ["ہ"] = 'h', ["ی"] = 'y', ["ں"] = 'ṉ',
	["ھ"] = "h",
	["أ"] = '',
	-- diacritics
	["٘"] = 'ṉ',
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",
	-- kashida
	["ـ"] = "-", -- kashida, no sound
	-- numerals
	["۱"] = "1", ["۲"] = "2", ["۳"] = "3", ["۴"] = "4", ["۵"] = "5",
	["۶"] = "6", ["۷"] = "7", ["۸"] = "8", ["۹"] = "9", ["۰"] = "0",
	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["۔"] = ".", -- period
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = '“', -- quotation mark
	["»"] = '”', -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousand
	-- bari ye with hamza, and hamza above
	["ۓ"] = "-ye",
	[highhmz] = "-yi",
}
-- Contents of a pattern class listing every punctuation character that
-- export.tr treats as a word boundary (magic characters are %-escaped).
local punctuation = "%-:%(%)%[%]*&٫؛؟،ـ«\".\'!»٪؉۔"
-- Extended Arabic-Indic digits.
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Individual letters referenced by name in the rules below.
local ain = 'ع'
local alif = 'ا'
local ye = 'ی'
local ye2 = 'ئ'
local ye3 = "ے"
local vao = "و"
local aspirate = 'ھ'
-- highhmz is not redeclared here: it is already defined near the top of the
-- file with the same value, U(0x654).
local aiu = "āīūآ"
local n_exceptions = "[^" .. aiu .. "]" -- for nasalization exceptions
-- Ordered list of {pattern, replacement} pairs consumed by has_diacritics,
-- which applies them one after another with gsub and then checks whether the
-- text has been reduced to the empty string. Order matters: several rules
-- rely on earlier ones having already removed or rewritten characters.
local has_diacritics_subs = {
-- strip the "لل" + he sequences (optionally with tashdid / dagger alif in
-- between) and the letter ۃ
{"لل" .. he , ""},
{"لل" .. tashdid .. he , ""},
{"لل" .. tashdid .. dagger_alif .. he , ""},
{"ۃ" , ""},
-- aspirated consonants should count as 1 consonant not two
-- NOTE(review): in the first rule the "?" sits inside the bracket class, so it
-- is a literal question mark rather than an "optional" quantifier; the same
-- applies to the other "?]" classes further down — confirm intent.
{"([" .. consonants .. "][".. ZZP .. diacritics .. "?])" .. aspirate , "%1"},
{"([" .. consonants .. "])" .. aspirate , "%1"},
{ aspirate , ""},
-- remove punctuation, tashdid, hamza above, ZWNJ and numerals
{"[" .. punctuation .. tashdid .. highhmz .. zwnj .. numbers .. "]", ""},
-- noon ghunna and silent consonants can be removed
-- NOTE(review): in the next rule the leading ".. [" and the middle "?] .. (["
-- are string literals, so the pattern contains literal "." wildcards and
-- spaces. This looks like concatenation operators typed inside the quotes —
-- confirm the intended pattern before relying on this rule.
{ ".. [".. ZZP .. indvowels .. diacritics .. "?] .. ([" .. consonantS2 .. "])" .. "([".. ghunna .. jazm .."])" .. "([" .. consonantS2 .. "])" , ""},
{ "([" .. consonants .. "])" .. ghunna , ""},
{ "([" .. consonantS2 .. "])" .. jazm , ""},
{ "([" .. consonantS2 .. "])" .. "یٰ" , ""},
-- must go before removing final consonants
-- (a diacritic followed by alif collapses to a bare alif)
{"[".. ZZP .. diacritics .. "]" .. alif , alif },
{fatHataan , "" },
{ "([" .. consonantS2 .. "])" .. "[" .. ZZP .. diacritics .. indvowels .. "?]" .. "([ںۓۂۂ])", "" },
{ "([ںۓۂۂ])", "" },
-- normalise combinations of dagger alif with ye/alif to a bare alif
{ "([" .. ye .. alif .. "])" .. dagger_alif, alif},
{ dagger_alif .. ye , alif},
{ alif .. "[".. ZZP .. diacritics .. "]" , ""},
{ "[".. ZZP .. diacritics .. "]" .. alif , alif},
{ dagger_alif .. "([" .. ye .. alif .. "])", alif},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next rule.
{"[" .. lconsonants .. "]$", ""},
-- closed consonants
{"([" .. consonantS2 .. "])[" .. indvowels .. ZZP .. "]", ""},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing tashdid
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "])", "%1"},
-- the following rules must go after removing consonants w/diacritics
-- (the original explanation was left unfinished; presumably so that only
-- vocalic vao/ye remain by the time they are examined — TODO confirm)
{"([" .. rconsonants .. "])([".. ZZP .. diacritics .. "?][" .. indvowels .. "?])([" .. consonantS2 .. "])", ""},
{"[" .. indvowels .. "]([" .. rconsonants .. "])", ""},
{"[".. ZZP .. diacritics .. "]([" .. lconsonants .. "])", ""},
{"([" .. consonants .. "])[" .. indvowels .. ZZP .. diacritics .. "]", ""},
{"([" .. rconsonants .. "])(" .. space_like_class .. ")", ""},
{"[" .. lconsonants .. "]" .. zabar .. "[".. ye .. ye3 .. vao .. "]", ""},
-- remove vao (after a left-side consonant) and ؤ (with or without pesh)
{ "[" .. lconsonants .. "]" .. vao, ""},
{"ؤ" .. pesh , ""},
{"ؤ", ""},
-- remove ye (after a left-side consonant), bari ye, and consonant + short
-- vowel + he
{ "[" .. lconsonants .. "]" .. ye, ""},
{ye3, ""},
{"([" .. consonants .. "][" .. ZZP .. "])" .. he,""},
-- remove fatḥa/fatḥatan + alif/ye
{"[" .. fatHataan .. zabar .. "][" .. alif .. ye .. "]", ""},
-- remove diacritics and independent vowels
{"[" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "]", ""},
{ "[" .. indvowels .. "]" , ""},
{ "[".. semivowel .."]" .. "[" .. indvowels .. "]" , ""},
-- remove numbers, hamzatu l-waṣl, alif madda
{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- remove any remaining whitespace
{"%s", ""},
}
-- Reports whether `text` is completely consumed by the has_diacritics_subs
-- rules: directional marks are stripped first, then every rule is applied in
-- order, and the result is true only if nothing is left over.
-- Emits a tracking call when a left-to-right or right-to-left mark was found.
-- NOTE(review): the tracking key is named after ur-translit although this
-- module is pa-Arab-translit — confirm that is intended.
local function has_diacritics(text)
	local remaining, marks_found = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if marks_found > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		-- each rule is a {pattern, replacement} pair
		remaining = gsub(remaining, rule[1], rule[2])
	end
	return remaining == ""
end
-- Transliterates Arabic-script text into Latin script.
--
-- Parameters:
--   text     the string to transliterate
--   lang, sc accepted but not read anywhere in this function
-- Returns the transliterated string, normalised to NFC.
--
-- The body is one long pipeline of substitutions whose order matters:
-- contextual rules first rewrite letter/diacritic combinations (using "#" as
-- a temporary word-boundary marker), then every remaining character is looked
-- up in `mapping`, and finally a few clean-up rules are applied.
function export.tr(text, lang, sc)
-- Mark word boundaries with "#". A literal "#" in the input is parked as
-- "HASHTAG" first and restored after the boundary markers are removed.
text = gsub(text, "#", "HASHTAG")
text = gsub(text, " | ", "# | #")
text = gsub(text, "\n" , "#".."\n" .. "#")
text = gsub(text, "(["..punctuation.."])" , "#".."%1" .. "#")
text = "##" .. gsub(text, " ", "# #") .. "##"
text = gsub(text, zwnj, "#"..zwnj.."#")
-- hashtags now mark the beginning and end of a word
-- exceptions: whole words (with or without the short-vowel mark) that get a
-- fixed transliteration
text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")
text = gsub(text, "ن٘", "ṉ")
-- character reformatting
-- to make an exception for a word, put hashtags on both sides
-- ۂ is split into he + hamza above, and every hamza above becomes its own "word"
text = gsub(text, "ۂ", he .. highhmz)
text = gsub(text, highhmz, "#"..highhmz.."#")
--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
text = gsub(text, 'ٰ', "ā")
text = gsub(text, 'ا' .. fatHataan, "an")
-- lam + alif is folded into the ligature character, which `mapping` turns into "lā"
text = gsub(text, 'لا', "ﻻ")
-- ة is rewritten to ۃ, which becomes ت before a short vowel or jazm and he otherwise
text = gsub(text, "ة" , "ۃ")
text = gsub(text, "ۃ" .. "([" .. ZZP .. jazm .. "])", "ت%1")
text = gsub(text, "ۃ" , he)
-- Tashdeed: double the consonant that carries it
text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid, "%1%1")
-- NOTE(review): the previous line already rewrote every consonantS2 letter
-- followed by tashdid, so this rule appears unable to match — confirm.
text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid .. '([' .. ZZP .. '])', "%1%1%2")
-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZP .. '])' .. tashdid, "%1%1%2")
-- move a short-vowel mark from before the aspirate ھ to after it
text = gsub(text, '([' .. ZZP .. '])' .. aspirate, aspirate.."%1")
-- NOTE(review): this pattern has no capture group although the replacement
-- uses %1, and the dagger alif appears to have been rewritten to "ā" already
-- above, so this substitution looks like it can never match — confirm.
text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
-- doubled ye / vao (tashdid before or after the short vowel)
text = gsub(text, ye .. '([' .. ZZP .. '])' .. tashdid, "yy%1")
text = gsub(text, vao .. '([' .. ZZP .. '])' .. tashdid, "vv%1")
text = gsub(text, ye .. tashdid .. '([' .. ZZP .. '])', "yy%1")
text = gsub(text, vao .. tashdid .. '([' .. ZZP .. '])', "vv%1")
-- an alif directly after a consonant becomes "ā"
-- (the original comment here read "initial alif", which does not describe the pattern)
text = gsub(text, "(["..consonantS2.."])" .. alif, "%1ā")
--alifs paired to a consonant are a vowel
text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
text = gsub(text, "(["..consonantS2.."])" .. "آ", "%1'ā")
text = gsub(text, pesh .. vao .. zabar .. alif , "ūā" )
text = gsub(text, zabar .. alif, "ā")
-- an alif after any other diacritic is dropped
text = gsub(text, "(["..diacritics.."])" .. alif, "%1")
text = gsub(text, "(["..ZZP.."])" .. alif, "%1")
--alifs not paired to a consonant are a glottal stop (not shown currently)
text = gsub(text, alif.."(["..diacritics.."])".. "(["..consonantS2.."])", "%1%2")
-- NOTE(review): this replacement also swallows the "#" boundary marker that
-- follows the matched ye — confirm that later "#"-anchored rules do not need it.
text = gsub(text, alif..ye.."#", "ī")
text = gsub(text, alif..ye, "e")
text = gsub(text, alif..ye3, "e")
text = gsub(text, alif..zabar..ye3, "ai")
text = gsub(text, alif..vao, "o")
text = gsub(text, alif..zer..ye, "ī")
text = gsub(text, alif..pesh..vao, "ū")
text = gsub(text, alif.."(["..diacritics.."])", "%1")
-- convert semi vowels: ye / vao followed by a diacritic, "ā", bari ye or
-- another semivowel is read as the consonant y / v (or as e / o when two
-- semivowels follow)
text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
text = gsub(text, ye.. "(["..diacritics..ZZP.."])", "y%1")
text = gsub(text, ye .. "ā", "yā")
text = gsub(text, vao.. "ā", "vā")
text = gsub(text, ye .. "(["..zabar.."]?)" .. ye3, "y%1"..ye3.."")
text = gsub(text, vao .. "(["..zabar.."]?)" .. ye3, "v%1"..ye3.."")
text = gsub(text, ye .. "(["..semivowel.."])(["..semivowel.."])", "e%1%2")
text = gsub(text, vao .. "(["..semivowel.."])(["..semivowel.."])", "o%1%2")
text = gsub(text, ye .. "(["..semivowel.."])", "y%1")
text = gsub(text, vao .. "(["..semivowel.."])", "v%1")
-- conversions for vaav/vaw/vao: pesh + vao is "ū", zabar + vao is "au",
-- otherwise vao next to a diacritic is "v"
text = gsub(text, pesh.. vao, "ū")
text = gsub(text, zabar .. vao, "au")
text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
text = gsub(text, "(["..diacritics..ZZP.."])" .. vao, "%1v")
-- conversions for ye: zer + ye and word-final ye are "ī", zabar + ye/bari ye
-- is "ai", otherwise ye next to a diacritic is "y"
text = gsub(text, zer.. ye, "ī")
text = gsub(text, ye .. "#", "ī#")
text = gsub(text, zabar.. ye, "ai")
text = gsub(text, zabar.. ye3, "ai")
text = gsub(text, ye .. "(["..diacritics..ZZP.."])", "y%1")
text = gsub(text, "(["..diacritics..ZZP.."])" .. ye , "%1y")
-- final he and izafa/ezafe: a word-final zer is written "-i" / "-yi", and a
-- word-final he after a short vowel is dropped
text = gsub(text, "e" .. zer .. "#", "e-yi#")
text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
text = gsub(text, "y" .. zer .. "#", "-yi#")
text = gsub(text, zer .. "#", "-i#")
text = gsub(text, "(["..ZZP.."])" .. he .. "#" .. zwnj, "%1-")
text = gsub(text, "(["..ZZP.."])" .. he .. "#", "%1#")
text = gsub(text, zabar .. he .. "#", "a#")
-- get rid of hashtags (not needed any more) and restore literal "#" characters
text = gsub(text, "#", "")
text = gsub(text, "HASHTAG", "#")
-- strip directional marks (plain byte-wise string.gsub is used here)
text = string.gsub(text, lrm, "")
text = string.gsub(text, rlm, "")
-- convert all characters: each character is looked up in `mapping`;
-- characters without an entry are kept as they are
text = gsub(text, '.', mapping)
-- vowel fixes
-- alif
-- Final corrections: collapse doubled letters / vowel sequences produced above
text = gsub(text, "hh", "h")
text = gsub(text, "lll", "ll")
text = gsub(text, "āa", "ā")
text = gsub(text, "aaa", "ā")
text = gsub(text, "āā", "ā")
text = gsub(text, "aa", "ā")
--now get rid of the zero consonants (ئ and ؤ have no entry in `mapping`, so
--they are still present in their original form at this point)
text = gsub(text, "ئ", "")
text = gsub(text, "u" .. "ؤ" , "u")
text = gsub(text, "ؤ" .. "u" .. "$", "ū") -- ؤُ is rendered 'ū' word-finally, short 'u' otherwise
text = gsub(text, "ؤ" .. "u" .. "([ ,.;?!-])", "ū%1")
text = gsub(text, "ؤ" .. "u" , "u")
text = gsub(text, "ؤ", "o")
-- compose base letters and combining marks in the Latin output
text = mw.ustring.toNFC(text)
return text
end
return export