Module:User:Justinrleung
Appearance
- This module sandbox lacks a documentation subpage. Please create it.
- Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local gsplit = mw.text.gsplit
--- Build a short English gloss for an entry from its raw wikitext.
--
-- Uses the level-2 "Chinese" language section of `content`, falling back to
-- "Translingual" when no Chinese section is found. The first definition lines
-- ("# ...") that are not rfdef/defn requests are collected, templates are
-- stripped or converted, and the pieces are joined with "; ".
--
-- @param content  string: wikitext of the whole entry page.
-- @param useetc   truthy to append "; etc." when further senses were left out.
-- @return string: the gloss; "" when neither language section exists, and
--         also when template markup survives the clean-up (see the end of
--         the function for the exact condition).
function export.extract_gloss(content, useetc)
    local len = mw.ustring.len
    -- Bound here so the function does not rely on a global `match`, which
    -- nothing in the visible part of this file defines.
    local match = mw.ustring.match
    local senses = {}
    local literally = match(content, 'zh%-forms[^}]*|lit=([^{|}]+)[|}]')
    local sense_id = 0
    local etc = false
    local translingual_section, zh_section, pos

    while true do
        -- Find language sections beginning with ==...== and ending with the
        -- next ==...== header. Grab the Chinese and Translingual ones.
        local _, j, language_name, section =
            content:find("%f[=]==%s*([^=]+)%s*==(\n.-)\n==%f[^=]", pos)
        if j == nil then
            -- Last section of the page: it runs to the end of the content.
            _, j, language_name, section =
                content:find("%f[=]==%s*([^=]+)%s*==(\n.+)", pos)
        end
        if j == nil then
            break
        end
        -- Move to the beginning of "==" at the end of the current match so
        -- the next search starts on the following header.
        pos = j - 1

        if language_name == 'Translingual' then
            translingual_section = section
        elseif language_name == 'Chinese' then
            zh_section = section
            break
        end
    end

    if not zh_section then
        zh_section = translingual_section
        if not zh_section then
            return ""
        end
    elseif translingual_section then
        -- A Translingual section seen before the Chinese one is appended, so
        -- its definitions are available as well (e.g. when the Chinese
        -- section only contains rfdef).
        zh_section = zh_section .. translingual_section
    end

    -- Delete etymology sections, because they sometimes contain ordered
    -- lists, which would then be interpreted as definitions.
    zh_section = zh_section:gsub("\n===+Etymology.-(\n==)", "%1")

    -- Keep at most two real senses; a third one only sets the `etc` flag.
    for sense in zh_section:gmatch('\n# ([^\n]+)') do
        if not sense:match('rfdef') and not sense:match('defn') then
            sense_id = sense_id + 1
            if sense_id > 2 then
                etc = true
                break
            end
            table.insert(senses, sense)
        end
    end

    local gloss_text = (literally and literally .. "; " or "") .. (senses[1] or "")
    local gloss_text_extend = gloss_text .. (senses[2] and "; " .. senses[2] or "")
    -- Include the second sense only while the result stays short.
    if len(gloss_text) < 80 and len(gloss_text_extend) < 160 then
        gloss_text = gloss_text_extend
    end
    if gloss_text ~= gloss_text_extend then
        etc = true
    end

    -- Strip or convert wiki templates inside a gloss, then drop segments
    -- (split on ";") that contain no alphanumeric character.
    local function replace_gloss(text)
        -- {{w|target|display}} -> [[w:target|display]]
        local function replace_wp(wp_text)
            return wp_text:gsub('{{w|([^|}]+)|?([^|}]*)}}',
                function(w_link, w_display)
                    return '[[w:' .. w_link .. '|' .. (w_display ~= '' and w_display or w_link) .. ']]'
                end)
        end

        if text:find("{{") then
            text = replace_wp(text)
            text = text:gsub(' %({{taxlink[^}%)]+}}%)', '')
                :gsub('{{zh%-l|%*([^}]*)}}', '%1')
                :gsub('{{lb|zh|[^}]*}}', '')
                :gsub('{{zh%-erhua form of|word=[^}]+}}', '')
                :gsub('{{zh%-erhua form of|([^}]+)}}', '%1')
                :gsub('{{zh%-alt%-name|[^}]+|([^\n]+)}}', '%1')
                :gsub('{{zh%-short%-comp|[^}]+|t=([^\n}|]+)[^}]*}}', '%1')
                :gsub('{{zh%-short%-comp|[^}]+}}', '')
                :gsub('{{zh%-classifier|[^}]+|t=([^\n}|]+)[^}]*}}', '%1')
                :gsub('{{zh%-classifier|[^}]+}}', '')
                :gsub('{{zh%-alt%-form|[^}]+}}', '')
                :gsub('{{zh%-[^dm|}][^|}]+|[^|}]+|([^\n}|]+)}}', '%1')
                :gsub('{{place|zh|[^}]*t=([^\n}|]+)[^}]*}}', '%1')
                :gsub('{{vern', '{{w')
                -- NOTE(review): as written this replaces "|" with "|" (a
                -- no-op); possibly an escaped pipe was intended -- confirm.
                :gsub('|', "|")
        end

        -- Remaining {...} / (...) groups are removed, except that taxlinks
        -- become italicised names and {{w|...}} templates are kept.
        text = text:gsub('( ?)([{%(]+[^}%){%(]+[}%)]+)', function(space, captured)
            local taxlink = captured:match("{{taxlink|([^|}]+)")
            local wiki_link =
                taxlink and "''" .. taxlink .. "''" or
                (match(captured, "({{w|.+}})") or false)
            return wiki_link and space .. wiki_link or ""
        end)

        local text_sec = {}
        for _, s in ipairs(mw.text.split(text, ';')) do
            if s:find('%w') then
                table.insert(text_sec, (s:gsub('^%s+', ''):gsub('%s+$', '')))
            end
        end
        return table.concat(text_sec, '; ')
    end

    -- Applied twice: the first pass can leave {{w|...}} templates (for
    -- example from {{vern}}) that only the second pass turns into links.
    gloss_text = replace_gloss(gloss_text)
    gloss_text = replace_gloss(gloss_text)

    if etc and useetc and gloss_text ~= "" then
        gloss_text = gloss_text .. "; etc."
    elseif gloss_text:find("{{") then --temporary solution to suppress wikitext issues
        gloss_text = ""
    end
    return gloss_text
end
--- Report whether the page named by the first frame argument is a redirect.
--
-- @param frame  frame object; `frame.args[1]` is the page title to test.
-- @return number: 1 when the title exists as a redirect, 0 otherwise
--         (including a missing argument or a title mw.title.new rejects).
function export.is_redirect(frame)
    local page_name = frame.args[1]
    if page_name == nil then
        return 0
    end
    -- mw.title.new yields nil for an invalid title; treat that as
    -- "not a redirect" instead of failing on a nil index.
    local title = mw.title.new(page_name)
    if title and title.isRedirect then
        return 1
    end
    return 0
end
--- Produce a link for `text` using the "zh" language object.
--
-- @param text  value handed unchanged to Module:links' language_link.
-- @return whatever language_link returns.
function export.link(text)
    local make_link = require("Module:links").language_link
    local zh = require("Module:languages").getByCode("zh")
    return make_link(text, nil, zh)
end
--- "If not empty": turn the empty string into nil.
--
-- @param var  any value.
-- @return nil when `var` is "", otherwise `var` itself.
local function ine(var)
    if var ~= "" then
        return var
    end
    return nil
end
-- Character class matching any one of the four combining tone diacritics
-- handled by py_transform (presumably macron, acute, caron and grave, in that
-- order -- the marks are invisible on their own, so verify before editing).
local tones = '[̄́̌̀]'
-- Maps each combining tone diacritic to its tone number, as a string.
local py_tone = {
['̄'] = '1',
['́'] = '2',
['̌'] = '3',
['̀'] = '4'
}
--- Validate a Pinyin string and split it into space-separated syllables.
--
-- May be called with a frame (arguments taken from `text.args[1..3]`) or with
-- plain values.
--
-- @param text        string: Pinyin with tone diacritics, or with tone digits.
-- @param detone      non-empty to replace tone diacritics by trailing tone
--                    numbers; syllables without a mark receive "5".
-- @param not_spaced  non-empty to remove all spaces from the result; this
--                    also disables the missing-apostrophe check.
-- @return string: the transformed Pinyin (exactly one value).
-- Raises an error when the text contains U+200B, lacks an apostrophe before a
-- null-initial syllable, or carries a tone mark on the wrong vowel.
function export.py_transform(text, detone, not_spaced)
    if type(text) == 'table' then text, detone, not_spaced = text.args[1], text.args[2], text.args[3] end
    -- U+200B (zero-width space) spelled as explicit UTF-8 bytes, so the check
    -- does not hinge on an invisible character surviving in the source.
    if find(text, '\226\128\139') then
        error("Pinyin contains the hidden character: (U+200B). Please remove that character from the text.")
    end
    detone = ine(detone)
    not_spaced = ine(not_spaced)
    -- Decompose so tone marks become separate combining characters, but keep
    -- ê and ü as single precomposed letters.
    text = gsub(gsub(mw.ustring.toNFD(text), mw.ustring.toNFD('ê'), 'ê'), mw.ustring.toNFD('ü'), 'ü')
    -- Two toned vowel groups in a row mean a syllable boundary without an
    -- apostrophe.
    if find(mw.ustring.lower(text), '[aeiouêü]' .. tones .. '[aeiou]?[aeiouêü]' .. tones .. '') and not not_spaced then
        error(("Missing apostrophe before null-initial syllable - should be \"%s\" instead."):format(
            (gsub(text, '([aeiouêü]' .. tones .. '[aeiou]?)([aeiouêü]' .. tones .. ')', "%1'%2"))))
    end
    -- Move misplaced tone marks to the expected vowel; any change means the
    -- input was wrong, and the corrected form is reported.
    local original_text = text
    text = gsub(text, '([aoeAOE])([iou])(' .. tones .. ')', '%1%3%2')
    text = gsub(text, '([iuü])(' .. tones .. ')([aeiou])', '%1%3%2')
    if text ~= original_text then
        error("Incorrect diacritic placement in Pinyin - should be \"" .. text .. "\" instead.")
    end
    text = mw.ustring.lower(text)
    -- Input written with tone digits and no diacritics: only insert spaces
    -- after the digits. Parenthesised so gsub's replacement count is not
    -- returned as a second value.
    if not mw.ustring.find(text, tones) and text:find('[1-5]') then
        return (gsub(text, '(%d)(%l)', '%1 %2'))
    end
    text = gsub(text, "#", " #")
    -- Set the characters 一 / 不 and punctuation off with spaces.
    if find(text, '[一不,.?]') then
        text = gsub(text, '([一不])$', {['一'] = ' yī', ['不'] = ' bù'})
        text = gsub(text, '([一不])', ' %1 ')
        text = gsub(text, '([,.?])', ' %1 ')
        text = gsub(text, ' +', ' ')
        text = gsub(text, '^ ', '')
        text = gsub(text, ' $', '')
        text = gsub(text, '%. %. %.', '...')
    end
    text = gsub(text, "['%-]", ' ')
    -- Insert a space between a syllable ending and the next initial ...
    text = gsub(text, '([aeiouêümn]' .. tones .. '?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)', '%1 %2')
    -- ... then re-attach a g/r/n that was split off as if it were a syllable.
    text = gsub(text, ' ([grn])$', '%1')
    text = gsub(text, ' ([grn]) ', '%1 ')
    if detone then
        text = gsub(text, tones, py_tone)
        -- Move each tone digit to the end of its syllable.
        text = gsub(text, '([1234])([^ ]*)', '%2%1')
        -- Syllables still ending in a letter carry no tone mark: tone 5.
        text = gsub(text, '([%lüê]) ', '%15 ')
        text = gsub(text, '([%lüê])$', '%15')
    end
    if not_spaced then
        text = gsub(text, ' ', '')
    end
    return mw.ustring.toNFC(text)
end
--- Convert Hanyu Pinyin to a Tongyong-style romanisation with tone marks.
--
-- Each space-separated word is passed through export.py_transform (with
-- detoning), rewritten syllable by syllable, and re-marked with a combining
-- tone diacritic. A word starting with an ASCII capital keeps its capital.
--
-- @param text  string, or a frame whose first argument is the string.
-- @return string: the converted text in NFC.
function export.py_tongyong(text)
    if type(text) == 'table' then text = text.args[1] end
    -- Tone number -> combining diacritic as UTF-8 bytes; tone 1 is unmarked.
    local ty_tone = {
        ["1"] = "", ["2"] = "\204\129", ["3"] = "\204\140", ["4"] = "\204\128", ["5"] = "\204\138"
    }

    -- Put the tone mark after the vowel(s) of the first matching group:
    -- a/e/ê, else o, else i/u.
    -- The Unicode-aware find/gsub aliases are required here: with byte-wise
    -- string functions the class [aeê] matches each byte of "ê" separately
    -- and the mark would be inserted inside the character.
    local function num_to_mark(syllable, tone)
        tone = ty_tone[tone]
        if tone ~= "" then
            if find(syllable, '[aeê]') then
                syllable = gsub(syllable, "([aeê])", "%1" .. tone)
            elseif find(syllable, 'o') then
                syllable = gsub(syllable, "(o)", "%1" .. tone)
            elseif find(syllable, '[iu]') then
                syllable = gsub(syllable, "([iu])", "%1" .. tone)
            end
        end
        return syllable
    end

    local words = {}
    for word in gsplit(text, " ") do
        local cap = word:find("^[A-Z]")
        word = export.py_transform(word, true)
        local syllables = {}
        for syllable in gsplit(word, " ") do
            -- The order of these rewrites matters; later rules see the
            -- output of earlier ones.
            syllable = syllable:gsub("([zcs]h?)i", "%1ih")
            syllable = syllable:gsub("ü", "yu")
            syllable = syllable:gsub("([jqx])u", "%1yu")
            syllable = syllable:gsub("iu", "iou")
            syllable = syllable:gsub("ui", "uei")
            syllable = syllable:gsub("([wf])eng", "%1ong")
            syllable = syllable:gsub("wen", "wun")
            syllable = syllable:gsub("iong", "yong")
            syllable = syllable:gsub("^zh", "jh")
            syllable = syllable:gsub("^q", "c")
            syllable = syllable:gsub("^x", "s")
            -- A non-initial syllable starting with a/e/o gets an apostrophe.
            if #syllables ~= 0 then
                syllable = syllable:gsub("^([aeo])", "'%1")
            end
            syllable = syllable:gsub("^([^1-5]+)([1-5])$", num_to_mark)
            table.insert(syllables, syllable)
        end
        word = table.concat(syllables, "")
        if cap then
            word = word:gsub("^.", string.upper)
        end
        table.insert(words, word)
    end
    return mw.ustring.toNFC(table.concat(words, " "))
end
--- Return `text` with a misplaced tone diacritic moved to the expected vowel.
--
-- Only one rule is active: in "o" + a/e + tone mark followed by ⁿ, "-", "/"
-- or a space, the mark is moved onto the "o". The remaining rules and the
-- error that would reject a non-canonical input are disabled below.
--
-- @param text  string to check.
-- @return string: the corrected text in NFC.
function export.pfs_check_invalid(text)
    local tone_mark = "[́̀̂̍]"
    -- Swap the second and third captures (vowel <-> tone mark).
    local reorder = "%1%3%2%4"
    -- A trailing "-" gives the final syllable a terminator to match against.
    local fixed = mw.ustring.toNFD(text) .. "-"
    fixed = gsub(fixed, "(o)([ae])(" .. tone_mark .. ")([ⁿ%-/ ])", reorder)
    --fixed = gsub(fixed, "(o)(" .. tone_mark .. ")([ae])([imnptkh][gh]?ⁿ?)", reorder)
    --fixed = gsub(fixed, "(oa)(i)(" .. tone_mark .. ")(h?ⁿ?)", reorder)
    --fixed = gsub(fixed, "(a)([iu])(" .. tone_mark .. ")(h?ⁿ?)", reorder)
    --fixed = gsub(fixed, "(i)(" .. tone_mark .. ")([aou])(u?[mnptkh]?g?ⁿ?)", reorder)
    --fixed = gsub(fixed, "(ia)(u)(" .. tone_mark .. ")(h?ⁿ?)", reorder)
    --fixed = gsub(fixed, "(u)(i)(" .. tone_mark .. ")([hⁿ]?)", reorder)
    --fixed = gsub(fixed, "(e)(e)(" .. tone_mark .. ")(h?ⁿ?)", reorder)
    --fixed = gsub(fixed, "(o" .. tone_mark .. ")[ou·]", "%1͘")
    -- Remove the terminator again and recompose.
    fixed = gsub(fixed, "-$", "")
    fixed = mw.ustring.toNFC(fixed)
    --if text ~= fixed then
    --error("invalid poj \"" .. gsub(text, "-$", "") .. "\": correct poj is \"" .. fixed .. "\"")
    --end
    return fixed
end
--- Convert space-separated "gd" romanised syllables into IPA.
--
-- Each syllable is split into initial, final and a trailing tone digit (1-6),
-- and each part is looked up in the tables below. Tones 1 and 4 take a
-- changed value when the following syllable has tone 2, 3, 4 or 5.
--
-- @param text  string (an optional "gd=" prefix is removed), or a frame whose
--              first argument is that string.
-- @return string: IPA syllables joined by spaces.
-- Raises an error for an unrecognised initial or final, or a missing tone.
function export.gd_to_ipa(text)
    local initial_conv = {
        ["b"] = "p", ["p"] = "pʰ", ["m"] = "m", ["f"] = "f", ["v"] = "ʋ",
        ["d"] = "t", ["t"] = "tʰ", ["n"] = "n", ["l"] = "l",
        ["g"] = "k", ["k"] = "kʰ", ["ng"] = "ŋ", ["h"] = "h", [""] = "",
        ["z"] = "t͡s", ["c"] = "t͡sʰ", ["s"] = "s",
        ["j"] = "t͡ɕ", ["q"] = "t͡ɕʰ", ["x"] = "ɕ"
    }
    local final_conv = {
        ["ii"] = "z̩", ["i"] = "i", ["u"] = "u",
        ["a"] = "a", ["ia"] = "ia", ["ua"] = "ua",
        ["ê"] = "e", ["iê"] = "ie", ["uê"] = "ue",
        ["o"] = "o", ["io"] = "io", ["uo"] = "uo",
        ["m"] = "m̩", ["n"] = "n̩",
        ["ai"] = "aɪ", ["iai"] = "iaɪ", ["uai"] = "uaɪ",
        ["oi"] = "oɪ",
        ["ui"] = "uɪ", ["iui"] = "iuɪ",
        ["au"] = "au", ["iau"] = "iau",
        ["êu"] = "eu",
        ["iu"] = "iu",
        ["em"] = "əm", ["im"] = "im",
        ["am"] = "am", ["iam"] = "iam",
        ["êm"] = "ɛm",
        ["en"] = "ən", ["in"] = "in",
        ["an"] = "an", ["ian"] = "ian", ["uan"] = "uan",
        ["ên"] = "ɛn", ["iên"] = "iɛn", ["uên"] = "uɛn",
        ["on"] = "ɔn", ["ion"] = "iɔn", ["uon"] = "uɔn",
        ["un"] = "un", ["iun"] = "iun",
        ["ang"] = "aŋ", ["iang"] = "iaŋ", ["uang"] = "uaŋ",
        ["ong"] = "ɔŋ", ["iong"] = "iɔŋ", ["uong"] = "uɔŋ",
        ["ung"] = "ʊŋ", ["iung"] = "iʊŋ",
        ["eb"] = "əp̚", ["ib"] = "ip̚",
        ["ab"] = "ap̚", ["iab"] = "iap̚",
        ["êb"] = "ɛp̚",
        ["ed"] = "ət̚", ["id"] = "it̚",
        ["ad"] = "at̚", ["iad"] = "iat̚", ["uad"] = "uat̚",
        ["êd"] = "ɛt̚", ["iêd"] = "iɛt̚", ["uêd"] = "uɛt̚",
        ["od"] = "ɔt̚",
        ["ud"] = "ut̚", ["iud"] = "iut̚",
        ["ag"] = "ak̚", ["iag"] = "iak̚", ["uag"] = "uak̚",
        ["og"] = "ɔk̚", ["iog"] = "iɔk̚", ["uog"] = "uɔk̚",
        ["ug"] = "ʊk̚", ["iug"] = "iʊk̚"
    }
    local tone_conv = {
        ["1"] = "⁴⁴", ["2"] = "¹¹",
        ["3"] = "³¹",
        ["4"] = "⁵³",
        ["5"] = "¹", ["6"] = "⁵",
        ["1*"] = "⁴⁴⁻³⁵",
        ["4*"] = "⁵³⁻⁵⁵"
    }
    -- Initials used instead of initial_conv when the final begins with "i".
    local palatal = {
        ['g'] = 'c',
        ['k'] = 'cʰ',
        ['ng'] = 'ɲ',
        ['h'] = 'ç'
    }
    if type(text) == 'table' then text = text.args[1] end
    local syllables = mw.text.split(mw.ustring.gsub(text, 'gd=', ''), ' ')
    local initial, final, tone, ipa = {}, {}, {}, {}

    -- Pass 1: split every syllable into initial / final / tone digit.
    for i, syllable in ipairs(syllables) do
        initial[i] = mw.ustring.match(syllable, "^[bpmfvdtnlgkhzcsjqx]?g?")
        -- The final is whatever follows this syllable's own initial, up to
        -- the tone digit.
        final[i] = mw.ustring.match(mw.ustring.sub(syllable, mw.ustring.len(initial[i]) + 1, -1), "^[^1-6]*")
        final[i] = mw.ustring.gsub(mw.ustring.gsub(final[i], "^yi", "i"), "^y", "")
        if mw.ustring.find(initial[i], "[zcs]") and final[i] == "i" then
            final[i] = "ii"
        end
        -- A syllable made only of a consonant (e.g. "m", "n") is a final.
        if final[i] == "" then
            final[i] = initial[i]
            initial[i] = ""
        end
        tone[i] = mw.ustring.match(syllable, "[1-6]$")
    end

    -- Pass 2: convert the parts. tone[i + 1] still holds the raw digit at
    -- this point because entries are only overwritten as they are reached.
    for i, syllable in ipairs(syllables) do
        initial[i] = (mw.ustring.find(final[i], "^i") and palatal[initial[i]] or initial_conv[initial[i]]) or error(("Unrecognised initial: \"%s\""):format(initial[i]))
        final[i] = final_conv[final[i]] or error(("Unrecognised final: \"%s\""):format(final[i]))
        if not tone[i] then
            error(("Missing tone number in syllable: \"%s\""):format(syllable))
        end
        if mw.ustring.match(tone[i], "[14]") and mw.ustring.match(tone[i + 1] or "", "[2345]") then
            tone[i] = tone[i] .. "*"
        end
        tone[i] = tone_conv[tone[i]]
        ipa[i] = initial[i] .. final[i] .. tone[i]
    end
    return table.concat(ipa, " ")
end
--- Rewrite "pfs" romanised text as "hrs" syllables.
--
-- The input is lower-cased, a "pfs=" prefix is removed, and the text is split
-- on spaces and hyphens. For every syllable the consonants are respelled, a
-- tone symbol is chosen from the diacritic (or from a b/d/g coda without the
-- vertical-line mark), the diacritics are removed, and the symbol is appended.
--
-- @param text  string, or a frame whose first argument is the string.
-- @return string: converted syllables joined by single spaces.
function export.pfs_to_hrs(text)
    if type(text) == 'table' then text = text.args[1] end
    local ugsub, ufind = mw.ustring.gsub, mw.ustring.find

    -- Lookup tables, built once per call.
    local plain_stops = {['p']='b',['t']='d',['k']='g',['y']='i'}
    local aspirated = {['bh']='p',['dh']='t',['gh']='k',['ch']='z'}
    local before_i = {['z']='j',['c']='q',['s']='x',['i']=''}
    local strip_marks = {
        ['á'] = 'a', ['é'] = 'e', ['í'] = 'i', ['ó'] = 'o', ['ú'] = 'u', ['́'] = '',
        ['à'] = 'a', ['è'] = 'e', ['ì'] = 'i', ['ò'] = 'o', ['ù'] = 'u', ['̀'] = '',
        ['â'] = 'a', ['ê'] = 'e', ['î'] = 'i', ['ô'] = 'o', ['û'] = 'u', ['̂'] = '',
        ['ń'] = 'n', ['ǹ'] = 'n',
        ['̍'] = '',
        ['ṳ'] = 'ii',
    }

    local cleaned = mw.ustring.lower(text)
    cleaned = ugsub(cleaned, 'pfs=', '')
    cleaned = ugsub(cleaned, ' ', '-')
    local parts = mw.text.split(cleaned, "-")

    for idx, part in ipairs(parts) do
        -- Consonants: plain stops first, then the stop + "h" digraphs.
        part = ugsub(part, '[ptky]', plain_stops)
        part = ugsub(part, '[bdgc]h', aspirated)
        part = ugsub(part, 'zh', 'c')
        part = ugsub(part, '([zcsi])([iíìî])', function(lead, vowel)
            return before_i[lead] .. vowel
        end)

        -- Tone symbol, decided while the diacritics are still present.
        local mark = ''
        if ufind(part, '[âêîôû̂]') then
            mark = '´'
        elseif ufind(part, '[àèìòùǹ̀]') then
            mark = 'ˇ'
        elseif ufind(part, '[áéíóúń́]') then
            mark = '`'
        elseif ufind(part, '[aeiouṳ][bdg]$') and not ufind(part, '̍') then
            mark = '`'
        end

        -- Drop the diacritics and respell the vowels.
        part = ugsub(part, '[âêîôû̂àèìòù̀áéíóú́ńǹ̍ṳ]', strip_marks)
        part = ugsub(part, 'o([ae])', 'u%1')

        parts[idx] = part .. mark
    end
    return table.concat(parts, " ")
end
--- Scratch check that two equal string values address the same table slot.
--
-- @return boolean: result of comparing the value stored under one "abc" key
--         with the value read back through a second, equal "abc" key.
function export.test()
    local first_key, second_key = "abc", "abc"
    local store = {}
    store[first_key] = 5
    local same = store[second_key] == store[first_key]
    return same
end
return export