Module:cpx-pron
Appearance
- This module lacks a documentation subpage. Please create it.
- Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
local m_string_utils = require("Module:string utilities")
local m_table = require("Module:table")
local m_data = require("Module:cpx-pron/data")
local sub = m_string_utils.sub
local find = m_string_utils.find
local gsub = m_string_utils.gsub
local match = m_string_utils.match
local toNFD = mw.ustring.toNFD
local toNFC = mw.ustring.toNFC
-- Single-character markers that may decorate a syllable in the input string.
-- "^" and "\" are consumed by get_syllable_markers; "*", "#" and ">" are
-- consumed by split_syllable.
local SPECIAL_MARKERS = {
NO_ASSIMILATION = "*", -- leading marker: skip initial assimilation for this syllable
NO_SANDHI = "#", -- trailing marker: skip tone sandhi for this syllable
MANUAL_CHANGE = ">", -- separates the original form from a manually given changed form
CAPITALIZATION = "^", -- leading marker: capitalize the generated BUC syllable
SPACE_AFTER = "\\" -- trailing marker: join to the next BUC syllable with a space
}
-- Names of the supported output formats.
-- NOTE(review): not referenced in the part of the file reviewed here;
-- presumably selects between the format_*_output functions - confirm.
local FORMAT_MODES = {
DEBUG = "debug",
BRIEF = "brief",
COMPLETE = "complete"
}
-- Supported dialect codes mapped to the wikilink used as their display name.
-- A code is accepted by split_dialect_codes only if it is a key of this table;
-- the commented-out entries are therefore currently unsupported.
local dialects = {
pt = "[[w:Putian dialect|Putian]]",
-- nr = "[[w:Nanri Island|Nanri]]",
-- jk = "[[w:zh:江口鎮 (莆田市)|Jiangkou]]",
xy = "[[w:Xianyou dialect|Xianyou]]",
-- ft = "[[w:zh:楓亭鎮|Fengting]]",
-- yy = "[[w:zh:游洋鎮|Youyang]]",
}
-- Romanized initial -> IPA, keyed first by dialect code.
-- The empty string is the zero initial. Only the "xy" table has "bh",
-- which is produced by the xy assimilation rules below.
local initials = {
pt = {
["b"] = "p", ["p"] = "pʰ", ["m"] = "m",
["d"] = "t", ["t"] = "tʰ", ["n"] = "n", ["l"] = "l",
["z"] = "t͡s", ["c"] = "t͡sʰ", ["s"] = "ɬ",
["g"] = "k", ["k"] = "kʰ", ["ng"] = "ŋ", ["h"] = "h",
[""] = ""
},
xy = {
["b"] = "p", ["p"] = "pʰ", ["m"] = "m",
["d"] = "t", ["t"] = "tʰ", ["n"] = "n", ["l"] = "l",
["z"] = "t͡s", ["c"] = "t͡sʰ", ["s"] = "ɬ",
["g"] = "k", ["k"] = "kʰ", ["ng"] = "ŋ", ["h"] = "h",
[""] = "",
["bh"] = "β",
},
}
-- Romanized final -> IPA, keyed first by dialect code.
-- A final missing from a dialect's table makes get_ipa_value raise
-- "Invalid final" for that dialect. Only "xy" lists nasalized (ⁿ) finals.
local finals = {
pt = {
["a"] = "a", ["ae"] = "ɛ", ["e"] = "e", ["i"] = "i", ["o"] = "o",
["oe"] = "ø", ["or"] = "ɒ", ["u"] = "u", ["y"] = "y",
["ai"] = "ai", ["ao"] = "au", ["ia"] = "ia", ["ieo"] = "ieu", ["iu"] = "iu",
["ou"] = "ɔu", ["ua"] = "ua", ["ue"] = "uei", ["ui"] = "ui", ["yo"] = "yɒ",
["ang"] = "aŋ", ["orng"] = "ɒŋ", ["eng"] = "ɛŋ", ["oeng"] = "œŋ", ["ong"] = "ɔŋ",
["ing"] = "iŋ", ["ieng"] = "iɛŋ", ["ung"] = "uŋ", ["uang"] = "uaŋ", ["yng"] = "yŋ",
["yong"] = "yɒŋ", ["ng"] = "ŋ̍",
["ah"] = "aʔ", ["orh"] = "ɒʔ", ["eh"] = "ɛʔ", ["oeh"] = "œʔ", ["oh"] = "ɔʔ",
["ih"] = "iʔ", ["iah"] = "iaʔ", ["ieh"] = "iɛʔ", ["uh"] = "uʔ", ["uah"] = "uaʔ",
["uoh"] = "uoʔ", ["yh"] = "yʔ", ["yoh"] = "yɒʔ"
},
xy = {
["a"] = "a", ["ae"] = "ɛ", ["e"] = "e", ["i"] = "i", ["o"] = "ɵ",
["oe"] = "ø", ["or"] = "ɒ", ["u"] = "u", ["y"] = "y",
["ai"] = "ai", ["ao"] = "au", ["ia"] = "ia", ["ieo"] = "ieu", ["iu"] = "iu",
["ou"] = "ɔu", ["ua"] = "ua", ["ue"] = "uei", ["ui"] = "ui", ["ya"] = "ya",
["ang"] = "aŋ", ["orng"] = "ɒŋ", ["eng"] = "ɛŋ",
["ing"] = "iŋ", ["ieng"] = "iɛŋ", ["yng"] = "yŋ",
["yeng"] = "yøŋ", ["uong"] = "uoŋ", ["ng"] = "ŋ̍",
["ah"] = "aʔ", ["orh"] = "ɒʔ", ["eh"] = "ɛʔ",
["ih"] = "iʔ", ["ieh"] = "iɛʔ", ["uh"] = "uʔ",
["uoh"] = "uoʔ", ["yh"] = "yʔ", ["yeh"] = "yøʔ",
["iah"] = "iaʔ", ["uah"] = "uaʔ", -- iah, uah occur only with the pronoun checked tone (代詞促調, tone S3)
["aⁿ"] = "ã", ["iⁿ"] = "ĩ", ["yⁿ"] = "ỹ", ["orⁿ"] = "ɒ̃", ["aiⁿ"] = "ãĩ",
["aoⁿ"] = "ãũ", ["iaⁿ"] = "ĩã", ["iuⁿ"] = "ĩũ", ["uaⁿ"] = "ũã", ["uiⁿ"] = "ũĩ",
["yaⁿ"] = "ỹã"
},
}
-- Tone label -> IPA tone letters (superscript digits), keyed first by dialect code.
-- 1 ~ 7 correspond to the traditional categories 陰平, 陽平, 陰上, 陰去, 陽去, 陰入, 陽入
-- (dark level, light level, dark rising, dark departing, light departing,
-- dark entering, light entering); the S labels are "special tones".
-- S1, S4, S7: these tones sound a bit like 1, 4, 7 after tone sandhi (according to 莆仙方言大詞典)
-- S3: pronoun checked tone (代詞促調); acts like 陰上 in both Putian and Xianyou after tone sandhi
-- S5: historical 陰入 (古陰入); labelled as 陽去 in dictionaries but has its own tone sandhi rule
-- The "xy" table defines no S4 or S7 entry.
local tones = {
pt = {
["1"] = "⁵³³", ["2"] = "¹³", ["3"] = "⁴⁵³", ["4"] = "⁴²",
["5"] = "²¹", ["6"] = "¹", ["7"] = "⁴",
["S1"] = "⁵⁵", ["S3"] = "³²", ["S4"] = "⁴²", ["S5"] = "²¹", ["S7"] = "⁴⁵"
},
xy = {
["1"] = "⁵³³", ["2"] = "¹³", ["3"] = "³³²", ["4"] = "⁴²",
["5"] = "²¹", ["6"] = "²", ["7"] = "²⁴",
["S1"] = "⁵⁵", ["S3"] = "³²", ["S5"] = "²¹"
},
}
-- Tone sandhi table, indexed as sandhi_rules[dialect][current tone][next tone]
-- and yielding the sandhi tone of the current syllable (see get_sandhi_tone).
-- Rows exist only for current tones 1-7, S3 and S5; columns only for next tones 1-7
-- (get_sandhi_tone maps a following S5 to 5 before the lookup).
local sandhi_rules = {
pt = {
["1"] = {["1"]="5", ["2"]="5", ["3"]="5", ["4"]="2", ["5"]="2", ["6"]="2", ["7"]="5"},
["2"] = {["1"]="5", ["2"]="5", ["3"]="5", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="5"},
["3"] = {["1"]="5", ["2"]="2", ["3"]="5", ["4"]="5", ["5"]="2", ["6"]="2", ["7"]="5"},
["4"] = {["1"]="S1", ["2"]="4", ["3"]="S1", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="S1"},
["5"] = {["1"]="5", ["2"]="5", ["3"]="5", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="5"},
["6"] = {["1"]="S7", ["2"]="S7", ["3"]="S7", ["4"]="S7", ["5"]="S4", ["6"]="S4", ["7"]="S7"},
["7"] = {["1"]="6", ["2"]="6", ["3"]="6", ["4"]="7", ["5"]="S4", ["6"]="S4", ["7"]="6"},
["S3"] = {["1"]="7", ["2"]="7", ["3"]="7", ["4"]="7", ["5"]="7", ["6"]="7", ["7"]="7"},
["S5"] = {["1"]="S1", ["2"]="S1", ["3"]="S1", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="S1"}
},
xy = {
["1"] = {["1"]="5", ["2"]="5", ["3"]="5", ["4"]="2", ["5"]="2", ["6"]="2", ["7"]="5"},
["2"] = {["1"]="5", ["2"]="5", ["3"]="5", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="5"},
["3"] = {["1"]="5", ["2"]="S1", ["3"]="5", ["4"]="5", ["5"]="2", ["6"]="2", ["7"]="5"},
["4"] = {["1"]="S1", ["2"]="S1", ["3"]="S1", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="S1"},
["5"] = {["1"]="5", ["2"]="5", ["3"]="5", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="5"},
["6"] = {["1"]="7", ["2"]="7", ["3"]="7", ["4"]="7", ["5"]="7", ["6"]="7", ["7"]="7"},
["7"] = {["1"]="6", ["2"]="6", ["3"]="6", ["4"]="7", ["5"]="7", ["6"]="7", ["7"]="6"},
["S3"] = {["1"]="7", ["2"]="7", ["3"]="7", ["4"]="7", ["5"]="7", ["6"]="7", ["7"]="7"},
["S5"] = {["1"]="S1", ["2"]="S1", ["3"]="S1", ["4"]="S1", ["5"]="4", ["6"]="4", ["7"]="S1"}
}
}
-- Initial assimilation table, indexed as
-- initial_assimilation_rules[dialect][final type of the previous syllable][original initial]
-- and yielding the changed initial (see apply_initial_assimilation).
-- The final-type keys are the strings returned by get_final_type.
-- An initial without an entry is left unchanged by the caller.
-- Only "xy" has a nasalized_final sub-table.
local initial_assimilation_rules = {
pt = {
nasal_final = {
["b"] = "m", ["p"] = "m", ["m"] = "m",
["d"] = "n", ["t"] = "n", ["n"] = "n", ["l"] = "n", ["z"] = "n", ["c"] = "n", ["s"] = "n",
["g"] = "ng", ["k"] = "ng", ["h"] = "ng", ["ng"] = "ng", [""] = "ng"
},
glottal_final = {}, -- remain unchanged
other_final = {
["b"] = "", ["p"] = "",
["m"] = "m", ["n"] = "n", ["l"] = "l", ["ng"] = "ng",
["d"] = "l", ["t"] = "l", ["z"] = "l", ["c"] = "l", ["s"] = "l",
["g"] = "", ["k"] = "", ["h"] = "", [""] = ""
}
},
xy = {
nasal_final = {
["b"] = "m", ["p"] = "m", ["m"] = "m",
["d"] = "n", ["t"] = "n", ["n"] = "n", ["l"] = "n", ["z"] = "n", ["c"] = "n", ["s"] = "n",
["g"] = "ng", ["k"] = "ng", ["h"] = "ng", ["ng"] = "ng", [""] = "ng"
},
nasalized_final = {
["b"] = "m", ["m"] = "m", ["p"] = "m",
["d"] = "n", ["t"] = "n", ["n"] = "n", ["l"] = "n", ["z"] = "n", ["c"] = "n", ["s"] = "n",
["g"] = "", ["k"] = "", ["h"] = "",
["ng"] = "ng",
[""] = ""
},
glottal_final = {}, -- remain unchanged
other_final = {
["b"] = "bh", ["p"] = "bh",
["m"] = "m", ["n"] = "n", ["l"] = "l", ["ng"] = "ng",
["d"] = "l", ["t"] = "l", ["z"] = "l", ["c"] = "l", ["s"] = "l",
["g"] = "", ["k"] = "", ["h"] = "", [""] = ""
}
}
}
-- Romanized (PSP) initial -> BUC initial, used by convert_to_buc_syllable.
-- Only "z" -> "c" and "c" -> "ch" differ from the input spelling.
local buc_initials = {
["b"] = "b",
["p"] = "p",
["m"] = "m",
["d"] = "d",
["t"] = "t",
["n"] = "n",
["l"] = "l",
["z"] = "c",
["c"] = "ch",
["s"] = "s",
["g"] = "g",
["k"] = "k",
["ng"] = "ng",
["h"] = "h",
[""] = ""
}
-- Romanized (PSP) final -> list of candidate BUC finals.
-- Each candidate is a pair {buc_final, tone_position}, where tone_position is the
-- 1-based index of the character in buc_final that receives the tone diacritic
-- (see combine_buc_syllable). When a final has several candidates,
-- convert_to_buc_syllable disambiguates using the per-character readings in m_data.buc.
local buc_finals = {
["a"] = {{"a", 1}, {"aⁿ", 1}, {"ah", 1}},
["ae"] = {{"e", 1}},
["ah"] = {{"ah", 1}},
["ai"] = {{"ai", 1}},
["ang"] = {{"ang", 1}},
["ao"] = {{"au", 1}},
["e"] = {{"a̤", 1}, {"a̤ⁿ", 1}, {"a̤h", 1}},
["eh"] = {{"eh", 1}},
["eng"] = {{"eng", 1}},
["i"] = {{"i", 1}, {"ih", 1}},
["ia"] = {{"ia", 2}, {"iaⁿ", 2}, {"iah", 2}},
["iah"] = {{"iah", 2}},
["ieh"] = {{"iah", 2}},
["ieng"] = {{"iang", 2}},
["ieo"] = {{"a̤u", 2}, {"a̤uⁿ", 2}, {"a̤uh", 2}}, -- on `u`
["ih"] = {{"ih", 1}},
["ing"] = {{"ing", 1}},
["iu"] = {{"iu", 2}},
["ng"] = {{"ng", 1}}, -- actually in the middle of `n` and `g`
["o"] = {{"eo", 2}, {"eoh", 2}},
["oe"] = {{"e̤", 1}, {"e̤ⁿ", 1}},
["oeh"] = {{"e̤h", 1}},
["oeng"] = {{"e̤ng", 1}},
["oh"] = {{"eoh", 2}},
["ong"] = {{"eong", 2}},
["or"] = {{"o̤", 1}, {"o̤ⁿ", 1}, {"o̤h", 1}},
["orh"] = {{"o̤h", 1}},
["orng"] = {{"o̤ng", 1}},
["ou"] = {{"o", 1}},
["u"] = {{"u", 1}},
["ua"] = {{"ua", 2}, {"uaⁿ", 2}, {"uah", 2}},
["uah"] = {{"uah", 2}},
["uang"] = {{"uang", 2}},
["ue"] = {{"oi", 1}, {"oiⁿ", 1}, {"oih", 1}}, -- on `o`
["uh"] = {{"uh", 1}},
["ui"] = {{"ui", 1}}, -- on `u`
["ung"] = {{"ng", 1}}, -- actually in the middle of `n` and `g`
["y"] = {{"ṳ", 1}},
["yh"] = {{"ṳh", 1}},
["yng"] = {{"ṳng", 1}},
["yo"] = {{"io̤", 2}, {"io̤ⁿ", 2}, {"io̤h", 2}},
["yoh"] = {{"io̤h", 2}},
["yong"] = {{"io̤ng", 2}}
}
-- Tone label -> combining diacritic appended to the tone-bearing BUC character
-- (see combine_buc_syllable). An empty string means no diacritic is written.
-- Tone labels without an entry here make combine_buc_syllable raise "Invalid tone".
local buc_tones = {
["1"] = "", -- tone 1 (陰平): no diacritic
["2"] = "́", -- tone 2 (陽平): u+0301
["3"] = "̂", -- tone 3 (上聲): u+0302
["4"] = "̍", -- tone 4 (陰去): u+030D
["5"] = "̄", -- tone 5 (陽去): u+0304
["S5"] = "̄", -- same as above
["6"] = "", -- tone 6 (陰入): -h only
["7"] = "̍", -- tone 7 (陽入): -h + u+030D
}
--- Split a comma-separated list of dialect codes into an array.
-- Each code must be a key of `dialects`; the first unknown code raises an error.
-- @param code_string string such as "pt,xy"
-- @return array of dialect codes in input order
local function split_dialect_codes(code_string)
    local result = {}
    for dialect_code in code_string:gmatch("[^,]+") do
        local known = dialects[dialect_code]
        if not known then
            error("Unsupported dialect: " .. dialect_code)
        end
        result[#result + 1] = dialect_code
    end
    return result
end
--- Strip the BUC-related decorations from a raw syllable.
-- Recognised, in this order: a leading capitalization marker, a trailing
-- space-after marker, a trailing comma, and one "{...}" manual BUC group.
-- @param syllable raw syllable string
-- @return markers table {capitalize, space_after, comma_after, manual_buc}
-- @return the syllable with all recognised decorations removed
local function get_syllable_markers(syllable)
    local capitalize = syllable:sub(1, 1) == SPECIAL_MARKERS.CAPITALIZATION
    if capitalize then
        syllable = syllable:sub(2)
    end

    local space_after = syllable:sub(-1) == SPECIAL_MARKERS.SPACE_AFTER
    if space_after then
        syllable = syllable:sub(1, -2)
    end

    local comma_after = syllable:sub(-1) == ","
    if comma_after then
        syllable = syllable:sub(1, -2)
    end

    -- Manual BUC: the text between the first "{" ... "}" pair, cut out of the syllable.
    local manual_buc = nil
    local brace_open, brace_close = syllable:find("{[^}]+}")
    if brace_open then
        manual_buc = syllable:sub(brace_open + 1, brace_close - 1)
        syllable = syllable:sub(1, brace_open - 1) .. syllable:sub(brace_close + 1)
    end

    local markers = {
        capitalize = capitalize,
        space_after = space_after,
        comma_after = comma_after,
        manual_buc = manual_buc
    }
    return markers, syllable
end
--- Split a toneless syllable form into its initial and final.
-- "bh" and "ng" are the two-letter initials handled explicitly; a bare "ng"
-- is treated as a final with an empty initial. Forms starting with "bh" are
-- also reported to the tracking module.
-- @param options table with field `form` (string)
-- @return initial (possibly ""), final (never empty)
-- Raises an error if `form` is missing or no final can be extracted.
local function split_initial_final(options)
    local form = options and options.form
    if not form then
        error("split_initial_final: form is required")
    end

    local first_two = form:sub(1, 2)
    local initial, final

    if first_two == "bh" then
        initial = "bh"
        final = form:sub(3)
        require("Module:debug").track('cpx-pron/entries using bh')
    elseif form == "ng" then
        -- syllabic nasal: the whole form is the final
        initial = ""
        final = form
    elseif first_two == "ng" and #form > 2 then
        initial = "ng"
        final = form:sub(3)
    else
        initial = form:match("^[bpmnltdzcsghk]h?") or ""
        final = form:sub(#initial + 1)
    end

    if final == nil or final == "" then
        error("Invalid form: " .. form .. " (unable to extract final)")
    end
    return initial, final
end
-- Phonological rule application functions
--- Classify a final for the purposes of initial assimilation.
-- @param options table with fields `initial`, `final` and (optionally) `dialect`
-- @return one of "nasal_final", "glottal_final", "nasalized_final", "other_final"
-- Raises an error if `options` is not a table, `final` is nil, or the final
-- ends in a plain "n" (which must be written "ⁿ").
local function get_final_type(options)
    if not options or type(options) ~= "table" then
        error("get_final_type: options must be a table")
    end

    local initial, final, dialect = options.initial, options.final, options.dialect
    if not final then
        error("get_final_type: final cannot be nil")
    end

    if sub(final, -2) == "ng" then
        return "nasal_final"
    end

    local last_char = sub(final, -1)
    if last_char == "h" then
        return "glottal_final"
    end
    if last_char == "ⁿ" then
        return "nasalized_final"
    end

    -- A syllable with an m/n(g) initial whose final would otherwise be
    -- "other_final" counts as nasalized in the "xy" dialect. The recursive
    -- call is made before the dialect test, exactly as in the combined condition.
    if match(initial, "[mn]g?") and
        get_final_type({initial = "", final = final}) == "other_final" and
        dialect == "xy" then
        return "nasalized_final"
    end

    if last_char == "n" then
        error('Please replace the syllable-final "n" with "ⁿ"')
    end
    return "other_final"
end
--- Record a BUC-generation problem under the tracking key "cpx-pron/<reason>".
-- @param reason short description appended to the tracking key
local function track_buc_issue(reason)
    local tracking_key = 'cpx-pron/' .. reason
    require("Module:debug").track(tracking_key)
end
--- Assemble one BUC syllable from initial, final and tone.
-- The tone's combining diacritic (from `buc_tones`) is appended to the
-- character of `final` at index `tone_position`, and the result is NFC-normalized.
-- @param options table with fields `initial`, `final`, `tone`, `tone_position`
-- @return the composed BUC syllable
-- Raises "Invalid tone: ..." when the tone has no entry in `buc_tones`, and
-- "Invalid tone position: ..." when `tone_position` does not address a
-- character of `final`.
local function combine_buc_syllable(options)
    local initial = options.initial
    local final = options.final
    local tone = options.tone
    local tone_position = options.tone_position

    local tone_mark = buc_tones[tone]
    if not tone_mark then
        -- tostring() keeps the intended message even when tone is nil
        error("Invalid tone: " .. tostring(tone))
    end

    -- Split the final into Unicode characters
    local chars = {}
    for char in mw.ustring.gmatch(final, ".") do
        chars[#chars + 1] = char
    end

    -- Attach the tone diacritic; a nil, zero or out-of-range position has no
    -- character to attach to and is reported explicitly.
    local target = chars[tone_position]
    if not target then
        error("Invalid tone position: " .. tostring(tone_position))
    end
    chars[tone_position] = target .. tone_mark

    return mw.ustring.toNFC(initial .. table.concat(chars))
end
--- Fetch the BUC readings recorded for a character in the data module.
-- @param char a single character
-- @return the entry m_data.buc[char], or nil when there is none
local function lookup_char_readings(char)
    local readings = m_data.buc[char]
    return readings or nil
end
-- Convert single PSP syllable to BUC
--- Convert one romanized syllable to BUC.
-- @param options table with fields:
--   `syllable_info` - syllable record (uses original_initial, original_final,
--                     original_tone and manual_buc)
--   `char`          - the corresponding character of the page title, or nil
--                     when syllables and characters cannot be paired up
-- @return the BUC syllable, or nil when it cannot be determined uniquely
--         (every nil return is preceded by a track_buc_issue call)
local function convert_to_buc_syllable(options)
    local syllable_info = options.syllable_info
    local char = options.char

    -- A manually specified BUC form wins, provided it validates.
    -- NOTE(review): validate_manual_buc is not defined in the part of the file
    -- reviewed here - confirm it is in scope at this point.
    if syllable_info.manual_buc then
        local is_valid = validate_manual_buc(syllable_info.manual_buc)
        if not is_valid then
            track_buc_issue("manual form incorrect")
            return nil
        end
        return syllable_info.manual_buc
    end

    -- Tone S3 is looked up as tone 3, without a trailing "h" on the final.
    local lookup_tone = syllable_info.original_tone
    local lookup_final = syllable_info.original_final
    if lookup_tone == "S3" then
        lookup_tone = "3"
        if lookup_final:sub(-1) == "h" then
            lookup_final = lookup_final:sub(1, -2)
        end
    end

    local possible_finals = buc_finals[lookup_final]
    if not possible_finals then
        track_buc_issue("no final found")
        return nil
    end

    local initial = buc_initials[syllable_info.original_initial]
    if not initial then
        track_buc_issue("no initial found")
        return nil
    end

    -- Without a paired character there is nothing to disambiguate with, so
    -- only a final that has a single candidate can be converted.
    if not char then
        if #possible_finals > 1 then
            track_buc_issue("contraction and multiple final found")
            return nil
        end
        return combine_buc_syllable({
            initial = initial,
            final = possible_finals[1][1],
            tone = lookup_tone,
            tone_position = possible_finals[1][2]
        })
    end

    -- Build every candidate spelling. A BUC final in -h for an input final
    -- without -h is only possible for tone 2 (BUC tone 7B, merged into tone 2);
    -- such a candidate is written with tone 7 and tagged with a trailing "*".
    local psp_has_h = syllable_info.original_final:match("h$")
    local candidates = {}
    for _, final_info in ipairs(possible_finals) do
        local final, tone_position = final_info[1], final_info[2]
        local use_tone = lookup_tone
        local should_keep = true
        if final:match("h$") and not psp_has_h then
            if lookup_tone == "2" then
                use_tone = "7"
                final = final .. "*"
            else
                should_keep = false
            end
        end
        if should_keep then
            candidates[#candidates + 1] = combine_buc_syllable({
                initial = initial,
                final = final,
                tone = use_tone,
                tone_position = tone_position
            })
        end
    end

    if #candidates == 1 then
        return candidates[1]
    end

    -- Several (or no) candidates: keep those listed as readings of the character.
    local char_readings = lookup_char_readings(char)
    if not char_readings then
        track_buc_issue("cannot look up table")
        return nil
    end

    local matches = {}
    for _, candidate in ipairs(candidates) do
        for _, reading in ipairs(char_readings) do
            if candidate == reading then
                matches[#matches + 1] = candidate
            end
        end
    end

    if #matches == 0 then
        track_buc_issue("no matching reading found")
        return nil
    elseif #matches > 1 then
        track_buc_issue("multiple matching readings found")
        return nil
    end
    return matches[1]
end
--- Build the BUC transcription of a whole word (dialect "pt" only).
-- Syllables are paired with the characters of the current page title when the
-- counts agree (whitespace and commas in the title are ignored).
-- @param options table with fields `syllable_infos`, `dialect`, `word`
-- @return the BUC string, or nil for other dialects or when any syllable
--         cannot be converted
-- Raises an error when `syllable_infos` is missing.
local function generate_buc(options)
    local infos = options.syllable_infos
    if not infos then
        error("Missing required syllable_infos in generate_buc")
    end
    if options.dialect ~= "pt" then
        return nil
    end

    local title_text = mw.title.getCurrentTitle().text
    local title_chars = (mw.ustring.gsub(title_text, "[\n\r\t ,]", ""))
    -- Only consult the character table when there is one syllable per character
    local pair_with_chars = (#infos == mw.ustring.len(title_chars))

    local buc_syllables = {}
    for idx, info in ipairs(infos) do
        local buc
        if info.manual_buc then
            buc = info.manual_buc
        else
            local paired_char = nil
            if pair_with_chars then
                paired_char = mw.ustring.sub(title_chars, idx, idx)
            end
            buc = convert_to_buc_syllable({
                syllable_info = info,
                char = paired_char,
                word = options.word
            })
            -- One unresolved syllable invalidates the whole word
            if not buc then
                return nil
            end
            if info.capitalize then
                -- Decompose first so that only the base letter is upper-cased
                local decomposed = mw.ustring.toNFD(buc)
                local head = mw.ustring.sub(decomposed, 1, 1)
                local tail = mw.ustring.sub(decomposed, 2)
                buc = mw.ustring.toNFC(mw.ustring.upper(head) .. tail)
            end
        end
        table.insert(buc_syllables, buc)
    end

    -- Join syllables: ", " after a comma, " " after a space marker, "-" otherwise
    local pieces = {}
    local last_index = #buc_syllables
    for idx, buc in ipairs(buc_syllables) do
        pieces[#pieces + 1] = buc
        if idx < last_index then
            local info = infos[idx]
            local separator = "-"
            if info.comma_after then
                separator = ", "
            elseif info.space_after then
                separator = " "
            end
            pieces[#pieces + 1] = separator
        end
    end
    return table.concat(pieces)
end
--- Parse one raw input syllable into its components.
-- Handles, in order: BUC markers (via get_syllable_markers), the leading
-- no-assimilation marker, the trailing no-sandhi marker, an optional
-- "original>changed" form, and a tone part that may be "tone-sandhitone".
-- @param syllable raw syllable string
-- @return table with fields orig_form, changed_form, tone_part, orig_initial,
--         orig_final, changed_initial, changed_final, orig_tone,
--         manual_sandhi_tone, no_sandhi, no_assimilation, capitalize,
--         space_after, comma_after, manual_buc
-- Raises an error on empty input or when the syllable cannot be parsed.
local function split_syllable(syllable)
    if not syllable or syllable == "" then
        error("Invalid syllable: " .. tostring(syllable))
    end

    local markers, body = get_syllable_markers(syllable)

    local no_assimilation = body:sub(1, 1) == SPECIAL_MARKERS.NO_ASSIMILATION
    if no_assimilation then
        body = body:sub(2)
    end

    local no_sandhi = body:sub(-1) == SPECIAL_MARKERS.NO_SANDHI
    if no_sandhi then
        body = body:sub(1, -2)
    end

    -- Separate the form(s) from the tone part, which starts at the first of 1-7 or S
    local orig_form, changed_form, tone_part
    if body:find(SPECIAL_MARKERS.MANUAL_CHANGE) then
        orig_form, changed_form, tone_part = body:match("(.-)>(.-)([1-7S]+.*)$")
    else
        orig_form, tone_part = body:match("(.-)([1-7S]+.*)$")
        changed_form = orig_form
    end
    if not orig_form or not tone_part then
        error("Invalid syllable format: " .. body)
    end

    local orig_initial, orig_final = split_initial_final({form = orig_form})
    local changed_initial, changed_final = split_initial_final({form = changed_form})

    -- "a-b" in the tone part means original tone a with manual sandhi tone b
    local orig_tone, manual_sandhi_tone
    if tone_part:find("-", 1, true) then
        orig_tone, manual_sandhi_tone = tone_part:match("^([1-7S]+)%-([1-7S]+)$")
        require("Module:debug").track('cpx-pron/manual sandhi tone')
    else
        orig_tone = tone_part
    end

    -- Tone 3 on a final in -h is the special tone S3
    if orig_tone == '3' and changed_final:sub(-1) == 'h' then
        orig_tone = 'S3'
    end

    if not (orig_initial and orig_final and orig_tone) then
        error("Unable to parse syllable: " .. body)
    end

    return {
        orig_form = orig_form,
        changed_form = changed_form,
        tone_part = tone_part,
        orig_initial = orig_initial,
        orig_final = orig_final,
        changed_initial = changed_initial,
        changed_final = changed_final,
        orig_tone = orig_tone,
        manual_sandhi_tone = manual_sandhi_tone,
        no_sandhi = no_sandhi,
        no_assimilation = no_assimilation,
        -- BUC (only for Putian)
        capitalize = markers.capitalize,
        space_after = markers.space_after,
        comma_after = markers.comma_after,
        manual_buc = markers.manual_buc
    }
end
--- Build the working record for one syllable from its parsed components.
-- `changed_tone` starts out equal to the original tone.
-- @param options table with fields `syllable` (raw string) and `is_first_syllable`
-- @return syllable info table
local function create_syllable_info(options)
    local parsed = split_syllable(options.syllable)

    local info = {}
    info.original_initial = parsed.orig_initial
    info.original_final = parsed.orig_final
    info.original_tone = parsed.orig_tone
    info.changed_initial = parsed.changed_initial
    info.changed_final = parsed.changed_final
    info.changed_tone = parsed.orig_tone -- default: original tone
    info.no_sandhi = parsed.no_sandhi
    info.no_assimilation = parsed.no_assimilation
    info.is_first_syllable = options.is_first_syllable
    info.manual_sandhi_tone = parsed.manual_sandhi_tone
    -- BUC
    info.capitalize = parsed.capitalize
    info.space_after = parsed.space_after
    info.comma_after = parsed.comma_after
    info.manual_buc = parsed.manual_buc
    return info
end
-- Syllable processing functions
--- Turn a whitespace-separated word into a list of syllable records.
-- @param options table with field `word`
-- @return array of syllable info tables; only the first has is_first_syllable = true
local function create_syllable_infos(options)
    local infos = {}
    local count = 0
    for raw_syllable in options.word:gmatch("%S+") do
        local info = create_syllable_info({
            syllable = raw_syllable,
            is_first_syllable = (count == 0)
        })
        count = count + 1
        infos[count] = info
    end
    return infos
end
--- Tidy up a syllable after initial assimilation (modifies it in place).
-- 1. A final "ⁿ" is dropped when the changed initial begins with m or n.
-- 2. "ng" + "ng" collapses to an empty initial with final "ng".
-- @param options table with field `syllable` (syllable info table)
local function post_process_nasalization(options)
    local target = options.syllable

    -- Remove duplicate nasalization
    local nasal_initial = target.changed_initial:match("^[mn]g?")
    if nasal_initial and target.changed_final:match("ⁿ$") then
        local stripped = target.changed_final:gsub("ⁿ$", "")
        target.changed_final = stripped
    end

    -- Simplify ng-initial syllables
    local both_ng = target.changed_initial == "ng" and target.changed_final == "ng"
    if both_ng then
        target.changed_initial = ""
    end
end
--- Determine the sandhi tone of a syllable from the tone that follows it.
-- @param options table with fields `curr_syllable`, `next_syllable` (may be nil)
--        and `dialect`
-- @return the manual sandhi tone if one was given; otherwise the tone from
--         `sandhi_rules`; otherwise the syllable's original tone (last syllable,
--         or no rule for this tone combination)
local function get_sandhi_tone(options)
    local curr_syllable = options.curr_syllable
    local next_syllable = options.next_syllable
    local dialect = options.dialect

    -- Handle manual tone specification
    if curr_syllable.manual_sandhi_tone then
        return curr_syllable.manual_sandhi_tone
    end

    -- Handle final syllable
    local current_tone = curr_syllable.original_tone
    if not next_syllable then
        return current_tone
    end

    -- S5 is treated as regular 5 in non-sandhi position
    local next_tone = gsub(next_syllable.original_tone, "S5", "5")

    -- Not every tone label has a rule row (e.g. S1, S4, S7); a missing row
    -- falls back to the original tone just like a missing column does,
    -- instead of failing with "attempt to index a nil value".
    local rule_row = sandhi_rules[dialect][current_tone]
    return rule_row and rule_row[next_tone] or current_tone
end
--- Fill in `changed_tone` for every syllable (modifies the records in place).
-- A syllable keeps its original tone when it carries the no-sandhi mark, is
-- followed by a comma, or is the last syllable; otherwise get_sandhi_tone decides.
-- A resulting tone 3 on a final in -h is relabelled S3.
-- @param options table with fields `dialect` and `syllable_infos`
local function apply_sandhi(options)
    local dialect = options.dialect
    local infos = options.syllable_infos

    for idx, current in ipairs(infos) do
        local following = infos[idx + 1]

        -- (a space marker alone does not block sandhi)
        local keeps_tone = current.no_sandhi or current.comma_after or not following
        if keeps_tone then
            current.changed_tone = current.original_tone
        else
            current.changed_tone = get_sandhi_tone({
                curr_syllable = current,
                next_syllable = following,
                dialect = dialect
            })
        end

        -- Special tone adjustment for glottal finals
        local ends_in_h = current.changed_final:sub(-1) == 'h'
        if current.changed_tone == '3' and ends_in_h then
            current.changed_tone = 'S3'
        end
    end
end
--- Apply initial assimilation across a word (modifies the records in place).
-- The first syllable is never changed. A later syllable is assimilated to the
-- final of the preceding one unless it carries the no-assimilation mark, the
-- preceding syllable is followed by a comma, or its initial was changed manually.
-- Every non-first syllable is then passed through post_process_nasalization.
-- @param options table with fields `dialect` and `syllable_infos`
-- @return array of the same syllable records, in order (empty for empty input)
local function apply_initial_assimilation(options)
    local dialect = options.dialect
    local syllable_infos = options.syllable_infos
    local result = {}

    -- Nothing to do for an empty word (avoids indexing a nil first syllable)
    local first = syllable_infos[1]
    if not first then
        return result
    end
    first.is_first_syllable = true
    result[1] = first

    local dialect_rules = initial_assimilation_rules[dialect]

    for i = 2, #syllable_infos do
        local prev_syllable = result[i - 1]
        local curr_syllable = syllable_infos[i]

        if not curr_syllable.no_assimilation and
            not prev_syllable.comma_after and
            curr_syllable.changed_initial == curr_syllable.original_initial then
            -- Type of the previous syllable's final
            local final_type = get_final_type({
                initial = prev_syllable.changed_initial,
                final = prev_syllable.changed_final,
                dialect = dialect
            })

            -- Special rule: after an "other" final, a b/p/d/t/z/c/s initial of a
            -- syllable whose own final is nasalized follows the nasal-final rules
            if final_type == "other_final" and
                curr_syllable.original_initial:match("^[bpdtzcs]") and
                get_final_type({
                    initial = curr_syllable.original_initial,
                    final = curr_syllable.original_final,
                    dialect = dialect
                }) == "nasalized_final" then
                final_type = "nasal_final"
            end

            -- A dialect may have no rule table for this final type (e.g. no
            -- nasalized_final table for "pt"); the initial is then left
            -- unchanged rather than crashing on a nil index.
            local rules = dialect_rules[final_type]
            curr_syllable.changed_initial =
                rules and rules[curr_syllable.original_initial] or
                curr_syllable.original_initial
        end

        post_process_nasalization({
            syllable = curr_syllable,
            dialect = dialect
        })
        result[#result + 1] = curr_syllable
    end

    return result
end
--- Spell out the pronunciation after sound changes.
-- Each syllable is written as changed initial + changed final + changed tone;
-- syllables are separated by single spaces.
-- @param syllable_infos array of syllable info tables
-- @return the joined string ("" for an empty array)
local function generate_actual_pronunciation(syllable_infos)
    local spelled = {}
    for idx, info in ipairs(syllable_infos) do
        spelled[idx] = info.changed_initial .. info.changed_final .. info.changed_tone
    end
    return table.concat(spelled, " ")
end
--- Look up the IPA value of an initial, final or tone.
-- @param options table with fields:
--   `type`    - "initials", "finals" or "tones"
--   `dialect` - dialect code
--   `value`   - romanized value to look up ("" is a valid initial)
-- @return the IPA string
-- Raises an error when a parameter is missing, the type is unknown, or the
-- value has no entry; for some common misspellings of finals the error names
-- the spelling to use instead.
local function get_ipa_value(options)
    -- Validation
    if not options.type or not options.dialect or not options.value then
        error("Missing required parameter for IPA lookup")
    end

    -- Pick the lookup table (the local is deliberately not called `table`,
    -- which would shadow the standard library)
    local lookup_tables = {
        initials = initials,
        finals = finals,
        tones = tones
    }
    local lookup = lookup_tables[options.type]
    if not lookup then
        error("Invalid lookup type: " .. options.type)
    end

    local dialect_lookup = lookup[options.dialect]
    local result = dialect_lookup and dialect_lookup[options.value]

    -- Handle variant finals
    if not result and options.type == "finals" then
        local final_variants = {
            ["au"] = "ao",
            ["iang"] = "ieng",
            ["ieu"] = "ieo",
            ["iau"] = "ieo",
            ["iao"] = "ieo",
            ["uai"] = "ue",
            ["uei"] = "ue",
            ["yoeh"] = "yeh",
            ["yoeng"] = "yeng",
            ["yor"] = "yo",
            ["yorh"] = "yoh",
            ["yorng"] = "yong"
        }
        local suggestion = final_variants[options.value]
        if suggestion then
            error(string.format(
                'Invalid final: %s. Please use "%s" instead.',
                options.value,
                suggestion
            ))
        end
    end

    -- Error if no result found; the type name is singularized ("finals" -> "final")
    if not result then
        error(string.format(
            "Invalid %s: %s",
            options.type:sub(1, -2),
            options.value
        ))
    end
    return result
end
--- Look up the IPA of a syllable's changed initial, changed final and tone.
-- When the changed tone differs from the original tone, the tone is written
-- as "<original>⁻<changed>".
-- @param options table with fields `syllable_info` and `dialect`
-- @return table {initial, final, tone} of IPA strings
local function get_ipa_components(options)
    local info = options.syllable_info
    local dialect = options.dialect

    local function lookup(kind, value)
        return get_ipa_value({type = kind, dialect = dialect, value = value})
    end

    local ipa_initial = lookup("initials", info.changed_initial)
    local ipa_final = lookup("finals", info.changed_final)
    local ipa_tone = lookup("tones", info.original_tone)

    -- Handle tone change
    if info.changed_tone ~= info.original_tone then
        local sandhi_tone = lookup("tones", info.changed_tone)
        if not sandhi_tone then
            error("Invalid sandhi tone: " .. info.changed_tone ..
                " for dialect: " .. dialect)
        end
        ipa_tone = ipa_tone .. "⁻" .. sandhi_tone
    end

    return {
        initial = ipa_initial,
        final = ipa_final,
        tone = ipa_tone
    }
end
--- Produce the superscript note showing a syllable's original initial.
-- @param options table with fields `syllable_info` and `dialect`
-- @return "" for the first syllable or when the initial did not change;
--         "<sup>(Ø-)</sup>" when the original initial was empty;
--         otherwise "<sup>(<IPA of original initial>-)</sup>"
local function get_original_initial_display(options)
    local syllable_info = options.syllable_info
    local dialect = options.dialect

    -- Only show the original initial for non-first syllables whose initial changed
    if syllable_info.is_first_syllable or
        syllable_info.original_initial == syllable_info.changed_initial then
        return ""
    end

    -- Handle empty initial case
    if syllable_info.original_initial == "" then
        return "<sup>(Ø-)</sup>"
    end

    -- Get IPA for original initial
    local ipa_initial = get_ipa_value({
        type = "initials",
        dialect = dialect,
        value = syllable_info.original_initial
    })
    return "<sup>(" .. ipa_initial .. "-)</sup>"
end
--- Render one syllable as IPA.
-- The result is: original-initial note (if any) + initial + final + tone.
-- @param options table with fields `syllable_info` and `dialect`
-- @return IPA string for the syllable
local function syllable_to_ipa(options)
    local lookup_args = {
        syllable_info = options.syllable_info,
        dialect = options.dialect
    }

    local parts = get_ipa_components(lookup_args)
    local initial_note = get_original_initial_display({
        syllable_info = options.syllable_info,
        dialect = options.dialect
    })

    return initial_note .. parts.initial .. parts.final .. parts.tone
end
-- Generate IPA for the syllables
--- Render a whole word as IPA, one space between syllables.
-- @param options table with fields `syllable_infos` and `dialect`
-- @return IPA string ("" for an empty syllable list)
-- Raises an error when `options` or `syllable_infos` is missing.
local function generate_ipa(options)
    local infos = options and options.syllable_infos
    if not infos then
        error("Missing required syllable_infos in generate_ipa")
    end

    local dialect = options.dialect
    local rendered = {}
    for idx, info in ipairs(infos) do
        rendered[idx] = syllable_to_ipa({
            syllable_info = info,
            dialect = dialect
        })
    end
    return table.concat(rendered, " ")
end
-- Process a single pronunciation entry
--- Process one pronunciation entry for each of its dialects.
-- The word is parsed once; every dialect then works on its own deep copy of
-- the syllable records, applying initial assimilation and tone sandhi before
-- the output strings are generated.
-- @param options table with fields `dialect_codes` (comma-separated string),
--        `word` and `index`
-- @return table {dialect_codes, word, index, processed}, where `processed` holds
--         one {dialect, original, actual, ipa, index[, buc]} table per dialect;
--         `buc` is only set for dialect "pt"
local function process_pronunciation(options)
    local word = options.word
    local entry_index = options.index

    local outcome = {
        dialect_codes = options.dialect_codes,
        word = word,
        processed = {},
        index = entry_index
    }

    local dialect_list = split_dialect_codes(options.dialect_codes)
    local parsed_infos = create_syllable_infos({
        word = word,
        is_first_syllable = true
    })

    for _, dialect in ipairs(dialect_list) do
        -- The phonological rules mutate the records, so work on a copy
        local working_infos = m_table.deepCopy(parsed_infos)

        local processed_syllables = apply_initial_assimilation({
            dialect = dialect,
            syllable_infos = working_infos
        })
        apply_sandhi({
            dialect = dialect,
            syllable_infos = processed_syllables
        })

        local actual = generate_actual_pronunciation(processed_syllables)
        local ipa = generate_ipa({
            syllable_infos = processed_syllables,
            dialect = dialect
        })
        local entry = {
            dialect = dialect,
            original = word,
            actual = actual,
            ipa = ipa,
            index = entry_index
        }

        -- Generate BUC only for Putian
        if dialect == "pt" then
            entry.buc = generate_buc({
                syllable_infos = processed_syllables,
                dialect = dialect,
                word = word
            })
        end

        outcome.processed[#outcome.processed + 1] = entry
    end

    return outcome
end
-- Formatting helper functions
--- Wrap text in the span used for monospace romanization.
-- @param text string to wrap
-- @return HTML string
local function font_consolas(text)
    local open_tag = '<span class="zhpron-monospace">'
    local close_tag = '</span>'
    return open_tag .. text .. close_tag
end
--- Wrap text in an IPA span, enclosed in slashes.
-- @param text IPA string to wrap
-- @return HTML string
local function font_ipa(text)
    local open_tag = '<span class="IPA">/'
    local close_tag = '/</span>'
    return open_tag .. text .. close_tag
end
--- Reduce an input string to its plain display romanization.
-- Removes manual sandhi tones, manual sound changes, special marker characters
-- and manual BUC groups, then wraps tone digits in <sup> and drops the "S" of
-- special tones.
-- @param text input romanization, or nil
-- @return exactly one value: the cleaned string ("" when `text` is nil)
local function clear_pinging_format(text)
    if not text then
        return ""
    end
    -- Assigning to a single local keeps only the string; returning the gsub
    -- chain directly would also return the substitution count of the last gsub.
    local cleaned = text:gsub("%-S?%d", "") -- remove tone sandhi
        :gsub(">[a-zⁿ]+", "") -- remove irregular sound change
        :gsub("[#*^\\]+", "") -- remove special symbols
        :gsub("{[^}]+}", "") -- remove manual BUC
        :gsub("(%d)", "<sup>%1</sup>") -- superscript tone numbers
        :gsub("S", "") -- remove "S" in special tones
    return cleaned
end
-- Output formatting functions
--- Format results as a compact debug string.
-- Each processed pronunciation becomes "<dialect>: <original>[ → <actual>] /<ipa>/"
-- (the arrow part only when actual differs from original); items are joined by ", ".
-- @param options table with field `results` (array of process_pronunciation results)
-- @return debug string
local function format_debug_output(options)
    local lines = {}
    for _, result in ipairs(options.results) do
        for _, entry in ipairs(result.processed) do
            local line = entry.dialect .. ": " .. entry.original
            -- Add actual pronunciation if different
            if entry.original ~= entry.actual then
                line = line .. " → " .. entry.actual
            end
            -- Add IPA
            line = line .. " /" .. entry.ipa .. "/"
            lines[#lines + 1] = line
        end
    end
    return table.concat(lines, ", ")
end
--- Build the wikitext label that introduces a romanization line.
-- @param options table with fields `dialect_codes` (array of codes) and
--        `include_full_title` (boolean-like)
-- @return label string; with `include_full_title` and exactly one dialect the
--         dialect's display name is placed before the romanization name
local function format_dialect_info(options)
    local names = {}
    for idx, code in ipairs(options.dialect_codes) do
        names[idx] = dialects[code] or code
    end

    local full_title = options.include_full_title
    local prefix, suffix
    if full_title then
        prefix = "\n*: <small>(<i>"
        suffix = ")</i>)</small>: "
    else
        prefix = "\n** <small>(''"
        suffix = "'')</small>"
    end

    local dialect_str = ""
    if full_title and #names == 1 then
        dialect_str = names[1] .. ", "
    end

    return prefix .. dialect_str .. "[[Wiktionary:About Chinese/Puxian Min|Pouseng Ping'ing]]" .. suffix
end
--- Format results as the brief, single-line output.
-- Shows a label (including the dialect name when exactly one dialect occurs in
-- the results) followed by the distinct cleaned romanizations in input order,
-- separated by " / ".
-- Declared local so that it does not leak into the global environment.
-- @param options table with field `results` (array of process_pronunciation results)
-- @return wikitext string
local function format_brief_output(options)
    local results = options.results
    local seen_pronunciations = {}
    local order = {}
    local seen_dialects = {}
    local dialect_count = 0
    local first_dialect = nil

    -- Collect pronunciations and dialect codes in their original order
    for _, result in ipairs(results) do
        if result.processed and #result.processed > 0 then
            local cleared_text = clear_pinging_format(result.processed[1].original)
            -- Record each cleaned pronunciation the first time it appears
            if not seen_pronunciations[cleared_text] then
                seen_pronunciations[cleared_text] = true
                order[#order + 1] = cleared_text
            end
            -- Count distinct dialect codes
            for _, processed in ipairs(result.processed) do
                if not seen_dialects[processed.dialect] then
                    seen_dialects[processed.dialect] = true
                    dialect_count = dialect_count + 1
                    first_dialect = first_dialect or processed.dialect
                end
            end
        end
    end

    local output = " " -- "Puxian Min" already written in zh-pron
    if dialect_count == 1 then
        output = output .. "<small>(<i>" .. dialects[first_dialect] .. ", "
    else
        output = output .. "<small>(<i>"
    end
    output = output .. "[[Wiktionary:About Chinese/Puxian Min|Pouseng Ping'ing]]</i>): </small>"

    -- Append the pronunciations in the original order
    if #order > 0 then
        output = output .. font_consolas(table.concat(order, " / "))
    end
    return output
end
-- Render the multi-line ("complete") listing of the processed pronunciations.
--
-- Entries that share the same original spelling, actual spelling and IPA are
-- merged into one group; each group yields a dialect line, a Pouseng Ping'ing
-- line (plus the phonetic spelling when it differs), an optional BUC line and
-- an IPA line.
--
-- @param options table with a `results` array; each result has an `index`
--   and a `processed` array of entries with `original`, `actual`, `ipa`,
--   `buc` and `dialect` fields.
-- @return wikitext string
function format_complete_output(options)
	-- Flatten all processed entries, remembering where each one came from.
	local ordered_pronunciations = {}
	for _, result in ipairs(options.results) do
		for _, processed in ipairs(result.processed) do
			local position = #ordered_pronunciations + 1
			ordered_pronunciations[position] = {
				original = processed.original,
				actual = processed.actual,
				ipa = processed.ipa,
				buc = processed.buc,
				dialect = processed.dialect,
				index = result.index,
				input_order = position
			}
		end
	end

	-- table.sort is not stable, so entries sharing an `index` need an
	-- explicit tiebreaker to keep their input order.
	table.sort(ordered_pronunciations, function(a, b)
		if a.index ~= b.index then
			return a.index < b.index
		end
		return a.input_order < b.input_order
	end)

	-- Group by pronunciation, but keep the original order.
	local grouped_by_pron = {}
	local order = {}
	for _, pron in ipairs(ordered_pronunciations) do
		local key = pron.original .. "|" .. pron.actual .. "|" .. pron.ipa
		local group = grouped_by_pron[key]
		if not group then
			group = { data = pron, dialects = {} }
			grouped_by_pron[key] = group
			order[#order + 1] = key
		end
		table.insert(group.dialects, pron.dialect)
	end

	-- Emit the groups in first-seen order.
	local output = {}
	for _, key in ipairs(order) do
		local group = grouped_by_pron[key]
		local data = group.data

		-- Dialect names (fall back to the raw code for unknown dialects).
		local dialect_names = {}
		for _, dialect in ipairs(group.dialects) do
			dialect_names[#dialect_names + 1] = dialects[dialect] or dialect
		end
		table.insert(output, "\n** <small>(<i>" .. table.concat(dialect_names, ", ") .. "</i>)</small>")

		-- Pouseng Ping'ing
		local cleared_original = clear_pinging_format(data.original)
		local cleared_actual = clear_pinging_format(data.actual)
		table.insert(output, "\n*** <small><i>[[Wiktionary:About Chinese/Puxian Min|Pouseng Ping'ing]]</i></small>: " ..
			font_consolas(cleared_original))
		if cleared_original ~= cleared_actual then
			table.insert(output, font_consolas(
				" [<small>Phonetic</small>: " .. cleared_actual) ..
				"]")
		end

		-- BUC
		-- NOTE(review): only the first entry of the group is inspected, so a
		-- group whose first dialect is not "pt" gets no BUC line -- confirm
		-- that this is intended.
		if data.dialect == "pt" and data.buc then
			-- Parentheses drop gsub's second return value (the match count)
			-- so it is not forwarded to font_consolas as an extra argument.
			table.insert(output, "\n*** <small>''[[w:Hinghwa Romanized|Báⁿ-uā-ci̍]]''</small>: " ..
				font_consolas((data.buc:gsub("%*", ""))))
		end

		-- IPA
		table.insert(output, '\n*** <small>Sinological [[Wiktionary:International Phonetic Alphabet|IPA]] ' ..
			'<sup>([[w:Pu–Xian Min|key]])</sup></small>: ' ..
			font_ipa(data.ipa))
	end
	return table.concat(output)
end
-- Main entry point.
--
-- Splits the input on "/" into items of the form `dialect_codes:word`, runs
-- each item through process_pronunciation and hands the collected results to
-- the formatter selected by `mode`.
--
-- @param text either the input string, or a table whose `args[1]` holds it
-- @param mode one of the FORMAT_MODES values; defaults to FORMAT_MODES.DEBUG
-- @return the formatted output of the selected formatter
-- Raises an error on empty input, on an item without the `codes:word` shape,
-- and on an unknown mode.
function export.rom_display(text, mode)
	if type(text) == "table" then
		text = text.args[1]
	end
	-- Parameter validation
	if not text or text == "" then
		error("Invalid input: text must be a non-empty string")
	end
	mode = mode or FORMAT_MODES.DEBUG

	local collected = {}
	local pronunciation_data = {
		results = collected,
		mode = mode
	}

	-- Process every "/"-separated item, numbering them from 1.
	local item_no = 0
	for item in text:gmatch("[^/]+") do
		item_no = item_no + 1
		local codes, word = item:match("^(.+):(.+)$")
		if not (codes and word) then
			error("Invalid input format: " .. item)
		end
		table.insert(collected, process_pronunciation({
			dialect_codes = codes,
			word = word,
			index = item_no
		}))
	end

	-- Pick the formatter that matches the requested mode.
	local formatters = {
		[FORMAT_MODES.DEBUG] = format_debug_output,
		[FORMAT_MODES.BRIEF] = format_brief_output,
		[FORMAT_MODES.COMPLETE] = format_complete_output
	}
	local formatter = formatters[mode]
	if not formatter then
		error("Unsupported mode: " .. mode)
	end
	return formatter(pronunciation_data)
end
-- BUC initial -> PSP initial. Built once instead of on every call.
local BUC_TO_PSP_INITIALS = {
	["b"] = "b", ["ch"] = "c", ["c"] = "z",
	["d"] = "d", ["g"] = "g", ["h"] = "h",
	["k"] = "k", ["l"] = "l", ["m"] = "m",
	["ng"] = "ng", ["n"] = "n", ["p"] = "p",
	["s"] = "s", ["t"] = "t", [""] = ""
}

-- BUC final -> PSP final. Keys are lowercase and tone-stripped.
-- Finals written with the "h*" marker have no entry of their own: the marker
-- is removed before the lookup, so they resolve through the bare final.
local BUC_TO_PSP_FINALS = {
	["a"] = "a",
	["aⁿ"] = "a",
	["ah"] = "ah",
	["ai"] = "ai",
	["ang"] = "ang",
	["au"] = "ao",
	["a̤"] = "e",
	["a̤ⁿ"] = "e",
	["a̤h"] = "eh",
	["e"] = "ae",
	["eh"] = "eh",
	["eng"] = "eng",
	["e̤"] = "oe",
	["e̤ⁿ"] = "oe",
	["e̤h"] = "oeh",
	["e̤ng"] = "oeng",
	["i"] = "i",
	["ih"] = "ih",
	["ing"] = "ing",
	["ia"] = "ia",
	["iaⁿ"] = "ia",
	["iah"] = "iah",
	["iang"] = "ieng",
	["iu"] = "iu",
	["o"] = "ou",
	["o̤"] = "or",
	["o̤ⁿ"] = "or",
	["o̤h"] = "orh",
	["o̤ng"] = "orng",
	["eo"] = "o",
	["eoh"] = "oh",
	["eong"] = "ong",
	["u"] = "u",
	["uh"] = "uh",
	["ua"] = "ua",
	["uaⁿ"] = "ua",
	["uah"] = "uah",
	["uang"] = "uang",
	["ui"] = "ui",
	["oi"] = "ue",
	["oiⁿ"] = "ue",
	["oih"] = "ue",
	["ṳ"] = "y",
	["ṳh"] = "yh",
	["ṳng"] = "yng",
	["io̤"] = "yo",
	["io̤ⁿ"] = "yo",
	["io̤h"] = "yoh",
	["io̤ng"] = "yong",
	["ng"] = "ng",
	["a̤u"] = "ieo",
	["a̤uⁿ"] = "ieo",
	["a̤uh"] = "ieoh"
}

-- UTF-8 byte sequences of the combining tone marks (searched in NFD text).
local BUC_MARK_ACUTE = "\204\129"         -- U+0301 COMBINING ACUTE ACCENT
local BUC_MARK_CIRCUMFLEX = "\204\130"    -- U+0302 COMBINING CIRCUMFLEX ACCENT
local BUC_MARK_VERTICAL_LINE = "\204\141" -- U+030D COMBINING VERTICAL LINE ABOVE
local BUC_MARK_MACRON = "\204\132"        -- U+0304 COMBINING MACRON

-- Convert single BUC syllable to PSP.
--
-- @param input the syllable string, or a table whose `args[1]` holds it
-- @return the PSP spelling with a trailing tone digit, in lowercase; the
--   input syllable unchanged when it is nil/empty, when its initial or final
--   is not recognised, or when the conversion raises an error.
local function syllable_to_psp(input)
	-- Handle input parameter
	local syllable
	if type(input) == "table" then
		syllable = input.args[1]
	else
		syllable = input
	end
	if not syllable or syllable == "" then
		return syllable
	end

	-- Best effort: any error during conversion yields the original syllable.
	local success, result = pcall(function()
		local decomposed = mw.ustring.toNFD(syllable)
		if not decomposed then
			return syllable
		end
		-- Lower-case up front so capitalised syllables ("Ang", "Ng") hit the
		-- lowercase-keyed tables; the result is lowercase in any case.
		decomposed = mw.ustring.lower(decomposed)

		-- Determine the tone and strip its mark.
		local tone
		if decomposed:find(BUC_MARK_ACUTE, 1, true) then
			tone = "2"
			decomposed = decomposed:gsub(BUC_MARK_ACUTE, "")
		elseif decomposed:find(BUC_MARK_CIRCUMFLEX, 1, true) then
			tone = "3"
			decomposed = decomposed:gsub(BUC_MARK_CIRCUMFLEX, "")
		elseif decomposed:find(BUC_MARK_VERTICAL_LINE, 1, true) then
			if decomposed:find("h%*$") then
				-- Special case: vertical line with an -h* ending -> tone 2
				tone = "2"
			elseif decomposed:find("h$") then
				tone = "7"
			else
				tone = "4"
			end
			decomposed = decomposed:gsub(BUC_MARK_VERTICAL_LINE, "")
		elseif decomposed:find(BUC_MARK_MACRON, 1, true) then
			tone = "5"
			decomposed = decomposed:gsub(BUC_MARK_MACRON, "")
		elseif decomposed:find("h$") then
			-- No tone mark, plain -h ending
			tone = "6"
		else
			-- No tone mark, no plain -h ending (this includes -h*)
			tone = "1"
		end

		-- Recompose so the remaining diacritics match the table keys.
		local normalized = mw.ustring.toNFC(decomposed)
		if not normalized then
			return syllable
		end

		-- Special case: standalone `ng` syllable after tone removal
		if normalized == "ng" then
			return "ng" .. tone
		end

		-- Extract the initial, trying the digraphs first. Only consonants
		-- with a PSP equivalent are accepted, so an unknown initial leads to
		-- a failed final lookup rather than being silently dropped.
		local initial = normalized:match("^ch")
			or normalized:match("^ng")
			or normalized:match("^[bcdghklmnpst]")
			or ""

		-- Remove the -h* marker (it affects the tone, not the final lookup).
		-- Parentheses keep only gsub's first return value.
		local final = (normalized:sub(#initial + 1):gsub("h%*$", ""))

		local psp_final = BUC_TO_PSP_FINALS[final]
		if not psp_final then
			return syllable
		end
		return BUC_TO_PSP_INITIALS[initial] .. psp_final .. tone
	end)

	-- Return original syllable if conversion failed
	return success and result or syllable
end
-- Convert BUC to PSP (both single syllable and text).
--
-- Every maximal run of characters that is not a delimiter (whitespace,
-- hyphen, or one of the listed ASCII / full-width punctuation marks) is
-- passed to syllable_to_psp; delimiters are left in place.
--
-- @param input the text, or a table whose `args[1]` holds it
-- @return the converted text; nil or "" is returned unchanged
function export.buc_to_psp(input)
	-- Handle input parameter
	local text
	if type(input) == "table" then
		text = input.args[1]
	else
		text = input
	end
	if not text or text == "" then
		return text
	end
	-- mw.ustring.gsub works on code points, so the multibyte delimiters in
	-- the class are treated as single characters. The outer parentheses drop
	-- gsub's second return value (the substitution count).
	return (mw.ustring.gsub(text, "[^%s%-%.,;:!%?,。;:!?「」『』、]+", syllable_to_psp))
end
return export