Module:id-pron/sandbox

This module lacks a documentation subpage. Please create it.
Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)
local export = {}

local m_IPA = require("Module:IPA")
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local put_module = "Module:parse utilities"
local set_utilities_module = "Module:set utilities"
local headword_data_module = "Module:headword/data"
local accent_qualifier_module = "Module:accent qualifier"
local accent_qualifier_data_module = "Module:accent qualifier/data"
local rhymes_module = "Module:rhymes"
local hyphenation_module = "Module:hyphenation"

local lang = require("Module:languages").getByCode("id")

local maxn = table.maxn
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rsplit = m_str_utils.split
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local trim = mw.text.trim
local u = m_str_utils.char
local ulen = m_str_utils.len
local ulower = m_str_utils.lower

local AC = u(0x0301) -- acute =  ́
local GR = u(0x0300) -- grave =  ̀
local CFLEX = u(0x0302) -- circumflex =  ̂
local MAC = u(0x0304) -- macron
local BR = u(0x0306) -- breve = ˘

local vowel = "aeéèioòuəɛɔ" -- vowel
local V = "[" .. vowel .. "]"
local NV = "[^" .. vowel .. "]"

local accent = AC .. GR .. MAC .. BR
local accent_c = "[" .. accent .. "]"
local stress_c = "[" .. MAC .. BR .. "]"
local ipa_stress = "ˈ"
local ipa_stress_c = "[" .. ipa_stress .. "]"

local separator =  "# ." 
local separator_c = "[" .. separator .. "]"
local C = "[^" .. vowel .. separator .. "]" -- consonant

local unstressed_words = require("Module:table").listToSet({ --feel free to add more unstressed words
	"di", "ké", -- prepositions
	"dan", -- conjunctions
	"ku", "mu", "nya", -- pronouns
})

-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end

-- version of rsubn() that returns a 2nd argument boolean indicating whether
-- a substitution was made.
local function rsubb(term, foo, bar)
	local retval, nsubs = rsubn(term, foo, bar)
	return retval, nsubs > 0
end

-- apply rsub() repeatedly until no change
local function rsub_repeatedly(term, foo, bar)
	while true do
		local new_term = rsub(term, foo, bar)
		if new_term == term then
			return term
		end
		term = new_term
	end
end

-- ĵ, ɟ and ć are used internally to represent [d͡ʒ], [j] and [t͡ʃ]
--

function export.IPA(text, phonetic)

	local debug = {}
	text = ulower(text or mw.title.getCurrentTitle().text)
	-- decompose everything but é, è
	text = mw.ustring.toNFD(text)
	text = rsub(text, "." .. "[" .. AC .. CFLEX .. GR .. "]", {
		["e" .. AC] = "é",
		["e" .. GR] = "è",
		["o" .. GR] = "ò", -- O as in the Javanese place names "Solo", "Purwokerto", "Probolinggo"
	})

	-- convert commas and en/en dashes to IPA foot boundaries
	text = rsub(text, "%s*[,–—]%s*", " | ")
	-- question mark or exclamation point in the middle of a sentence -> IPA foot boundary
	text = rsub(text, "([^%s])%s*[!?]%s*([^%s])", "%1 | %2")

	-- canonicalize multiple spaces and remove leading and trailing spaces
	local function canon_spaces(text)
		text = rsub(text, "%s+", " ")
		text = rsub(text, "^ ", "")
		text = rsub(text, " $", "")
		return text
	end

	text = canon_spaces(text)

	-- Make prefixes unstressed unless they have an explicit stress marker; also make certain
	-- monosyllabic words (e.g. [[di]], [[ke]], [[se-]], [[ban]], etc.) without stress marks be
	-- unstressed.
	local words = rsplit(text, " ")
	for i, word in ipairs(words) do
		if rfind(word, "%-$") and not rfind(word, accent_c) or unstressed_words[word] then
			-- add BR to the last vowel not the first one
			-- adding the BR after the 'u'
			words[i] = rsub(word, "^(.*" .. V .. ")", "%1" .. BR)
		end
	end
	text = table.concat(words, " ")

	-- Convert hyphens to spaces
	text = rsub(text, "%-", " ")
	-- canonicalize multiple spaces again, which may have been introduced by hyphens
	text = canon_spaces(text)
	-- now eliminate punctuation
	text = rsub(text, "[!?']", "")
	-- put # at word beginning and end and double ## at text/foot boundary beginning/end
	text = rsub(text, " | ", "# | #")
	text = "##" .. rsub(text, " ", "# #") .. "##"

	table.insert(debug, text)

	--"i" or "u" to glide (as part of a diphthong)
	text = rsub(text, "(" .. V .. ")i([#.])", "%1ɟ%2")
	text = rsub(text, "(" ..V.. ")u([#.])", "%1w%2")

    -- syllable-initial X (e.g. in [[xenofobia]], [[xenon]], [[xilofon]])
	text = rsub(text, "x("..V..")", "s%1")

	-- handle certain combinations; kh, ng, ny and sy handling needs to go first
	text = rsub(text, "kh", "x")
	text = rsub(text, "ng", "ŋ")
	text = rsub(text, "ny", "ɲ")
	text = rsub(text, "sy", "ʃ")
	
	table.insert(debug, text)

	--alphabet-to-phoneme
	text = rsub(text, "[ceéègjòqvy]",
	--["g"]="ɡ":  U+0067 LATIN SMALL LETTER G → U+0261 LATIN SMALL LETTER SCRIPT G
		{ ["c"] = "ć", ["e"] = "ə", ["é"] = "e", ["è"] = "ɛ", ["g"] = "ɡ", ["j"] = "ĵ", ["ò"] = "ɔ", ["q"] = "k", ["y"] = "j" })	

	-- glottal stop. use also to replace "k" when this corresponds to it
	text = rsub(text, "7", "ʔ")

	table.insert(debug, text)

	--syllable division
	local vowel_to_glide = { ["i"] = "j", ["u"] = "w" }
	-- i, o and u between vowels -> j and u e.g. [[rangkaian]])
	text = rsub_repeatedly(text, "(" .. V .. ")([iu])(" .. V .. ")",
			function(v1, iu, v2)
				return v1 .. vowel_to_glide[iu] .. v2
			end
	)

	text = rsub_repeatedly(text, "(" .. V .. accent_c .."*)(" .. C ..  V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .."*" .. C .. ")(" .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .."*" .. C .. "+)(" .. C .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. C .. ")%.s(" .. C .. ")", "%1s.%2")
	text = rsub_repeatedly(text, "([aeiouɛɔ]" .. accent_c .. "*)([aeiouɛɔ])", "%1.%2")

	table.insert(debug, text)

	local accent_to_stress_mark = { [MAC] = "ˈ", [BR] = "" }

	local function accent_word(word, syllables)
		-- Now stress the word. If any accent exists in the word (including breves indicating an unaccented word),
		-- put the stress mark(s) at the beginning of the indicated syllable(s). Otherwise, apply the default
		-- stress rule.
		if rfind(word, accent_c) then
			for i = 1, #syllables do
				syllables[i] = rsub(syllables[i], "^(.*)(" .. accent_c .. ")(.*)$",
						function(pre, accent, post)
							return accent_to_stress_mark[accent] .. pre .. post
						end
				)
			end
		else

			-- Default stress rule. Words without vowels (e.g. IPA foot boundaries) don't get stress.
			if #syllables > 1 and (rfind(word, "[^aəeéèioòuɛɔʔbcdfgɡhjɟĵklmnŋɲpqrstvwxz#]#")) or #syllables == 1 and rfind(word, V) then
				syllables[#syllables] = "ˈ" .. syllables[#syllables]
			elseif #syllables <= 2 and rfind(word, "[ə]") then
				syllables[#syllables] = "ˈ" .. syllables[#syllables]
			elseif #syllables >= 3 and rfind(word, "[ə]") then
				syllables[#syllables - 1] = "ˈ" .. syllables[#syllables - 1]
			elseif #syllables > 1 then
				syllables[#syllables - 1] = "ˈ" .. syllables[#syllables - 1]
			end
		end
	end


	local words = rsplit(text, " ")
	for j, word in ipairs(words) do

		local syllables = rsplit(word, "%.")

			accent_word(word, syllables)

		-- Reconstruct the word.
		words[j] = table.concat(syllables, phonetic and "." or "")
	end

	text = table.concat(words, " ")

	-- suppress syllable mark before IPA stress indicator
	text = rsub(text, "%.(" .. ipa_stress_c .. ")", "%1")

	table.insert(debug, text)

    local id_IPA_table = {
    	["phonetic"] = text,
    	["phonemic"] = text
    }

	for key, value in pairs(id_IPA_table) do
		text = id_IPA_table[key]

		--phonetic transcription
		if key == "phonetic" then

       	table.insert(debug, text)

        --phonemic diphthongs
		text = rsub(text, "([aeou])([ɟj])([#.ˈ])", "%1i̯%3")
		text = rsub(text, "([a])w([#.ˈ])", "%1u̯%2")

       	table.insert(debug, text)

        --change e, i, u in closed final syllables
	    text = rsub(text, "([bćdfhjĵɟklmnɲŋprsʃtwz])e([bćdfhjĵɟklmnɲŋprstwz])([#])","%1ɛ%2%3")
	    text = rsub(text, "([bćdfhjĵɟklmnɲŋprsʃtwz])i([bćdfhjĵɟklmnɲŋprstwz])([#])","%1ɪ%2%3")
	    text = rsub(text, "([bćdfhjĵɟklmnɲŋprsʃtwz])u([bćdfhjĵɟklmnɲŋprstwz])([#])","%1ʊ%2%3")

       	table.insert(debug, text)

        --i, u in closed stressed syllables with nasal coda
	    text = rsub(text, "([ˈ])([bćdfhjĵɟklmnɲŋprsʃtwz])ɪ([mnŋ])([.#])","%1%2i%3%4")
	    text = rsub(text, "([ˈ])([bćdfhjĵɟklmnɲŋprsʃtwz])ʊ([mnŋ])([.#])","%1%2u%3%4")

       	table.insert(debug, text)

	    --devoice final B, D an G
	    text = rsub(text, "b([#.ˈ])","p̚%1")
	    text = rsub(text, "d([#.ˈ])","t̚%1")
	    text = rsub(text, "ɡ([#.ˈ])","k̚%1")

        --/n/ and /ŋ/ sandhi
	    text = rsub(text,"([nŋ])([# .]*[bpm])", "m%2")
	    text = rsub(text,"([ŋ])([ˈˌ# .]*[dlstz])","n%2")
	    text = rsub(text,"([n])([ˈˌ# .]*[ćĵʃ])","ɲ%2")

        --final K to glottal stop
	    text = rsub(text, "k([#.ˈ])","ʔ%1")

	    --dental T
	    text = rsub(text, "t","t̪")

	    --V to F
	    text = rsub(text, "v","f")

	    mw.log(text)
	end

	table.insert(debug, text)

	-- convert fake symbols to real ones
    	local final_conversions = {
		["ć"] = "t͡ʃ", -- fake "c" to real "c"
		["ɟ"] =  "j", -- fake "i" to real "i"
        ["ĵ"] = "d͡ʒ" -- fake "j" to real "j"
	}

		text = rsub(text, "[ĉɟĵ]", final_conversions)

		-- Do not have multiple syllable break consecutively
		text = rsub_repeatedly(text, "([.]+)", ".")
    	text = rsub_repeatedly(text, "([.]?)(‿)([.]?)", "%2")

    	-- remove # symbols at word and text boundaries
		text = rsub_repeatedly(text, "([.]?)#([.]?)", "")

		-- resuppress syllable mark before IPA stress indicator
		text = rsub(text, "%.(" .. ipa_stress_c .. ")", "%1")
		text = rsub_repeatedly(text, "([.]?)(" .. ipa_stress_c .. ")([.]?)", "%2")

    	id_IPA_table[key] = toNFC(text)
	end

	return id_IPA_table
end

function export.show(frame)
	local params = {
		[1] = {},
		["pre"] = {},
		["bullets"] = {type = "number", default = 1},
	}

	local parargs = frame:getParent().args
	local args = require("Module:parameters").process(parargs, params)

	local results = {}

	local text = args[1] or mw.title.getCurrentTitle().text

	local IPA_result = export.IPA(text)
	table.insert(results, { pron = "/" .. IPA_result["phonemic"] .. "/" })
	table.insert(results, { pron = "[" .. IPA_result["phonetic"] .. "]" })

	local pre = args.pre and args.pre .. " " or ""
	local bullet = (args.bullets ~= 0) and "* " or ""

	return bullet .. pre .. m_IPA.format_IPA_full { lang = lang, items = results }
end


local function parse_gloss(arg)
	local poses, gloss
	if arg:find("%^") then
		poses, gloss = arg:match("^(.-)%^(.*)$")
		if gloss == "" then
			gloss = nil
		end
	else
		gloss = arg
	end
	if poses then
		poses = split_on_comma(poses)
		local m_headword_data = mw.loadData(headword_data_module)
		for i, pos in ipairs(poses) do
			poses[i] = m_headword_data.pos_aliases[pos] or pos
		end
	end
	return {
		poses = poses,
		gloss = gloss,
	}
end


-- Parse a raw accent spec, which is one or more comma-separated accents, each of which may be aliases listed in the
-- accent data in [[Module:accent qualifier/data]]. FIXME: The separate accent qualifier data will be going away and
-- merged into label data, at which point we'll have to rewrite this.
local function parse_accents(arg)
	-- Accent group processing
	local accent_data = mw.loadData(accent_qualifier_data_module)

	-- Split on commas and canonicalize aliases.
	local accents = rsplit(arg, "%s*,%s*")
	for i, alias in ipairs(accents) do
		if accent_data.aliases[alias] then
			accents[i] = accent_data.aliases[alias]
		end
	end

	return accents
end


-- Return the number of syllables of a phonemic or phonetic representation, which should have syllable dividers in it
-- but no hyphens.
local function get_num_syl_from_ipa(pron)
	-- Maybe we should just count vowels instead of the below code.
	pron = rsub(pron, "|", " ") -- remove IPA foot boundaries
	local words = rsplit(pron, " +")
	for i, word in ipairs(words) do
		-- IPA stress marks are syllable divisions if between characters; otherwise just remove.
		word = rsub(word, "(.)[ˌˈ](.)", "%1.%2")
		word = rsub(word, "[ˌˈ]", "")
		words[i] = word
	end
	-- There should be a syllable boundary between words.
	pron = table.concat(words, ".")
	return ulen(rsub(pron, "[^.]", "")) + 1
end


-- Get the rhyme by truncating everything up through the last stress mark + any following consonants, and remove
-- syllable boundary markers.
local function convert_phonemic_to_rhyme(phonemic)
	-- NOTE: This works because the phonemic vowels are just [aeiou] possibly with diacritics that are separate
	-- Unicode chars. If we want to handle things like ɛ or ɔ we need to add them to `vowel`.
	phonemic = rsplit(phonemic, " ")
	phonemic = phonemic[#phonemic]
	return rsub(rsub(phonemic, ".*[ˌˈ]", ""), "^" .. NV .. "*", ""):gsub("%.", "")
end


local function split_syllabified_spelling(spelling)
	return rsplit(spelling, "%.")
end


-- "Align" syllabified respelling `syllab` to original spelling `spelling` by matching character-by-character, allowing
-- for extra syllable and accent markers in the syllabification and certain mismatches in the consonants. The goal is to
-- produce the appropriately syllabified version of the original spelling (the pagename) by matching characters in the
-- syllabified respelling to the original spelling, putting the syllable boundaries in the appropriate places in the
-- original spelling. As an example, given syllabified respelling 'a.ma.7ín' and original spelling 'amain', we would
-- like to produce 'a.ma.in'.
--
-- If we encounter an extra syllable marker (.), we allow and keep it. If we encounter an extra accent marker in the
-- syllabification, we drop it. We allow for mismatches in capitalization and for certain other mismatches, e.g. extra
-- glottal stops (written 7), h in respelling vs. g or j in the original, etc. If we can't match, we return nil
-- indicating the alignment failed.
local function align_syllabification_to_spelling(syllab, spelling)
	local result = {}
	local function concat_result()
		-- Postprocess to remove dots (syllable boundaries) next to hyphens.
		return (toNFC(table.concat(result)):gsub("%.%-", "-"):gsub("%-%.", "-"))
	end
	-- Remove glottal stop (7) from respelling to simplify the code below, because it's never found in the original
	-- spelling. (FIXME: We should do the same for diacritics, but they're currently removed earlier, in
	-- syllabify_from_spelling(). We should probably get rid of the removal there and put it here.)
	syllab = decompose(syllab):gsub("7", "")
	spelling = decompose(spelling)
	local syll_chars = rsplit(ulower(syllab), "")
	local spelling_chars = rsplit(spelling, "")
	local i = 1
	local j = 1
	local function matches(uci, ucj)
		-- Return true if a syllabified respelling character (uci) matches the corresponding spelling char (ucj).
		-- Both uci and ucj should be lowercase.
		return uci == ucj or
			uci == "h" and (ucj == "g" or ucj == "j" or ucj == "x") or
			uci == "j" and ucj == "g" or
			uci == "y" and ucj == "i" or
			uci == "w" and ucj == "u"
	end
	local function silent_spelling_letter(ucj)
		return ucj == "h" or ucj == "'" or ucj == "-"
	end
	local function syll_at(pos)
		return syll_chars[pos] or ""
	end
	local function spell_at(pos)
		return spelling_chars[pos] or ""
	end
	local function uspell_at(pos)
		local c = spelling_chars[pos]
		return c and ulower(c) or ""
	end
	while i <= #syll_chars or j <= #spelling_chars do
		local uci = syll_at(i)
		local cj = spell_at(j)
		local ucj = uspell_at(j)
		if uci == "g" and syll_at(i - 1) == "n" and syll_at(i + 1) == "." and matches(syll_at(i + 2), ucj) and
			not matches(syll_at(i + 2), uspell_at(j + 1)) then
			-- As a special case, before checking whether the corresponding characters match, we have to skip an extra
			-- g in an -ng- sequence in the syllabified respelling if the corresponding spelling character matches the
			-- next respelling character (taking into account the syllable boundary). This is so that e.g.
			-- syll='ba.rang.gay' matches spelling='barangay'. Otherwise we will match the first respelling g against
			-- the spelling g and the second respelling g won't match. A similar case occurs with
			-- syll='E.vang.he.lis.ta' and spelling='Evangelista'. But we need an extra condition to not do this hack
			-- when syll='ba.rang.gay' matches spelling='baranggay'.
			i = i + 1
		elseif matches(uci, ucj) then
			table.insert(result, cj)
			i = i + 1
			j = j + 1
		elseif ucj == uspell_at(j - 1) and uci == "." and ucj ~= syll_at(i + 1) then
			-- See below. We want to allow for a doubled letter in spelling that is pronounced single, and preserve the
			-- doubled letter. But it's tricky in the presence of syllable boundaries on both sides of the doubled
			-- letter as well as doubled letters pronounced double. Specifically, there are three possibilities,
			-- exemplified by:
			-- (1) syll='Mal.lig', spelling='Mallig' -> 'Mal.lig';
			-- (2) syll='Ma.lig', spelling='Mallig' -> 'Ma.llig';
			-- (3) syll='Wil.iam', spelling='William' -> 'Will.iam'.
			-- If we copy the dot first, we get (1) and (2) right but not (3).
			-- If we copy the double letter first, we get (2) and (3) right but not (1).
			-- We choose to copy the dot first except in the situation exemplified by (3), where we copy the doubled
			-- letter first. The condition above handles (3) (the doubled letter matches against a dot) while not
			-- interfering with (1) (where the doubled letter also matches against a dot but the next letter in the
			-- syllabification is the same as the doubled letter, because the doubled letter is pronounced double).
			table.insert(result, cj)
			j = j + 1
		elseif silent_spelling_letter(ucj) and uci == "." and ucj ~= syll_at(i + 1) and
			not rfind(uspell_at(j + 1), V) then
			-- See below for apostrophe in spelling. This condition is parallel to the one directly above
			-- for silent doubled letters in spelling and handles the case of syllab='Abduramán', spelling='Abdurahman',
			-- which should be syllabified 'Ab.du.rah.man'. But we need a check to see that the next spelling character
			-- isn't a vowel, because in that case we want the silent letter to go after the period, e.g.
			-- syllab='Jumu7á', spelling='Jumu'ah' -> 'Ju.mu.'ah' (the 7 is removed above).
			table.insert(result, cj)
			j = j + 1
		elseif uci == "." then
			table.insert(result, uci)
			i = i + 1
		elseif ucj == uspell_at(j - 1) then
			-- A doubled letter in spelling that is pronounced single. Examples:
			-- * syllab='Ab.dur.rah.man', spelling='Abdurrahman' -> 'Ab.du.rrah.man' (with r)
			-- * syllab='a.sa.la.mu a.lai.kum', spelling='assalamu alaikum' -> 'as.sa.la.mu a.lai.kum' (with s)
			-- * syllab='Tal.lo', spelling='Tallo' -> 'Ta.llo' (with ll)
			-- * syllab='Ha.sa.nu.din', spelling='Hasanuddin' -> 'Ha.sa.nu.din' (with b)
			-- * syllab='Ka.ba', spelling='Kaaba' -> 'Kaa.ba' (with a)
			table.insert(result, cj)
			j = j + 1
		elseif silent_spelling_letter(ucj) then
			-- A silent h, apostrophe or hyphen in spelling. Examples:
			-- * syllab='Ramadān', spelling='Ramadhan' -> 'Ra.ma.dhan'
			table.insert(result, cj)
			j = j + 1
		elseif uci == AC or uci == GR or uci == CFLEX or uci == DIA or uci == TILDE or uci == MACRON or
			uci == "y" or uci == "w" then
			-- skip character
			i = i + 1
		else
			-- non-matching character
			mw.log(("Syllabification alignment mismatch for pagename '%s' (position %s, character %s), syllabified respelling '%s' (position %s, character %s), aligned result so far '%s'"
				):format(spelling, j, ucj, syllab, i, uci, concat_result()))
			return nil
		end
	end
	if i <= #syll_chars or j <= #spelling_chars then
		-- left-over characters on one side or the other
		mw.log(("Syllabification alignment mismatch for pagename '%s' (%s), syllabified respelling '%s' (%s), aligned result so far '%s'"
			):format(
				spelling, j > #spelling_chars and "end of string" or ("position %s, character %s"):format(j, uspell_at(j)),
				syllab, i > #syll_chars and "end of string" or ("position %s, character %s"):format(i, syll_at(i)),
				concat_result()))
		return nil
	end
	return concat_result()
end


local function generate_syll_obj(term)
	return {syllabification = term, hyph = split_syllabified_spelling(term)}
end


-- Word should already be decomposed.
local function word_has_vowels(word)
	word = ulower(word)
	return rfind(word, V) or word:find("y")
end


local function any_words_have_vowels(term)
	local words = rsplit(decompose(term), "[ %-]")
	for i, word in ipairs(words) do
		-- Allow empty word; this occurs with prefixes and suffixes.
		if word_has_vowels(word) then
			return true
		end
	end
	return false
end


local function should_generate_rhyme_from_respelling(term)
	local words = rsplit(decompose(term), " +")
	local last_word = words[#words]
	local should_generate_cat = #words == 1
	local should_generate_rhyme =
		not last_word:find("%-$") and -- no if word is a prefix
		not (last_word:find("^%-") and last_word:find(MACRON)) and -- no if word is an unstressed suffix
		word_has_vowels(last_word) -- no if word has no vowels (e.g. a single letter)
	return should_generate_rhyme, should_generate_cat
end


local function should_generate_rhyme_from_ipa(ipa)
	local should_generate_cat = not ipa:find("%s")
	local should_generate_rhyme = word_has_vowels(decompose(ipa))
	return should_generate_rhyme, should_generate_cat
end


local function should_generate_rhyme_from_termobj(termobj)
	if termobj.raw then
		return should_generate_rhyme_from_ipa(termobj.raw_phonemic or termobj.raw_phonetic)
	else
		return should_generate_rhyme_from_respelling(termobj.term)
	end
end


local function process_specified_rhymes(rhymes, sylls, parsed_respellings)
	local rhyme_ret = {}
	for _, rhyme in ipairs(rhymes) do
		local num_syl = rhyme.num_syl
		local no_num_syl = false

		-- If user explicitly gave the rhyme but didn't explicitly specify the number of syllables, try to take it from
		-- the syllabification.
		if not num_syl then
			num_syl = {}
			for _, syll in ipairs(sylls) do
				if should_generate_rhyme_from_respelling(syll.syllabification) then
					local this_num_syl = 1 + ulen(rsub(syll.syllabification, "[^.]", ""))
					m_table.insertIfNot(num_syl, this_num_syl)
				else
					no_num_syl = true
					break
				end
			end
			if no_num_syl or #num_syl == 0 then
				num_syl = nil
			end
		end

		-- If that fails and term is single-word, try to take it from the phonemic.
		if not no_num_syl and not num_syl then
			for _, parsed in ipairs(parsed_respellings) do
				for _, pronun in ipairs(parsed.pronuns) do
					-- Check that pronun.phonemic exists (it may not if raw phonetic-only pronun is given), and rhyme
					-- isn't suppressed (which may happen if the term has a qualifier "colloquial", "obsolete" or the
					-- like or is an auto-generated "glottal stop elision" pronunciation).
					if pronun.phonemic and not pronun.no_rhyme then
						if not should_generate_rhyme_from_ipa(pronun.phonemic) then
							no_num_syl = true
							break
						end
						-- Count number of syllables by looking at syllable boundaries (including stress marks).
						local this_num_syl = get_num_syl_from_ipa(pronun.phonemic)
						m_table.insertIfNot(num_syl, this_num_syl)
					end
				end
				if no_num_syl then
					break
				end
			end
			if no_num_syl or #num_syl == 0 then
				num_syl = nil
			end
		end

		local rhymeobj = m_table.shallowcopy(rhyme)
		rhymeobj.num_syl = num_syl
		table.insert(rhyme_ret, rhymeobj)
	end
end


-- Parse a pronunciation modifier in `arg`, the argument portion in an inline modifier (after the prefix), which
-- specifies a pronunciation property such as rhyme, syllabification, homophones or audio. The argument can itself have
-- inline modifiers, e.g. <audio:Foo.ogg<a:Jakarta>>. The allowed inline modifiers are specified by `param_mods` (of
-- the format expected by `parse_inline_modifiers()`); in addition to any modifiers specified there, the modifiers
-- <q:...>, <qq:...>, <a:...> and <aa:...> are always accepted (and can be repeated). `generate_obj` and `parse_err` are
-- like in `parse_inline_modifiers()` and specify respectively a function to generate the object into which modifier
-- properties are stored given the non-modifier part of the argument, and a function to generate an error message (given
-- the message). Normally, a comma-separated list of pronunciation properties is accepted and parsed, where each element
-- in the list can have its own inline modifiers and where no spaces are allowed next to the commas in order for them to
-- be recognized as separators. If `no_split_on_comma` is given, only a single pronunciation property is accepted. If
-- `has_outer_container` is given, the list of pronunciation properties is embedded in the `terms` property of an outer
-- container, into which other list-level modifiers can also be stored (by setting `overall = "true"` in the respective
-- spec in `param_mods`). The return value is a list if neither `no_split_on_comma` nor `has_outer_container` are given,
-- otherwise a container object (which, in the case of `has_outer_container`, will contain a list inside of it, in the
-- `terms` property).
local function parse_pron_modifier(arg, parse_err, generate_obj, param_mods, no_split_on_comma, has_outer_container)
	if arg:find("<") then
		local insert = { store = "insert" }
		param_mods.q = insert
		param_mods.qq = insert
		param_mods.a = insert
		param_mods.aa = insert
		return require(put_module).parse_inline_modifiers(arg, {
			param_mods = param_mods,
			generate_obj = generate_obj,
			parse_err = parse_err,
			splitchar = not no_split_on_comma and "," or nil,
			outer_container = has_outer_container and {} or nil,
		})
	elseif no_split_on_comma then
		return generate_obj(arg)
	else
		local retval = {}
		for _, term in ipairs(split_on_comma(arg)) do
			table.insert(retval, generate_obj(term))
		end
		if has_outer_container then
			retval = {
				terms = retval,
			}
		end
		return retval
	end
end


local function parse_rhyme(arg, parse_err)
	local function generate_obj(term)
		return {rhyme = term}
	end
	local param_mods = {
		s = {
			item_dest = "num_syl",
			convert = function(arg, parse_err)
				local nsyls = rsplit(arg, ",")
				for i, nsyl in ipairs(nsyls) do
					if not nsyl:find("^[0-9]+$") then
						parse_err("Number of syllables '" .. nsyl .. "' should be numeric")
					end
					nsyls[i] = tonumber(nsyl)
				end
				return nsyls
			end,
		},
	}

	return parse_pron_modifier(arg, parse_err, generate_obj, param_mods)
end


local function parse_syll(arg, parse_err)
	local param_mods = {
		cap = { overall = true},
	}

	-- We need to pass in has_outer_container because we have an overall property <cap:...> (the caption, defaulting
	-- to "Syllabification") applying to the whole set of syllabifications.
	return parse_pron_modifier(arg, parse_err, generate_syll_obj, param_mods, nil, "has outer container")
end


local function parse_homophone(arg, parse_err)
	local function generate_obj(term)
		return {term = term}
	end
	local param_mods = {
		t = {
			-- We need to store the <t:...> inline modifier into the "gloss" key of the parsed term,
			-- because that is what [[Module:links]] (called from [[Module:homophones]]) expects.
			item_dest = "gloss",
		},
		gloss = {},
		pos = {},
		alt = {},
		lit = {},
		id = {},
		g = {
			-- We need to store the <g:...> inline modifier into the "genders" key of the parsed term,
			-- because that is what [[Module:links]] (called from [[Module:homophones]]) expects.
			item_dest = "genders",
			convert = function(arg)
				return rsplit(arg, ",")
			end,
		},
	}

	return parse_pron_modifier(arg, parse_err, generate_obj, param_mods)
end


local function generate_audio_obj(arg)
	local file, gloss = arg:match("^(.-)%s*#%s*(.*)$")
	if not file then
		file = arg
		gloss = "Audio"
	end
	return {file = file, gloss = gloss}
end


local function parse_audio(arg, parse_err)
	-- None other than qualifiers
	local param_mods = {}

	-- Don't split on comma because some filenames have embedded commas not followed by a space (typically followed by
	-- an underscore).
	return parse_pron_modifier(arg, parse_err, generate_audio_obj, param_mods, "no split on comma")
end

return export
Module:id-pron/sandbox

Navigation menu

Search