Jump to content

Module:User:Suzukaze-c/zh-extract

From Wiktionary, the free dictionary

{{#invoke:User:Suzukaze-c/zh-extract|extract_roman|度}}

{{#invoke:User:Suzukaze-c/zh-extract|extract_roman|度|yes}}

{{#invoke:User:Suzukaze-c/zh-extract|extract_roman|蘋|yes}}


-- Table of functions made available by this module; returned at the end of the file.
local export = {}
-- Short aliases for the mw.ustring and mw.text library functions used below.
local replace = mw.ustring.gsub
local match = mw.ustring.match
local itermatch = mw.ustring.gmatch
local split = mw.text.split

-- idea: +simplified? since we're pulling stuff from the page anyway

-- Separator placed between the values of different {{zh-pron}} instances when
-- extract_roman combines them and the caller supplies no set_separator.
local default_set_separator = '//'

--- Extract the named parameters of the {{zh-pron}} template(s) on an entry.
--
-- Can be called in two ways:
--   * from Lua: extract_roman(word, combine, set_separator), returning a table;
--   * via {{#invoke:}}: the first argument is then a table whose `args` field
--     supplies word, combine and set_separator as args[1], args[2], args[3],
--     and the result is returned as a text dump of the table.
--
-- The parser relies on the page's formatting: every {{zh-pron}} must end with
-- a `|cat=` parameter, and each parameter must start on its own line with "|".
--
-- @param word           title of the entry to read (or the invoking table, see above)
-- @param combine        if truthy, merge the parameters of every {{zh-pron}} on
--                       the page; otherwise only the first instance is returned.
--                       A blank value passed through {{#invoke:}} counts as unset.
-- @param set_separator  text placed between the values of different instances
--                       when combining; defaults to default_set_separator
-- @return table mapping parameter name to value (nil when not combining and no
--         {{zh-pron}} was found), or the dump of that value in the invoke case
-- Raises an error if no title is given, if the entry cannot be read, or if a
-- {{zh-pron}} is present but no `|cat=` terminator could be found.
function export.extract_roman(word, combine, set_separator)
	local plaintext = false
	if type(word) == 'table' then
		plaintext = true
		word, combine, set_separator = word.args[1], word.args[2], word.args[3]
		-- An empty template parameter arrives as '', which Lua treats as true;
		-- do not let "|extract_roman|word|" silently switch combining on.
		if combine == '' then combine = nil end
	end

	if word == nil then
		error('no entry title was given')
	end

	mw.log('PROCESSING: [[' .. word .. ']]')

	-- mw.title.new returns nil for an invalid title and getContent returns nil
	-- for a page that does not exist; report both with the same message.
	local title = mw.title.new(word)
	local content = title and title:getContent()
	if not content then
		error('the [[' .. word .. ']] entry does not exist!?')
	end

	-- Mark the start and end of each {{zh-pron}} with sentinel characters so
	-- that the template body can be captured with a simple character class.
	content = replace(content, "{{zh%-pron", "ⓐⓐⓐⓐⓐ")
	content = replace(content, "(|cat=[a-z,:]*)\n?}}\n", "%1ⓩⓩⓩⓩⓩ") -- making assumptions about formatting

	if match(content, "ⓐ") and not match(content, "ⓩ") then
		error("please add the cat param to zh-pron at [[" .. word .. "]]")
	end

	-- Convert each {{zh-pron}} instance to a table of param -> value
	local each = {}
	for innards in itermatch(content, "ⓐⓐⓐⓐⓐ([^ⓩ]+)ⓩⓩⓩⓩⓩ") do
		local params = {}

		local items = split(innards, "\n|")
		-- the first piece is whatever followed "{{zh-pron" before the first parameter
		table.remove(items, 1)
		for _, item in ipairs(items) do
			local param, value = match(item, "^([^=]+)=(.*)$")
			-- Lines without "=" (unnamed parameters) have no name to store
			-- them under; skip them instead of indexing the table with nil.
			if param then
				params[param] = value
			end
		end

		each[#each + 1] = params
	end

	-- If told to combine, merge every instance into one table,
	-- otherwise return the data of the first {{zh-pron}} instance
	local roman_final
	if combine then
		roman_final = {}

		-- collect, per param, every non-empty value in page order
		for _, etable in ipairs(each) do
			for param, value in pairs(etable) do
				if not roman_final[param] then roman_final[param] = {} end
				if value ~= '' then table.insert(roman_final[param], value) end
			end
		end

		-- flatten each list of values into text
		-- (only existing keys are reassigned, which is safe during pairs traversal)
		local separator = set_separator or default_set_separator
		for param, values in pairs(roman_final) do
			roman_final[param] = table.concat(values, separator)
		end
	else
		roman_final = each[1]
	end

	if plaintext then
		return require('module:debug').dump(roman_final)
	else
		return roman_final
	end
end

-- Return the table of exported functions as the value of this module.
return export