Jump to content

Module:User:Benwing2/th-scraping-translit

From Wiktionary, the free dictionary

Language code in page name (User:Benwing2/th) not recognized.


-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Short aliases for the Scribunto library functions used below.
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local u = mw.ustring.char
-- U+FFF0, used in process() as a temporary placeholder while collapsing spaces.
local TEMP0 = u(0xFFF0)

-- Names of other modules; they are passed to require() only at the point of use.
local th_pron_module = "Module:th-pron"
local string_utilities_module = "Module:string utilities"
local links_module = "Module:links"
local templateparser_module = "Module:templateparser"


-- Build the "failure" return triple `nil, true, cat`.
--
-- `lang` is a language code handed to Module:languages' getByCode(); it is only
-- looked up when `request` is truthy. In that case `cat` is a one-element list
-- holding the "Requests for transliteration of <language> terms" category name;
-- otherwise `cat` is nil.
-- NOTE(review): nothing in this file calls this function, so the meaning the
-- caller attaches to the first two return values cannot be confirmed from here.
local function fail(lang, request)
	local cat
	if request then
		-- Only resolve the language object when its canonical name is needed.
		local langObj = require("Module:languages").getByCode(lang)
		cat = {"Requests for transliteration of " .. langObj:getCanonicalName() .. " terms"}
	end
	return nil, true, cat
end

-- Character-class fragment (two ranges); it is spliced into "[...]" patterns with string.format below.
local thai_char_range = "ก-ฺเ-๎" -- U+0E01 through U+0E3A and U+0E40 through U+0E4E (skipping Baht sign and Thai numerals)

-- Transliterate `term` with the translit() function of Module:th-pron, using the
-- "paiboon" system. If that call yields nil or false, `term` itself is returned.
local function translit_term(term)
	local th_pron = require(th_pron_module)
	local result = th_pron.translit(term, "th", "Thai", "paiboon", "translit-module")
	if result then
		return result
	end
	return term
end


-- Fetch the wikitext of the page titled `term` and extract the respelling given
-- in the first {{th-pron...}} template found there.
--
-- Returns:
--   * nil if the title is invalid, the page has no content, or no {{th-pron}}
--     template is found;
--   * `term` itself if the template has no arguments, or no usable first
--     positional argument (missing or empty);
--   * otherwise the first positional argument with everything from the first
--     colon onward removed.
--
-- Note that the template is located with a simple pattern that stops at the
-- first "}", so arguments containing nested templates are not handled.
local function scrape_pronun(term)
	local title = mw.title.new(term)
	local content = title and title:getContent()
	if not content then
		return nil
	end

	local template_contents = content:match("{{th%-pron[^}]*}}")
	if not template_contents then
		return nil
	end

	-- Everything between "{{th-pron" and the closing "}}".
	local args_contents = template_contents:match("^{{th%-pron(.*)}}$")
	if args_contents == "" then
		return term
	end

	local pron
	if args_contents:find("=", 1, true) then
		-- Named parameters are present, so let the template parser work out
		-- which argument is the first positional one.
		local _, args = require(templateparser_module).parseTemplate(template_contents)
		pron = args and args[1]
	else
		-- args_contents normally begins with "|", so element 1 of the split is
		-- empty and element 2 is the first positional argument.
		pron = rsplit(args_contents, "|")[2]
	end

	if pron then
		-- Parenthesized to drop gsub's second return value (the match count).
		pron = (pron:gsub(":.*", ""))
	end
	if not pron or pron == "" then
		-- No usable positional respelling: treat this like a bare {{th-pron}}.
		return term
	end
	return pron
end


-- Look up the respelling of `term` with scrape_pronun() and transliterate it.
-- Returns two values: the transliteration, and the respelling it was made from.
-- FIXME! Issue warning (in preview mode?) or error if respelling can't be found.
local function scrape_and_translit_term(term)
	local respelling = scrape_pronun(term)
	if not respelling then
		-- Nothing could be scraped; transliterate the term as written.
		respelling = term
	end
	return translit_term(respelling), respelling
end


-- Split a brace-delimited segment such as "{A/B}" or "{A//B}" into its two
-- components and return them as two values.
--
-- Spaces just inside the braces are ignored. If the contents contain "//", that
-- is used as the separator; otherwise a single "/" is. An error is raised if
-- the segment is not wrapped in braces or does not split into exactly two parts.
local function parse_brace_segment(segment)
	local inside = segment:match("^{ *(.-) *}$")
	if inside == nil then
		error(("Internal error: Can't match braces in brace-delimited segment %s"):format(segment))
	end

	local separator = inside:find("//") and "//" or "/"
	local parts = rsplit(inside, separator, true)
	if #parts == 2 then
		return parts[1], parts[2]
	end
	error(("Expected two slash-separated components in brace-delimited segment %s"):format(segment))
end


-- Shared worker behind export.tr, export.makeEntryName, export.makeDisplayText
-- and export.preprocessLinks.
--
-- `fn` selects the mode:
--   * "translit": each run of Thai characters is replaced by the transliteration
--     of its scraped respelling; a standalone ๆ repeats the transliteration of
--     the preceding word's respelling; runs of two or more spaces become " • ".
--   * "links": each run of Thai characters is wrapped in [[ ]]; existing
--     [[...]] links are kept untouched.
--   * anything else (this file passes "entry" and "display"): Thai words are
--     kept as written.
-- Outside "translit" mode, single spaces are removed and runs of two or more
-- spaces collapse to one space.
--
-- A brace segment {A/B} (or {A//B}) supplies B as the text to transliterate in
-- "translit" mode; in the other modes A is output instead.
--
-- A nil/false `text`, or one consisting only of spaces, is returned unchanged.
local function process(text, fn)
	if not text then
		return text
	end
	local stripped = text:match("^ *(.-) *$")
	if stripped == "" then
		return text
	end
	text = stripped

	local is_translit = fn == "translit"
	local is_links = fn == "links"
	local left = is_links and "[[" or ""
	local right = is_links and "]]" or ""

	local scrape_and_translit, translit
	if is_translit then
		scrape_and_translit, translit = scrape_and_translit_term, translit_term
	else
		-- Identity stand-ins with the same return shapes as the real functions.
		scrape_and_translit = function(term) return term, term end
		translit = function(term) return term end
	end

	-- Respelling of the most recently handled word; consulted only for ๆ.
	local preceding_word

	-- Convert one Thai word, handling the repetition mark in translit mode.
	local function convert_word(word)
		if is_translit and word == "ๆ" then
			if not preceding_word then
				error(("Repetition mark ๆ cannot occur at the beginning of the sentence: %s"):format(text))
			end
			-- left and right are blank in translit mode.
			-- preceding_word is left as-is in case another repetition mark follows.
			return translit(preceding_word)
		end
		local converted
		converted, preceding_word = scrape_and_translit(word)
		return left .. converted .. right
	end

	-- Case 1: nothing but Thai characters (and hyphens) -- a single term.
	if rfind(text, ("^[%s-]+$"):format(thai_char_range)) then
		return left .. scrape_and_translit(text) .. right
	end

	-- Case 2: Thai characters, hyphens and spaces only. Two or more spaces
	-- separate phrases; a single space separates words within a phrase.
	if rfind(text, ("^[%s -]+$"):format(thai_char_range)) then
		local word_joiner = is_translit and " " or ""
		local phrase_joiner = is_translit and " • " or " "
		local phrases = rsplit(text, "  +")
		for i, phrase in ipairs(phrases) do
			local words = rsplit(phrase, " ")
			for j, word in ipairs(words) do
				words[j] = convert_word(word)
			end
			phrases[i] = table.concat(words, word_joiner)
		end
		return table.concat(phrases, phrase_joiner)
	end

	-- Case 3: numbers, brackets, braces, etc. may occur.
	local capturing_split = require(string_utilities_module).capturing_split
	local thai_run_pattern = ("([%s-]+)"):format(thai_char_range)

	-- Handle text that contains neither [[...]] links nor {...} segments.
	-- After the split, odd positions are separators and even positions Thai runs.
	local function convert_plain(chunk)
		local pieces = capturing_split(chunk, thai_run_pattern)
		for k, piece in ipairs(pieces) do
			if k % 2 == 0 then
				pieces[k] = convert_word(piece)
			elseif is_translit then
				-- Transliterate separators too, in case of Thai numerals.
				pieces[k] = translit_term(piece)
			else
				-- Remove single spaces but convert double spaces to single.
				pieces[k] = (piece:gsub("  +", TEMP0):gsub(" ", ""):gsub(TEMP0, " "))
			end
		end
		return table.concat(pieces)
	end

	-- Handle text outside [[...]] links; even positions are {...} segments.
	local function convert_unbracketed(chunk)
		local parts = capturing_split(chunk, "({.-})")
		for j, part in ipairs(parts) do
			if j % 2 == 1 then
				parts[j] = convert_plain(part)
			else
				local from, to = parse_brace_segment(part)
				preceding_word = to
				if is_translit then
					parts[j] = translit_term(to)
				else
					parts[j] = left .. from .. right
				end
			end
		end
		return table.concat(parts)
	end

	-- Even positions of the outermost split are existing [[...]] links.
	local segments = capturing_split(text, "(%[%[.-%]%])")
	for i, segment in ipairs(segments) do
		if i % 2 == 1 then
			segments[i] = convert_unbracketed(segment)
		elseif is_links then
			-- Already a link; keep verbatim. preceding_word is not needed here
			-- because it is only used in translit mode.
			segments[i] = segment
		else
			local term = require(links_module).remove_links(segment)
			-- No need to add left or right; they're blank outside links mode.
			segments[i], preceding_word = scrape_and_translit(term)
		end
	end

	local result = table.concat(segments)
	if is_translit then
		result = result:gsub("  +", " • ")
	end
	return result
end


-- Transliterate `text` by running process() in "translit" mode.
-- `lang` and `sc` are accepted but not used by this function.
function export.tr(text, lang, sc)
	return process(text, "translit")
end


-- Run process() on `text` in "entry" mode and return the result.
-- `lang` and `sc` are accepted but not used by this function.
function export.makeEntryName(text, lang, sc)
	return process(text, "entry")
end


-- Run process() on `text` in "display" mode and return the result.
-- `lang` and `sc` are accepted but not used by this function.
function export.makeDisplayText(text, lang, sc)
	return process(text, "display")
end


-- Run process() on `text` in "links" mode, which wraps Thai words in [[ ]].
-- `lang` and `sc` are accepted but not used by this function.
function export.preprocessLinks(text, lang, sc)
	return process(text, "links")
end


-- Template entry point: passes the first positional argument of the parent
-- frame to export.tr and returns the result.
function export.tr_template(frame)
	return export.tr(frame:getParent().args[1])
end


-- Template entry point: passes the first positional argument of the parent
-- frame to export.makeEntryName and returns the result.
function export.makeEntryName_template(frame)
	return export.makeEntryName(frame:getParent().args[1])
end


-- Template entry point: passes the first positional argument of the parent
-- frame to export.makeDisplayText and returns the result.
function export.makeDisplayText_template(frame)
	return export.makeDisplayText(frame:getParent().args[1])
end


-- Template entry point: passes the first positional argument of the parent
-- frame to export.preprocessLinks and returns the result.
function export.preprocessLinks_template(frame)
	return export.preprocessLinks(frame:getParent().args[1])
end


return export