Jump to content

Module:User:Erutuon/languages

From Wiktionary, the free dictionary

An experiment with creating a Language object to which only a specified number of the methods in Module:languages are added. This may be less memory-intensive than creating a Language object with every function (many of which are never used in the module in which the object is generated), as is currently done.

Another issue is that the data in the Language object can be gotten by indexing it – enLanguageObject._rawData.canonicalName -- "English", for example – but this is discouraged. It can even be modified: enLanguageObject._rawData.canonicalName = "German" -- now enLanguageObject:getCanonicalName() == "German". Storing the data in the metatable and preventing access to it would make both of these impossible.


local export = {}

local Language = {}
Language.__index = Language

local find = mw.ustring.find

local function addTables(table1, table2)
	if not table2 then
		return table1
	end
	for key, value in pairs(table2) do
		if type(key) == "string" then
			table.insert(table1, key)
		end
	end
	table2 = nil
	return table1
end

local function makeMetatable(code, data, functions)
	local itemsToAdd = {
		"getCode",
		"getCanonicalName",
		"getFamily",
	}
	
	local mt = {}
	mt.__index = mt
	
	for i, key in ipairs(addTables(itemsToAdd, functions)) do
		mt[key] = Language[key]
	end
	
	mt._rawData = data
	mt._code = code
	
	mw.logObject(mt)
	
	return mt
end

function export.makeObject(code, data, functions)
	if data and data.deprecated then
		require("Module:debug").track {
			"languages/deprecated",
			"languages/deprecated/" .. code
		}
	end
	
	local object = data and setmetatable({ }, makeMetatable(code, data, functions)) or nil
	
	mw.logObject(object)
	
	return object
end

--[=[	This function checks for things that could plausibly be a language code:
		two or three lowercase letters, two or three groups of three lowercase
		letters with hyphens between them. If such a pattern is not found,
		it is likely the editor simply forgot to enter a language code. ]=]

function export.err(langCode, param, text)
	local ordinals = { "first", "second", "third", "fourth" }
	
	if type(param) == "number" then
		ordinal = ordinals[param]
		param = ordinal .. ' parameter'
	elseif type(param) == "string" then
		param = 'parameter "' .. param .. '"'
	else
		error("The parameter name is "
				.. (type(param) == "table" and "a table" or tostring(param))
				.. ", but it should be a number or a string.")
	end
	
	--[[	Can't use "%l" because that would include all Unicode
			lowercase letters; language codes only use ASCII.	]]
	local lower = "[a-z]"
	
	if not langCode or langCode == "" then
		error("The " .. param .. " (" .. (text or "language code") .. ") is missing.", 2)
	elseif find(langCode, "^" .. lower .. lower .. lower .. "?$")
			or find(langCode, "^" .. lower .. lower .. lower
			.. "%-" .. lower .. lower .. lower .. "$")
			or find(langCode, "^" .. lower .. lower .. lower
			.. "%-" .. lower .. lower .. lower
			.. "%-" .. lower .. lower .. lower .. "$") then
		error("The language code \"" .. langCode .. "\" is not valid.", 2)
	else
		error("Please enter a " .. (text or "language code") .. " in the " .. param .. ".", 2)
	end
end

function Language:getCode()
	return self._code
end


function Language:getCanonicalName()
	return self._rawData[1]
end


function Language:getOtherNames()
	return self._rawData.otherNames or {}
end


function Language:getType()
	return self._rawData.type or "regular"
end


function Language:getWikimediaLanguages()
	if not self._wikimediaLanguageObjects then
		local m_wikimedia_languages = require("Module:wikimedia languages")
		self._wikimediaLanguageObjects = {}
		local wikimedia_codes = self._rawData.wikimedia_codes or { self._code }
		
		for _, wlangcode in ipairs(wikimedia_codes) do
			table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(wlangcode))
		end
	end
	
	return self._wikimediaLanguageObjects
end


function Language:getWikipediaArticle()
	return self._rawData.wikipedia_article or mw.ustring.gsub(self:getCategoryName(), "Creole language", "Creole")
end



function Language:makeWikipediaLink()
	return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end


function Language:getScripts(onlyPatterns)
	
	if onlyPatterns and not self._scriptPatterns or not self._scriptObjects then
		local m_scripts = require("Module:scripts")
		if onlyPatterns then
			self._scriptPatterns = {}
		else
			self._scriptObjects = {}
		end
		
		for _, sc in ipairs(self._rawData.scripts or { "None" }) do
			if onlyPatterns then
				local data = mw.loadData("Module:scripts/data")[sc]
				table.insert(self._scriptPatterns, { canonicalName = data.canonicalName, pattern = data.characters })
			else
				table.insert(self._scriptObjects, m_scripts.getByCode(sc))
			end
		end
	end
	
	return onlyPatterns and self._scriptPatterns or self._scriptObjects
end

function Language:getScriptCodes()
	return self._rawData.scripts or { "None" }
end

function Language:getFamily()
	if self._rawData[3] and not self._familyObject then
		self._familyObject = require("Module:families").getByCode(self._rawData[3])
	end
	
	return self._familyObject
end


function Language:getAncestors()
	if not self._ancestorObjects then
		self._ancestorObjects = {}
		
		if self._rawData.ancestors then
			for _, ancestor in ipairs(self._rawData.ancestors) do
				table.insert(self._ancestorObjects, export.getByCode(ancestor) or require("Module:etymology languages").getByCode(ancestor))
			end
		else
			local fam = self:getFamily()
			local protoLang = fam and fam:getProtoLanguage() or nil
			
			-- For the case where the current language is the proto-language
			-- of its family, we need to step up a level higher right from the start.
			if protoLang and protoLang:getCode() == self:getCode() then
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			while not protoLang and not (not fam or fam:getCode() == "qfa-not") do
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			table.insert(self._ancestorObjects, protoLang)
		end
	end
	
	return self._ancestorObjects
end

local function iterateOverAncestorTree(node, func)
	for _, ancestor in ipairs(node:getAncestors()) do
		if ancestor then
			local ret = func(ancestor) or iterateOverAncestorTree(ancestor, func)
			if ret then
				return ret
			end
		end
	end
end

function Language:getAncestorChain()
	if not self._ancestorChain then
		self._ancestorChain = {}
		local step = #self:getAncestors() == 1 and self:getAncestors()[1] or nil
		
		while step do
			table.insert(self._ancestorChain, 1, step)
			step = #step:getAncestors() == 1 and step:getAncestors()[1] or nil
		end
	end
	
	return self._ancestorChain
end


function Language:hasAncestor(otherlang)
	local function compare(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end
	
	return iterateOverAncestorTree(self, compare) or false
end


function Language:getCategoryName()
	local name = self._rawData.canonicalName
	
	-- If the name already has "language" in it, don't add it.
	if name:find("[Ll]anguage$") then
		return name
	else
		return name .. " language"
	end
end


function Language:getStandardCharacters()
	return self._rawData.standardChars
end


function Language:makeEntryName(text)
	text = mw.ustring.gsub(text, "^[¿¡]", "")
	text = mw.ustring.gsub(text, "(.)[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]$", "%1")
	
	if self:getCode() == "ar" then
		local U = mw.ustring.char
		local taTwiil = U(0x640)
		local waSla = U(0x671)
		-- diacritics ordinarily removed by entry_name replacements
		local Arabic_diacritics = U(0x64B, 0x64C, 0x64D, 0x64E, 0x64F, 0x650, 0x651, 0x652, 0x670)
		
		if text == waSla or mw.ustring.find(text, "^" .. taTwiil .. "?[" .. Arabic_diacritics .. "]" .. "$") then
			return text
		end
	end
	
	if type(self._rawData.entry_name) == "table" then
		for i, from in ipairs(self._rawData.entry_name.from) do
			local to = self._rawData.entry_name.to[i] or ""
			text = mw.ustring.gsub(text, from, to)
		end
	end
	
	--[=[ For instance, ᾰ (alpha-breve) + combining smooth breathing is converted
		to alpha + combining smooth breathing by the entry_name replacements.
		It must be re-combined to alpha-smooth breathing (ἀ) so that
		allowSelfLink in [[Module:links]] will work properly. ]=]
	if self:getCode() == "grc" then
		text = mw.ustring.toNFC(text)
	end
	
	return text
end


-- Add to data tables?
local has_dotted_undotted_i = {
	["az"] = true,
	["crh"] = true,
	["gag"] = true,
	["kaa"] = true,
	["tt"] = true,
	["tr"] = true,
	["zza"] = true,
}

function Language:makeSortKey(name, sc)
	if has_dotted_undotted_i[self:getCode()] then
		name = mw.ustring.gsub(name, "I", "ı")
	end
	
	name = mw.ustring.lower(name)
	
	-- Remove initial hyphens and *
	local hyphens_regex = "^[-־ـ*]+(.)"
	name = mw.ustring.gsub(name, hyphens_regex, "%1")
	
	-- Remove parentheses, as long as they are either preceded or followed by something
	name = mw.ustring.gsub(name, "(.)[()]+", "%1")
	name = mw.ustring.gsub(name, "[()]+(.)", "%1")
	
	-- If there are language-specific rules to generate the key, use those
	if type(self._rawData.sort_key) == "table" then
		for i, from in ipairs(self._rawData.sort_key.from) do
			local to = self._rawData.sort_key.to[i] or ""
			name = mw.ustring.gsub(name, from, to)
		end
	elseif type(self._rawData.sort_key) == "string" then
		name = require("Module:" .. self._rawData.sort_key).makeSortKey(name, self:getCode(), sc and sc:getCode())
	end
	
	if has_dotted_undotted_i[self:getCode()] then
		name = mw.ustring.gsub(name, "i", "İ")
	end
	
	return mw.ustring.upper(name)
end

function Language:overrideManualTranslit()
	if self._rawData.override_translit then
		return true
	else
		return false
	end
end


function Language:transliterate(text, sc, module_override)
	if not ((module_override or self._rawData.translit_module) and text) then
		return nil
	end
	
	if module_override then
		require("Module:debug").track("module_override")
	end
	
	return require("Module:" .. (module_override or self._rawData.translit_module)).tr(text, self:getCode(), sc and sc:getCode() or nil)
end

function Language:hasTranslit()
	return self._rawData.translit_module and true or false
end


function Language:link_tr()
	return self._rawData.link_tr and true or false
end


function Language:toJSON()
	local entryNamePatterns = nil
	
	if self._rawData.entry_name then
		entryNamePatterns = {}
		
		for i, from in ipairs(self._rawData.entry_name.from) do
			local to = self._rawData.entry_name.to[i] or ""
			table.insert(entryNamePatterns, { from = from, to = to })
		end
	end
	
	local ret = {
		ancestors = self._rawData.ancestors,
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName(),
		code = self._code,
		entryNamePatterns = entryNamePatterns,
		family = self._rawData.family,
		otherNames = self:getOtherNames(),
		scripts = self._rawData.scripts,
		type = self:getType(),
		wikimediaLanguages = self._rawData.wikimedia_codes,
	}
	
	return require("Module:JSON").toJSON(ret)
end


-- Do NOT use this method!
-- All uses should be pre-approved on the talk page!
function Language:getRawData()
	return self._rawData
end


function export.getDataModuleName(code)
	if code:find("^[a-z][a-z]$") then
		return "languages/data/2"
	elseif code:find("^[a-z][a-z][a-z]$") then
		local prefix = code:sub(1, 1)
		return "languages/data/3/" .. prefix
	elseif code:find("^[a-z-]+$") then
		return "languages/data/exceptional"
	else
		return nil
	end
end


local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end


function export.getByCode(code, functions)
	if type(code) ~= "string" then
		error("The function getByCode expects a string as its first argument, but received " .. (code == nil and "nil" or "a " .. type(code)) .. ".")
	end
	
	return export.makeObject(code, getRawLanguageData(code), functions)
end


function export.getByName(name, functions)
	local byName = mw.loadData("Module:languages/by name")
	local code = byName.all and byName.all[name] or byName[name]
	
	if not code then
		return nil
	end
	
	return export.makeObject(code, getRawLanguageData(code), functions)
end


function export.getByCanonicalName(name, functions)
	local byName = mw.loadData("Module:languages/canonical names")
	local code = byName and byName[name]
	
	if not code then
		return nil
	end
	
	return export.makeObject(code, getRawLanguageData(code), functions)
end


function export.iterateAll()
	mw.incrementExpensiveFunctionCount()
	local m_data = mw.loadData("Module:languages/data/all")
	local func, t, var = pairs(m_data)
	
	return function()
		local code, data = func(t, var)
		return export.makeObject(code, data)
	end
end

return export