User:Surjection/catfix-regrouper.js

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Note: You may have to bypass your browser’s cache to see the changes. In addition, after saving a sitewide CSS file such as MediaWiki:Common.css, it will take 5-10 minutes before the changes take effect, even if you clear your cache.

  • Mozilla / Firefox / Safari: hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Command-R on a Macintosh);
  • Konqueror and Chrome: click Reload or press F5;
  • Opera: clear the cache in Tools → Preferences;
  • Internet Explorer: hold Ctrl while clicking Refresh, or press Ctrl-F5.


/*
 * dependencies: mediawiki.Title
 */

/** TODO: move this data somewhere else */
/** REGROUPER_DATA_LANG data format:
 *      IMPORTANT! The regrouper already assumes that the language sort keys
 *      	are set up correctly. It does not protect against the same group
 *      	heading being repeated *AT ALL*. Do not enable the regrouper for
 *      	a language until sort keys are set up for it first (so that
 *          each would-be group is contiguous).
 *      The data is organized by language code.
 *      If the code is missing or the value is undefined, the
 * 			regrouper is disabled. This is the default.
 *      If it is ``true``, default values are used.
 *      Otherwise it is an object, with the following fields:
 * 			group
 * 				A function which is given the following parameters,
 * 					in this order:
 * 						- page title (minus namespace and language prefixes),
 * 						- language code,
 * 						- script code,
 * 						- namespace number.
 *					``this`` will be the language data object (so that you
 *					can add your own variables, functions, etc.; but if you
 *					do so, please prefix their name with x_).
 * 				It should return the group, i.e. the heading that the
 *	 			page should be categorized under. If the value returned
 *				is undefined, null or an empty string, its existing group
 *				will be kept.
 *				The default is the default grouping function, either of the
 *				script, or of ``defaultGroup``;
 *				see below for a description thereof.
 *			detectScript
 * 				A function which is given the following parameters,
 * 					in this order:
 * 						- page title (minus namespace and language prefixes),
 * 						- language code,
 * 						- (default) script code,
 * 						- namespace number.
 *					``this`` will be the language data object (so that you
 *					can add your own variables, functions, etc.; but if you
 *					do so, please prefix their name with x_).
 * 				It should return a script code, or undefined to use
 * 				the default script.
 * 			initials
 * 				If the default grouping function is used, this can be a RegExp object
 * 				of initial letters/digraphs/etc. which are automatically mapped
 * 				to that respective group. All variants of that same letter
 * 				should be included (e.g. both uppercase and lowercase).
 * 			initialFallback
 * 				Only applies for the default grouping function and if initials
 * 				is defined. If ``true``, the default grouping function falls
 *				back to the default logic when the initial doesn't match any
 *				specified in initials; if ``false`` (default), it just returns
 *				``undefined`` (i.e. keeps the existing group).
 *			ignore
 *				Preceding characters to ignore instead of the default ones.
 *				This is in RegExp character class syntax.
 *			ignoreAdd
 *				Preceding characters to ignore in addition to the default ones.
 *				This is in RegExp character class syntax.
 * 			unsupported
 * 				If ``true``, unsupported titles are passed directly to
 * 				``group``. If ``false`` (default), they are ignored, and their
 * 				existing groups are kept.
 * 			dottedDotlessI
 * 				Used in case conversion; ``true`` means the language has both
 * 				dotted and dotless I as separate letters (like in Turkish),
 * 				and ``false`` (default) means it doesn't.
 * 
 * REGROUPER_DATA_SC data format:
 * 		This data is organized by script. Note that these configurations are
 * 		still only considered for languages that have regrouping enabled.
 * 
 * 		If missing, defaults are used. Else, an object may override:
 * 			group
 * 				A function that works just like group in REGROUPER_DATA_LANG.
 * 				If missing, the default grouping function is used.
 * 				The priority of grouping functions is:
 * 					- group in language data,
 * 					- group in script data,
 *					- default grouping.
 *				
 * The default grouping function:
 * 			Checks initials and initialFallback.
 * 				If there are initials, it matches them first.
 * 				Initial matching ignores certain preceding characters,
 * 				e.g. hyphens.
 * 				If an initial is found, the matching portion is converted
 * 				to title case with ``titleCase`` and returned.
 * 				Otherwise, the fallback logic described below is used only if
 * 				``initialFallback`` is ``true``; if it is not, ``undefined``
 * 				is returned to keep the existing group.
 * 			We may fall into the fallback logic:
 * 				remove preceding characters (e.g. hyphens),
 * 				take the first remaining Unicode character
 * 						(or the first character in general if none would remain
 * 						 from the previous step),
 * 				convert it to title case with ``titleCase`` and return it.
 */
// Per-language regrouper settings, keyed by language code.
// Every language listed here relies on the default grouping function and
// only supplies the ``initials`` pattern that overrides the first-letter rule.
var REGROUPER_DATA_LANG = {
	// Single letters only.
	et: { initials: /[ŠšZzŽžÕõÄäÖöÜü]/ },

	// Single letters only.
	fi: { initials: /[Åå]/ },

	// Accented single letters plus two- and three-letter combinations
	// (cs, zs, dz, dzs, gy, ly, ny, ty, sz) in either case.
	hu: { initials: /(?:[ÁáÉéÍíÓóÖöŐőÚúÜüŰű]|[CcZz][Ss]|[Dd][Zz][Ss]?|[GgLlNnTt][Yy]|[Ss][Zz])/ }
};

// Per-script regrouper overrides, keyed by script code.
// Currently empty, so every script uses the default grouping function.
var REGROUPER_DATA_SC = {};
/* end of regrouper data */

/**
 * Look up the regrouper configuration for a language.
 *
 * Only the table's own entries are considered, so a code that happens to
 * collide with an inherited Object.prototype member (e.g. "constructor")
 * is reported as "not configured" instead of returning that member.
 *
 * @param {string} lang Language code.
 * @returns {*} The entry stored in REGROUPER_DATA_LANG for ``lang``, or
 *              ``undefined`` if there is none (regrouper disabled).
 */
function getRegrouperLanguageData(lang) {
	if (!Object.prototype.hasOwnProperty.call(REGROUPER_DATA_LANG, lang))
		return undefined;
	return REGROUPER_DATA_LANG[lang];
}

/**
 * Look up the regrouper configuration for a script.
 *
 * Only the table's own entries are considered, so a script code that happens
 * to collide with an inherited Object.prototype member (e.g. "constructor")
 * is reported as "not configured" instead of returning that member.
 *
 * @param {string} sc Script code.
 * @returns {*} The entry stored in REGROUPER_DATA_SC for ``sc``, or
 *              ``undefined`` if there is none (defaults apply).
 */
function getRegrouperScriptData(sc) {
	if (!Object.prototype.hasOwnProperty.call(REGROUPER_DATA_SC, sc))
		return undefined;
	return REGROUPER_DATA_SC[sc];
}

/**
 * Upper-case a string, optionally treating dotted and dotless I as
 * separate letters.
 *
 * @param {string} text Text to convert.
 * @param {boolean} [dottedDotlessI] If truthy, every "i" is mapped to "İ"
 *        before the regular upper-casing is applied.
 * @returns {string} The upper-cased text.
 */
function safeUpperCase(text, dottedDotlessI) {
	// Pre-map the dotted lowercase i so that toUpperCase() cannot turn it
	// into the dotless capital "I".
	var prepared = dottedDotlessI ? text.replace(/i/g, "İ") : text;
	return prepared.toUpperCase();
}

/**
 * Lower-case a string, optionally treating dotted and dotless I as
 * separate letters.
 *
 * @param {string} text Text to convert.
 * @param {boolean} [dottedDotlessI] If truthy, "I" is mapped to "ı" and
 *        "İ" is mapped to "i" before the regular lower-casing is applied.
 * @returns {string} The lower-cased text.
 */
function safeLowerCase(text, dottedDotlessI) {
	if (dottedDotlessI) {
		// Both capitals need explicit handling: toLowerCase() would turn "I"
		// into the dotted "i", and "İ" (U+0130) into "i" followed by a
		// combining dot above (U+0307) rather than a plain "i".
		return text
			.replace(/I/g, "ı")
			.replace(/İ/g, "i")
			.toLowerCase();
	}
	return text.toLowerCase();
}

/**
 * Convert text to title case: first character upper-cased, the rest
 * lower-cased.
 *
 * The dotted/dotless-I setting is read from ``this.dottedDotlessI``, so
 * callers that want it honoured must invoke this function with the language
 * data object as ``this`` (e.g. ``titleCase.call(data, text, lang, sc)``).
 * A plain call, where ``this`` is undefined or lacks the field, uses the
 * regular case mapping.
 *
 * @param {string} text Text to convert.
 * @param {string} lang Language code (currently unused).
 * @param {string} sc Script code (currently unused).
 * @returns {string} The title-cased text.
 */
function titleCase(text, lang, sc) {
	// Guard against a missing ``this`` so that a plain call cannot throw.
	var dottedDotlessI = Boolean(this && this.dottedDotlessI);
	// Split on a code point boundary so that a character outside the BMP
	// (stored as a surrogate pair) is upper-cased as a whole.
	var first = text.length > 0 ? String.fromCodePoint(text.codePointAt(0)) : "";
	return safeUpperCase(first, dottedDotlessI)
		 + safeLowerCase(text.substring(first.length), dottedDotlessI);
}

// Default set of leading characters to ignore (RegExp character class syntax).
var REGROUPER_INITIALS = "-";

/**
 * The default grouping function.
 *
 * Must be called with the language data object as ``this``; it reads
 * ``_clean_regex``, ``_initials_regex``, ``initials`` and ``initialFallback``
 * from it.
 *
 * @param {string} title Page title (minus namespace and language prefixes).
 * @param {string} lang Language code.
 * @param {string} sc Script code.
 * @returns {string|undefined} The group heading, or ``undefined`` to keep the
 *          existing group.
 */
function defaultGroup(title, lang, sc) {
	if (title.length < 1) return undefined;

	// Strip ignorable leading characters (e.g. hyphens).
	var cleaned = title.replace(this._clean_regex, "");

	if (this.initials) {
		var initialMatch = cleaned.match(this._initials_regex);
		if (initialMatch) {
			// Forward ``this`` so that titleCase sees dottedDotlessI.
			return titleCase.call(this, initialMatch[0], lang, sc);
		}
		if (!this.initialFallback) return undefined;
	}

	// If stripping removed everything, fall back to the raw title.
	var remaining = cleaned || title;
	// Take a whole code point, not a UTF-16 code unit, so that characters
	// outside the BMP do not yield a lone surrogate as the heading.
	var firstChar = String.fromCodePoint(remaining.codePointAt(0));
	return titleCase.call(this, firstChar, lang, sc);
}

/**
 * Build the DOM skeleton of one category group: a
 * ``div.mw-category-group`` holding an ``h3`` heading followed by an
 * empty ``ul``.
 *
 * @param {string} groupText Text of the group heading.
 * @returns {Array} A two-element array: the group ``div`` and its ``ul``.
 */
function makeGroup(groupText) {
	var heading = document.createElement("h3");
	heading.textContent = groupText;

	var list = document.createElement("ul");

	var wrapper = document.createElement("div");
	wrapper.className = "mw-category-group";
	wrapper.appendChild(heading);
	wrapper.appendChild(list);

	return [wrapper, list];
}

/**
 * Get the text used to identify a category list item.
 *
 * The text of the first descendant ``a`` or ``span`` is preferred; if there
 * is no such element, or its text is empty, the item's own text content is
 * used instead.
 *
 * @param {Element} el The list item.
 * @returns {string} The item's text.
 */
function getLiText(el) {
	var $label = $(el).find("a, span").first();
	if ($label.length > 0) {
		var labelText = $label.text();
		if (labelText) return labelText;
	}
	return el.textContent || el.innerText;
}

/**
 * Entry point, run when the DOM is ready.
 *
 * On category pages that contain a "catfix" element, recomputes the group
 * heading of every listed page via the language's grouping function and
 * rebuilds the ``.mw-category-group`` elements accordingly. If anything goes
 * wrong while computing the new groups, the listing is left untouched.
 */
jQuery(function () {
	'use strict';

	// Apply only to pages in the Category namespace
	// containing an element with the id "catfix".
	// Set window.disableCatfixRegrouper to true to prevent this script from running.
	if (window.disableCatfixRegrouper
			|| mw.config.get('wgNamespaceNumber') !== 14)
		return;

	var catfix = document.getElementById("catfix");
	if (!catfix)
		return;

	// Get the language name and script catfix.
	var langName = catfix.className.split("CATFIX-")[1];
	catfix = catfix.getElementsByTagName("*")[0] || document.createElement("span");

	var lang = catfix.getAttribute("lang");
	var defaultSc = catfix.classList[0] || "None";
	// Prototype-less so that a script code can never hit an inherited member.
	var cachedScriptData = Object.create(null);

	if (!lang)
		return;

	var UNPREFIXED_NAMESPACES = ["", "Talk", "Citations"];
	var PREFIXED_NAMESPACES = ["Appendix", "Appendix talk", "Reconstruction", "Reconstruction talk"];

	/**
	 * Whether a page counts as an entry of this language:
	 * main, Talk, Citations, or Reconstruction/Appendix (and their talk
	 * namespaces) if the page name starts with the language name and "/".
	 */
	function isEntry(namespaceName, pageName) {
		return UNPREFIXED_NAMESPACES.indexOf(namespaceName) !== -1
			|| (PREFIXED_NAMESPACES.indexOf(namespaceName) !== -1
				&& pageName.slice(0, langName.length + 1) === langName + "/");
	}

	var formattedNamespaces = mw.config.get("wgFormattedNamespaces");
	var regrouperData = getRegrouperLanguageData(lang);
	if (!regrouperData) return;

	// ``true`` means "enabled, with default values". Replace it with an empty
	// settings object: the set-up below stores properties on it, which would
	// throw in strict mode if it were still a boolean primitive.
	if (regrouperData === true)
		regrouperData = {};

	// set up stuff for the default regrouper
	regrouperData._clean_regex = new RegExp("^[" + ((regrouperData.ignoreAdd || "") + (regrouperData.ignore || REGROUPER_INITIALS)) + "]+");
	if (regrouperData.initials)
		regrouperData._initials_regex = new RegExp("^" + regrouperData.initials.source);

	var groupFunction = regrouperData.group;
	var detectScriptFunction = regrouperData.detectScript;

	/**
	 * Compute the group heading for one listed page.
	 *
	 * @param {string} pageTitle Full page title as shown in the listing.
	 * @param {string} oldGroup Heading the page is currently listed under.
	 * @returns {string} The new heading, or ``oldGroup`` if the page is not
	 *          an entry of this language or no new group was determined.
	 */
	function getGroup(pageTitle, oldGroup) {
		var titleobj = new mw.Title(pageTitle);
		var namespaceId = titleobj.getNamespaceId();
		var namespaceName = formattedNamespaces[namespaceId];
		var pageName = titleobj.getMainText();
		var formattedTitle = pageName;
		var sc = defaultSc;

		if (!isEntry(namespaceName, pageName))
			return oldGroup;

		// verify language prefix if the namespace should have one
		var langPrefix = langName + "/";
		if (PREFIXED_NAMESPACES.indexOf(namespaceName) !== -1) {
			if (formattedTitle.startsWith(langPrefix)) {
				formattedTitle = formattedTitle.substring(langPrefix.length);
			} else {
				return oldGroup;
			}
		}

		// ignore unsupported titles unless the language data requests otherwise
		if (formattedTitle.startsWith("Unsupported titles/") && !regrouperData.unsupported)
			return oldGroup;

		// script detection
		if (detectScriptFunction)
			sc = detectScriptFunction.call(regrouperData, formattedTitle, lang, sc, namespaceId) || sc;

		var scData = cachedScriptData[sc];
		if (!scData)
			scData = cachedScriptData[sc] = getRegrouperScriptData(sc) || {};

		// Priority: language-level group, then script-level group, then default.
		var newGroup;
		if (groupFunction)
			newGroup = groupFunction.call(regrouperData, formattedTitle, lang, sc, namespaceId);
		else if (scData.group)
			newGroup = scData.group.call(regrouperData, formattedTitle, lang, sc, namespaceId);
		else
			newGroup = defaultGroup.call(regrouperData, formattedTitle, lang, sc, namespaceId);

		return newGroup || oldGroup;
	}

	var GROUP_QUERY = "#mw-pages > .mw-content-ltr .mw-category-group";

	var regroupOk = true;
	// Maps list item text -> new group heading.
	var regroupData = new Map();
	// Process each group in the category listing.
	jQuery(GROUP_QUERY)
		.each(function () {
			// Get the existing group.
			var group = $(this).find("h3").first().text();
			if (!group) {
				// Failed to get group -- something has gone wrong.
				regroupOk = false;
				return;
			}

			$(this).find("li")
				.each(function () {
					try {
						var liText = getLiText(this);
						var newGroup = getGroup(liText, group);
						regroupData.set(liText, newGroup);
					} catch (e) {
						// Any failure disables the regrouping altogether.
						console.error(e);
						regroupOk = false;
					}
				});
		});

	// Find the existing groups, which we will delete.
	var groups = jQuery(GROUP_QUERY);
	// Cannot regroup if there are no groups.
	if (!groups.length) return;

	var parent = groups.first().parent()[0];
	if (!parent) return;
	var fragment = document.createDocumentFragment();

	if (regroupOk) {
		var lastGroup, groupUl;
		jQuery(GROUP_QUERY + " li")
			.each(function () {
				var liText = getLiText(this);
				var newGroup = regroupData.get(liText) || "";
				// Start a new group whenever the heading changes; items are
				// assumed to already be sorted so that groups are contiguous.
				if (lastGroup !== newGroup) {
					var elements = makeGroup(newGroup);
					var groupDiv = elements[0];
					fragment.appendChild(groupDiv);
					groupUl = elements[1];
					lastGroup = newGroup;
				}
				groupUl.appendChild(this);
			});

		// Swap the old (now empty) groups for the rebuilt ones.
		groups.remove();
		parent.appendChild(fragment);
	}
});