local export = {}
local lang = require("Module:languages").getByCode("sco")
local m_IPA = require("Module:IPA")
local gmatch = mw.ustring.gmatch
local gsplit = mw.text.gsplit
local match = mw.ustring.match
local gsubn = mw.ustring.gsub
local len = mw.ustring.len
local lower = mw.ustring.lower
local sub = mw.ustring.sub
-- version of gsubn() that discards all but the first return value
local function gsub(term, foo, bar, n)
	-- Parenthesising the call truncates gsubn's (string, count) multiple
	-- return values down to just the substituted string.
	return (gsubn(term, foo, bar, n))
end
--[[ Dialect abbreviations:
* Insular:
** Orkney: or
** Shetland: sh
* Northern:
** North Northern: nn
** Mid Northern: mn
** South Northern: sn
* Central:
** North East Central: nec
** South East Central: sec
** West Central: wc
** South West Central: swc
* Southern: s
* Ulster:
** Western Ulster: wu
** Central Ulster: cu
** Eastern Ulster: eu
--]]
--[[
TODO:
-- * Consider unstressed vowels (schwa)
-- * Place the morpheme splitting in the main evaluation function
-- * Work on consonant rules
-- * Consider adding unique dialects based on word inputted
-- * Consider unique pronunciation for suffixes
--]]
--[[ DATA STRUCTURES
--]]
-- list pronunciations for different vowel spellings
-- a key of {pattern, pos} matches a neighbouring grapheme: pos = -1 refers to the token before, pos = 1 to the token after (a nil pattern denotes a word boundary); the key false is the default used when no other condition applies
-- "ˑ" indicates a vowel affected by the Scottish vowel length rule
-- Vowel grapheme → phoneme table, keyed by spelling.  Each inner table maps a
-- context condition to a phoneme string ("," separates variants, "~" free
-- variation, "ˑ" marks a vowel subject to the vowel length rule).
local s = {
	["a"] = {
		-- before a following "n", "ng" or "nd" token
		[{"n[gd]?", 1}] = "a~ɑ",
		-- no following token, i.e. word-final
		[{nil, 1}] = "ɑˑ,e",
		-- default in all other contexts
		[false] = "aˑ"
	},
	["e"] = {
		[false] = "ɛˑ~æˑ"
	},
	["i"] = {
		-- before "g" or "ng"
		[{"n?g", 1}] = "əi",
		-- before "ch"
		[{"ch", 1}] = "əi",
		-- after "wh"
		[{"wh", -1}] = "ʌ",
		[false] = "ɪ"
	},
	["o"] = {
		-- no preceding token, i.e. word-initial
		[{nil, -1}] = "wʌˑ",
		-- before "ch"
		[{"ch", 1}] = "ʌu",
		[false] = "ɔˑ"
	},
	["u"] = {
		[false] = "ʌ"
	}
}
-- all possible multi-letter graphemes needed for tokenisation
local multigraphs = {
	-- split digraphs; the "_e" forms are produced by tokenise()'s respelling
	-- of vowel + consonant + e sequences
	"a_e", "e_e", "i_e", "o_e", "owe", "u_e", "y_e",
	-- vowel digraphs
	"aa", "ae", "ai", "au", "aw", "ay", "ea", "ee", "ei", "eu", "ew", "ey", "ie", "oa", "oi", "oo", "ou", "ow", "oy", "ui",
	-- consonant digraphs and clusters
	"ch", "ck", "kn", "ld", "mb", "nd", "ng", "nk", "qu", "sh", "th", "wh", "wr"
}
-- common morphemes
-- Common morphemes, each stored as an array of spellings (membership is
-- checked by scanning, not by keyed lookup).
local morphemes = {
	unstressed = {"ae", "ane", "dae", "hae", "na", "nae", "sae", "tae", "the"}, -- unstressed particles
	prefixes = {"a"}, -- prefixes
	suffixes = {"fu", "le", "na", "the", "se"} -- suffixes
}
--[[ HELPER FUNCTIONS
--]]
-- handle vowel length according to scottish vowel length rule
local function handle_vowel_length(word)
	-- Realise the "ˑ" length mark per the Scottish vowel length rule:
	-- long ("ː") before /r/ or a voiced fricative and morpheme-finally,
	-- otherwise the mark is simply removed (short vowel).
	local result = gsub(word, "ˑ([rvzðʒ])", "ː%1")
	result = gsub(result, "ˑ$", "ː")
	return gsub(result, "ˑ", "")
end
-- handle stress
-- Add a stress marker to a word that lacks one, unless it is an unstressed
-- particle.  Stress is placed after the prefix "a-" when present, otherwise
-- on the first syllable of the morpheme.
local function handle_stress(word)
	-- morphemes.unstressed is an array, so membership needs a linear scan;
	-- indexing it by the word (the old code) always returned nil
	local function is_unstressed(w)
		for _, particle in ipairs(morphemes.unstressed) do
			if particle == w then
				return true
			end
		end
		return false
	end
	-- apply morpheme rules if no explicit stress marker and not an unstressed particle
	if not match(word, "ˈ") and not is_unstressed(word) then
		-- stress after prefix "a-" (a + consonant + vowel)
		if match(word, "^a[^aeiou][aeiou]") then
			word = "aˈ" .. sub(word, 2)
		-- otherwise add stress on the first syllable of a morpheme
		else
			word = "ˈ" .. word
		end
	end
	return word
end
-- split any potential suffixes from word
-- Detach a recognised suffix from the end of a word.
-- Returns (stem, suffix) for the first listed suffix that terminates the
-- word, or (word, nil) when none matches.
local function split_suffixes(word)
	for _, ending in ipairs(morphemes.suffixes) do
		local n = len(ending)
		if sub(word, -n) == ending then
			return sub(word, 1, -(n + 1)), ending
		end
	end
	return word, nil
end
--[[ MAIN FUNCTIONS
--]]
-- tokenise word into individual graphemes and affixes
-- Tokenise a word into individual graphemes and a trailing suffix token.
-- Multigraphs are matched in the order they are listed (longer forms such
-- as "owe" are listed before their prefixes).
local function tokenise(word)
	-- split any suffixes from the base word before scanning
	local base_word, suffix = split_suffixes(word)
	-- respell vowel + consonant + e as vowel + _e + consonant for easier parsing
	base_word = gsub(base_word, "([^aeiou][aeiouy])([^aeiouwy])e([^aeiou])", "%1_e%2%3")
	base_word = gsub(base_word, "([^aeiou][aeiouy])([^aeiouwy])e$", "%1_e%2")
	local tokenised = {}
	local pos = 1
	while pos <= len(base_word) do
		-- try each multigraph at the current position, in list order
		local token = nil
		for _, multigraph in ipairs(multigraphs) do
			if sub(base_word, pos, pos + len(multigraph) - 1) == multigraph then
				token = multigraph
				break
			end
		end
		-- fall back to a single grapheme when no multigraph applies
		if not token then
			token = sub(base_word, pos, pos)
		end
		tokenised[#tokenised + 1] = token
		pos = pos + len(token)
	end
	-- the detached suffix becomes a single trailing token
	if suffix then
		tokenised[#tokenised + 1] = suffix
	end
	return tokenised
end
-- process phonemes for tokens
-- Convert an array of grapheme tokens into a concatenated phoneme string.
-- Contextual rules in `s` are consulted before the unconditional [false]
-- default; tokens without an entry in `s` pass through unchanged.
local function to_phonemes(tokens)
	local phonemes = {}
	for i = 1, #tokens do
		local char = tokens[i]
		-- ensure char is not nil and exists in table
		if char and s[char] then -- use s temporarily
			-- determine surrounding context
			local before = i > 1 and tokens[i - 1] or nil
			local after = i < #tokens and tokens[i + 1] or nil
			local replacement = nil
			-- Check the specific contextual conditions first.  pairs() iterates
			-- in unspecified order, so the [false] default must not be allowed
			-- to short-circuit a more specific rule (the old code broke on
			-- whichever key it met first).
			for pattern, repl in pairs(s[char]) do
				if type(pattern) == "table" then
					-- pattern[2] selects which neighbour to inspect
					local context = pattern[2] == -1 and before or after
					if not context then
						-- a nil pattern denotes a word boundary (no neighbour)
						if not pattern[1] then
							replacement = repl
							break
						end
					elseif pattern[1] then
						-- the neighbouring token must match the whole pattern;
						-- mw.ustring.match takes (string, pattern) in that order
						-- (the old code passed them reversed)
						if match(context, "^" .. pattern[1] .. "$") then
							replacement = repl
							break
						end
					end
				end
			end
			-- fall back to the unconditional [false] entry, then the raw token
			phonemes[#phonemes + 1] = replacement or s[char][false] or char
		else
			-- otherwise append char as is
			phonemes[#phonemes + 1] = char or ''
		end
	end
	return table.concat(phonemes)
end
-- generate IPA pronunciation of word
-- Generate the IPA transcription for a string of one or more words.
-- Accepts either a raw string or a frame/args table from #invoke.
function export.toIPA(entry)
	if type(entry) == "table" then
		entry = entry.args[1]
	end
	-- normalise to lowercase before processing
	entry = lower(entry)
	-- transcribe each whitespace-separated word independently
	local words = {}
	for word in gsplit(entry, "%s") do
		words[#words + 1] = to_phonemes(tokenise(word))
	end
	return table.concat(words, " ")
end
-- export function for IPA
-- Template-facing entry point; unwraps a frame/args table and delegates
-- the transcription work to toIPA.
function export.show(entry)
	if type(entry) == "table" then
		entry = entry.args[1]
	end
	return export.toIPA(entry)
end
return export