User:KassadBot/code
Appearance
Inflecto-bot
Taken from User:MewBot, with a few adaptations.
formbot.py
#!/usr/bin/env python
#coding: utf-8
# Copyright CodeCat 2010
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This script is based on parts from
# http://en.wiktionary.org/wiki/User:SemperBlottoBot/verbs
import wikipedia, re, string, sys
class GenericFormBot:
    """A generic class for Wiktionary form bots.

    This class is an abstract base class, and isn't meant to be instantiated
    directly. To use it, derive a new class from it, override the
    generateForms method with a proper definition, and provide a call to
    the base class constructor.

    Once you're ready to let it run, just call run() and it's all sorted.

    The purpose of this script is to provide automated generation of
    Wiktionary entries for inflected forms. It does this by fetching a
    Wiktionary page, then checking for the existence of certain templates
    on that page. If found, it extracts the necessary information from the
    template parameters, and passes it on to the generateForms method, which
    generates the forms (just as the templates themselves do) and uploads
    the result as new entries.

    It will either create a new page or append a new section to the
    page. It will skip the page if it already contains a section of the same
    type as the one being created.

    If the page already exists, it will add {{rfc-auto}} to it,
    so that the AutoFormat bot can automatically place the section in the
    proper place on the page.
    """

    def __init__(self, head, templates, pos, langCode, langName,
                 cleanupCat = None, simulation = False, force = False, verbose = False):
        """Store the bot configuration.

        head       -- title of the page that run() reads.
        templates  -- names of the templates to look for on that page.
        pos        -- part-of-speech header text used for new sections.
        langCode   -- language code passed to {{infl}}.
        langName   -- language name used for the level-2 header.
        cleanupCat -- optional category name; linked from entries that were
                      appended with force despite a failed check.
        simulation -- if true, report what would be saved but never save.
        force      -- if true, append even when a check says to skip.
        verbose    -- if true, print the text of every saved entry.
        """
        self._head = head
        self._templates = templates
        self._pos = pos
        self._langCode = langCode
        self._langName = langName
        self._cleanupCat = cleanupCat
        self._simulation = simulation
        self._force = force
        self._verbose = verbose

    def run(self):
        """Fetch a wiktionary entry and create entries from information in all form template occurrences."""
        page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), self._head)

        if not page.exists():
            wikipedia.output(u"Can't find page [[{0}]].".format(self._head))
            return

        contents = page.get()

        # Find all occurrences of form templates
        templates = getTemplates(contents, self._templates)

        if not templates:
            wikipedia.output(u"No form template on page [[{0}]].".format(self._head))
            return

        for temp in templates:
            wikipedia.output(u"Found: {0}".format(temp))
            name, params = parseTemplate(temp)
            self.makeFormEntries(name, params)

    def makeFormEntries(self, template, params):
        """Create entries from information in one form template."""
        forms = self.generateForms(template, params)

        if not forms:
            return

        # Never write an entry back onto the page the forms were read from.
        forms.pop(self._head, None)

        result = False

        # Merge the lists into a single string per entry
        for form, entries in forms.items():
            entry = '# ' + '\n# '.join(entries)
            # saveEntry must run for every form, so do not short-circuit it.
            if self.saveEntry(form, entry):
                result = True

        if not result:
            wikipedia.output(u"Note: Did not add any new entries from page [[{0}]].".format(self._head))

    def generateForms(self, template, params):
        """Override this in a derived class.

        Expected to return a dictionary mapping each form (page title) to a
        list of definition lines, or a false value when nothing can be
        generated. The base implementation returns None.
        """
        pass

    def saveEntry(self, title, entry):
        """Save a new entry to Wiktionary.

        Creates the page `title` with a new language section holding
        `entry`, or appends that section to an existing page. An existing
        page is skipped when it already contains `entry`, has more than one
        section for the language, has numbered etymology sections, or
        already has a section for this part of speech; with force enabled
        the last three checks only produce a warning (and the cleanup
        category, if configured, is added once).

        Returns True if the page was saved (or would have been, in
        simulation mode) and False if it was skipped.
        """
        page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), title)
        newContents = '=={0}==\n==={1}===\n{{{{infl|{2}}}}}\n\n'.format(self._langName, self._pos, self._langCode) + entry
        # The summary follows the configured part of speech instead of
        # always saying "verb".
        summary = u'Auto-generated {0} {1} forms'.format(self._langName, self._pos.lower())

        if page.exists():
            oldContents = page.get()

            if entry in oldContents:
                wikipedia.output(u"Skipped page [[{0}]]. Already contains the new entry.".format(title))
                return False

            langSections = getSections(oldContents, self._langName, 2)
            newContents = '\n\n----\n' + newContents

            if langSections:
                problems = []
                addRfc = False

                # There is more than one section for this language already.
                # The bot probably was here before!
                if len(langSections) > 1:
                    problems.append(u"More than one {0} section on page.".format(self._langName))

                if not problems or self._force:
                    # Only the first language section is inspected.
                    langContents = oldContents[langSections[0][0]:langSections[0][1]]

                    # Does the lang section have numbered etymologies?
                    if re.search(r'=== *Etymology \d+ *===', langContents, re.UNICODE):
                        problems.append(u"{0} section has numbered etymology sections.".format(self._langName))
                    else:
                        # Does the lang section have a section of this
                        # part of speech already in it?
                        posHeaders = [self._pos, self._pos + u' form']

                        # Special case... this happened to me once, so I might as well code it in
                        if self._pos == 'Verb':
                            posHeaders.append(u'Participle')

                        posRegex = u'=== *(?:{0}) *==='.format(u'|'.join(re.escape(header) for header in posHeaders))

                        if re.search(posRegex, langContents, re.UNICODE):
                            problems.append(u"Already has {0} {1} section.".format(self._langName, self._pos))
                        else:
                            addRfc = True

                if problems:
                    if not self._force:
                        wikipedia.output(u"Skipped page [[{0}]]. {1}".format(title, problems[0]))
                        return False

                    for problem in problems:
                        wikipedia.output(u"WARNING: Forced append to [[{0}]]. {1}".format(title, problem))

                    # Add the category once, however many checks failed.
                    if self._cleanupCat:
                        newContents += '\n[[' + self._cleanupCat + ']]'

                if addRfc:
                    newContents += '\n{{rfc-auto}}'
            else:
                newContents += '\n{{rfc-auto}}'

            if self._simulation:
                wikipedia.output(u"Simulated update to page [[{0}]].".format(title))
            else:
                page.put(oldContents + newContents, comment = summary + u' - appended', minorEdit = False)
        else:
            newContents += '\n{{count page|[[Wiktionary:Page count]]}}'

            if self._simulation:
                wikipedia.output(u"Simulated creating page [[{0}]].".format(title))
            else:
                page.put(newContents, comment = summary, minorEdit = True)

        if self._verbose:
            wikipedia.output(u"Page [[{0}]] new contents:\n".format(title) + '-' * 60, toStdout = True)
            wikipedia.output(newContents, toStdout = True)
            wikipedia.output('*' * 60, toStdout = True)

        return True
def getTemplates(contents, names):
    """Get all template calls to a specific set of templates from a page.

    contents -- the wikitext to search.
    names    -- the template names to look for; they are matched literally.

    Returns a list with, for every match, the text between the opening
    '{{' and the closing '}}' (template name followed by its parameters).
    A template call is only found when it sits on a single line.
    """
    # With no names the alternation would be empty and match every template.
    if not names:
        return []

    alternatives = '|'.join(re.escape(name) for name in names)

    # NOTE(review): '.*' is greedy, so when several templates share one line
    # the capture runs up to the last '}}' on that line. Kept as is because a
    # lazy match would instead cut off templates with nested '{{...}}'.
    pattern = r'\{\{\s*((?:' + alternatives + r').*)\s*\}\}'

    return [match.group(1) for match in re.finditer(pattern, contents, re.UNICODE)]
def parseTemplate(template):
    """Parse and convert parameters of a template into dictionaries.

    template -- the inside of a template call, e.g. 'name|a|b|key=value'.

    Returns a tuple (templateName, params). params maps parameter names to
    their stripped values: unnamed parameters are keyed by their 1-based
    position, named parameters by their name (converted to int when the
    name is a number). Parameters whose value is empty are left out.
    """
    parts = template.split('|')
    templateName = parts[0].strip()
    params = {}
    paramIndex = 1

    for part in parts[1:]:
        name, equals, value = part.partition('=')

        if equals:
            # Named parameter
            name = name.strip()

            # Is the name a number?
            try:
                name = int(name)
            except ValueError:
                pass

            value = value.strip()

            if value:
                params[name] = value
        else:
            # Unnamed parameter
            value = name.strip()

            if value:
                params[paramIndex] = value

            # An empty positional parameter still occupies its position.
            paramIndex += 1

    return templateName, params
def getSections(contents, name, level, inclHeader = True):
    """Get the start and end index of every section of a given name.

    contents   -- the wikitext to search.
    name       -- the header text of the wanted sections, matched literally.
    level      -- the header level, i.e. the number of '=' on each side.
    inclHeader -- if true the ranges start at the header, otherwise at the
                  first character after the header and its trailing
                  whitespace.

    Returns a list of (start, end) index pairs into contents, one per
    section found; the list is empty when there is no such section. A
    section ends just before the next header of the same level, or at the
    end of the text.
    """
    marker = '=' * level

    # The lookarounds on '=' keep a level-2 search from matching inside a
    # level-3 header. The end of the section is a lookahead so that the next
    # header is not consumed and a directly following section of the same
    # name is still found.
    sectionRegex = (
        u'(?<!=)({0} *{1} *{0}(?!=)\\s*)'
        u'(.*?)'
        u'(?=\\n{0} *[^\\n=]+ *{0}|$)'
    ).format(marker, re.escape(name))

    startGroup = 1 if inclHeader else 2

    return [
        (match.start(startGroup), match.end(2))
        for match in re.finditer(sectionRegex, contents, re.DOTALL | re.UNICODE)
    ]
germanverbformbot.py
#!/usr/bin/env python
#coding: utf-8
# Copyright CodeCat 2010
#also: Prince Kassad (sometime in 2010)
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This script is based on parts from
# http://en.wiktionary.org/wiki/User:SemperBlottoBot/verbs
import wikipedia, re
from formbot import *
class GermanVerbFormBot(GenericFormBot):
"""A form bot for German verb forms."""
def __init__(self, head, cleanupCat, simulation = False, force = False, verbose = False):
    """Set up the generic bot for German verbs.

    Fixes the part of speech to 'Verb', the language to 'de' / 'German' and
    the list of conjugation templates this bot knows how to handle; every
    other argument is handed to the base class unchanged.
    """
    conjugationTemplates = [
        'de-conj-weak',
        'de-conj-strong',
        'de-conj-weak-eln',
        'de-conj-weak-ern',
        'de-conj-pp',
        'de-conj-irr-stehen',
    ]

    GenericFormBot.__init__(
        self, head, conjugationTemplates, 'Verb', 'de', 'German',
        cleanupCat, simulation, force, verbose)
def generateForms(self, template, params):
    """Overrides base class method.

    Hands params to the conjugation method that belongs to the given
    template name and returns its result; returns None for a template
    name this bot does not handle.
    """
    # Template name -> name of the method that conjugates it.
    handlerNames = {
        'de-conj-weak': 'conjugateWeak',
        'de-conj-strong': 'conjugateStrong',
        'de-conj-weak-eln': 'conjugateElnVerb',
        'de-conj-weak-ern': 'conjugateErnVerb',
        'de-conj-pp': 'conjugatePpVerb',
        'de-conj-irr-stehen': 'conjugateStehenVerb',
    }

    handlerName = handlerNames.get(template)

    if handlerName is None:
        return None

    # Looked up only now, so an unknown template never touches self.
    return getattr(self, handlerName)(params)
def conjugateWeak(self, params):
    """Conjugate a German verb using {{de-conj-weak}}.

    Template parameters read from params:
      1 -- the stem all endings are attached to (required).
      2 -- the past participle (optional; no participle entry without it).
      4 -- if present, the e-inserting endings are used (-est, -et,
           -ete, ...); the code calls this stem type 'dt'.
      5 -- if present (and 4 is not), the 2nd person singular present
           ends in -t instead of -st; the code calls this stem type 'sz'.
      6 -- a particle, presumably the separable prefix: it is written
           after the finite forms, separated by a space, and a second set
           of entries tagged '|a' is made with it attached in front.

    Returns a dictionary mapping every generated word to the list of
    {{de-verb form of}} lines that describe it.

    Raises KeyError if parameter 1 is missing.
    """
    head = self._head

    if 6 in params:
        sep = params[6]
        sepSuf = ' ' + sep
    else:
        sep = ''
        sepSuf = ''

    # super special stem check
    dtStem = 4 in params
    szStem = not dtStem and 5 in params

    stem = params[1]
    pastPtc = params.get(2, '')

    # Make a dictionary of lists of the entries, with the word as key
    # That way we automatically group cases where two forms are identical
    forms = {}

    def add(word, tags):
        forms.setdefault(word, []).append('{{de-verb form of|' + head + '|' + tags + '}}')

    persons = ('1|s', '2|s', '3|s', '1|p', '2|p', '3|p')

    def addTense(endings, mood):
        # One ending per person, in the order of `persons`.
        for person, ending in zip(persons, endings):
            add(stem + ending + sepSuf, person + '|' + mood)

        if sep:
            for person, ending in zip(persons, endings):
                add(sep + stem + ending, person + '|' + mood + '|a')

    if dtStem:
        presentEndings = ('e', 'est', 'et', 'en', 'et', 'en')
        pastEndings = ('ete', 'etest', 'ete', 'eten', 'etet', 'eten')
    else:
        if szStem:
            presentEndings = ('e', 't', 't', 'en', 't', 'en')
        else:
            presentEndings = ('e', 'st', 't', 'en', 't', 'en')

        pastEndings = ('te', 'test', 'te', 'ten', 'tet', 'ten')

    # Present indicative
    addTense(presentEndings, 'g')

    # Past indicative
    addTense(pastEndings, 'v')

    # Present subjunctive
    addTense(('e', 'est', 'e', 'en', 'et', 'en'), 'k1')

    # Past subjunctive, which is a copy of the past indicative
    addTense(pastEndings, 'k2')

    # Imperative
    add(stem + 'e' + sepSuf, 'i|s')
    add(stem + ('et' if dtStem else 't') + sepSuf, 'i|p')

    # Participles
    add(sep + stem + 'end', 'pr')

    # Without parameter 2 there is no participle to make an entry for;
    # adding it anyway would create an entry titled '' (or just the particle).
    if pastPtc:
        add(sep + pastPtc, 'pp')

    return forms
def conjugateStrong(self, params):
    """Conjugate a German verb using {{de-conj-strong}}.

    Template parameters read from params:
      1    -- the present stem (required).
      2    -- the past stem (required).
      3    -- the past participle (optional; no participle entry without it).
      5    -- if present, the e-inserting endings are used (-est, -et,
              ...); the code calls this stem type 'dt'.
      6    -- stem of the 2nd and 3rd person singular present; defaults
              to the present stem.
      7    -- stem of the past subjunctive; defaults to the past stem.
      '7b' -- a second past subjunctive stem; its forms are added in
              addition to those of parameter 7.
      8    -- if present, the past takes the endingless set (bare stem in
              1st/3rd singular) and the past subjunctive takes -e, -est,
              ...; otherwise both take the -te endings.
      9    -- if present, the singular imperative has no -e.
      10   -- a particle, presumably the separable prefix: it is written
              after the finite forms, separated by a space, and a second
              set of entries tagged '|a' is made with it attached in
              front.
      11   -- if present, the singular imperative is built on the present
              stem instead of on the stem of parameter 6.
      12   -- if present (and 5 is not), 2nd person singular forms end in
              -t instead of -st; the code calls this stem type 'sz'.

    Returns a dictionary mapping every generated word to the list of
    {{de-verb form of}} lines that describe it.

    Raises KeyError if parameter 1 or 2 is missing.
    """
    head = self._head

    if 10 in params:
        sep = params[10]
        sepSuf = ' ' + sep
    else:
        sep = ''
        sepSuf = ''

    # super special stem check
    dtStem = 5 in params
    szStem = not dtStem and 12 in params

    stem = params[1]
    stemA = params[2]
    stemB = params[6] if 6 in params else stem
    stemC = params[7] if 7 in params else stemA
    stemD = stem if 11 in params else stemB
    pastPtc = params.get(3, '')
    endinglessPast = 8 in params

    # Make a dictionary of lists of the entries, with the word as key
    # That way we automatically group cases where two forms are identical
    forms = {}

    def add(word, tags):
        forms.setdefault(word, []).append('{{de-verb form of|' + head + '|' + tags + '}}')

    persons = ('1|s', '2|s', '3|s', '1|p', '2|p', '3|p')

    def addTense(words, mood):
        # One word per person, in the order of `persons`.
        for person, word in zip(persons, words):
            add(word + sepSuf, person + '|' + mood)

        if sep:
            for person, word in zip(persons, words):
                add(sep + word, person + '|' + mood + '|a')

    def withEndings(base, endings):
        return [base + ending for ending in endings]

    def subjunctive(base):
        return withEndings(base, ('e', 'est', 'e', 'en', 'et', 'en'))

    def weakPast(base):
        if dtStem:
            return withEndings(base, ('ete', 'etest', 'ete', 'eten', 'etet', 'eten'))
        return withEndings(base, ('te', 'test', 'te', 'ten', 'tet', 'ten'))

    # Present indicative
    if dtStem:
        if 6 in params:
            pres2Sg = stemB + 'st'
            pres3Sg = stemB
        else:
            pres2Sg = stemB + 'est'
            pres3Sg = stemB + 'et'
    elif szStem:
        pres2Sg = stemB + 't'
        pres3Sg = stemB + 't'
    else:
        pres2Sg = stemB + 'st'
        pres3Sg = stemB + 't'

    pres2Pl = stem + ('et' if dtStem else 't')

    addTense([stem + 'e', pres2Sg, pres3Sg, stem + 'en', pres2Pl, stem + 'en'], 'g')

    # Past indicative
    if endinglessPast:
        if dtStem:
            past = withEndings(stemA, ('', 'est', '', 'en', 'et', 'en'))
        elif szStem:
            past = withEndings(stemA, ('', 't', '', 'en', 't', 'en'))
        else:
            past = withEndings(stemA, ('', 'st', '', 'en', 't', 'en'))
    else:
        past = weakPast(stemA)

    addTense(past, 'v')

    # Present subjunctive
    addTense(subjunctive(stem), 'k1')

    # Past subjunctive
    addTense(subjunctive(stemC) if endinglessPast else weakPast(stemC), 'k2')

    # this handles a special case. some verbs seem to have two possible forms for past subjunctive.
    if '7b' in params:
        stemE = params['7b']
        addTense(subjunctive(stemE) if endinglessPast else weakPast(stemE), 'k2')

    # Imperative
    imperSg = stemD if 9 in params else stemD + 'e'
    add(imperSg + sepSuf, 'i|s')
    # The plural imperative equals the 2nd person plural present.
    add(pres2Pl + sepSuf, 'i|p')

    # Participles
    add(sep + stem + 'end', 'pr')

    # Without parameter 3 there is no participle to make an entry for;
    # adding it anyway would create an entry titled '' (or just the particle).
    if pastPtc:
        add(sep + pastPtc, 'pp')

    return forms
def conjugateElnVerb(self, params):
    """Conjugate a German verb using {{de-conj-weak-eln}}.

    Template parameters read from params:
      1 -- the stem without the final 'el'; every ending listed below
           is attached to it (required).
      2 -- the past participle (optional; no participle entry without it).
      4 -- a particle, presumably the separable prefix: it is written
           after the finite forms, separated by a space, and a second set
           of entries tagged '|a' is made with it attached in front.

    Where several variants exist for one person (e.g. -le, -ele and -el in
    the 1st person singular present) an entry is made for each of them.

    Returns a dictionary mapping every generated word to the list of
    {{de-verb form of}} lines that describe it.

    Raises KeyError if parameter 1 is missing.
    """
    head = self._head

    if 4 in params:
        sep = params[4]
        sepSuf = ' ' + sep
    else:
        sep = ''
        sepSuf = ''

    stem = params[1]
    pastPtc = params.get(2, '')

    # Make a dictionary of lists of the entries, with the word as key
    # That way we automatically group cases where two forms are identical
    forms = {}

    def add(word, tags):
        forms.setdefault(word, []).append('{{de-verb form of|' + head + '|' + tags + '}}')

    def addTense(personEndings, mood):
        for person, ending in personEndings:
            add(stem + ending + sepSuf, person + '|' + mood)

        if sep:
            for person, ending in personEndings:
                add(sep + stem + ending, person + '|' + mood + '|a')

    # Present indicative
    addTense((
        ('1|s', 'le'), ('1|s', 'ele'), ('1|s', 'el'),
        ('2|s', 'elst'),
        ('3|s', 'elt'),
        ('1|p', 'eln'),
        ('2|p', 'elt'),
        ('3|p', 'eln'),
    ), 'g')

    pastEndings = (
        ('1|s', 'elte'),
        ('2|s', 'eltest'),
        ('3|s', 'elte'),
        ('1|p', 'elten'),
        ('2|p', 'eltet'),
        ('3|p', 'elten'),
    )

    # Past indicative
    addTense(pastEndings, 'v')

    # Present subjunctive
    addTense((
        ('1|s', 'ele'), ('1|s', 'le'),
        ('2|s', 'elest'), ('2|s', 'lest'),
        ('3|s', 'ele'), ('3|s', 'le'),
        ('1|p', 'eln'),
        ('2|p', 'elet'), ('2|p', 'let'),
        ('3|p', 'eln'),
    ), 'k1')

    # Past subjunctive, which is a copy of the past indicative
    addTense(pastEndings, 'k2')

    # Imperative
    for ending in ('ele', 'le', 'el'):
        add(stem + ending + sepSuf, 'i|s')

    add(stem + 'elt' + sepSuf, 'i|p')

    # Participles
    add(sep + stem + 'elnd', 'pr')

    # Without parameter 2 there is no participle to make an entry for;
    # adding it anyway would create an entry titled '' (or just the particle).
    if pastPtc:
        add(sep + pastPtc, 'pp')

    return forms
def conjugateErnVerb(self, params):
    """Conjugate a German verb using {{de-conj-weak-ern}}.

    params is the dictionary of template parameters: 1 is the stem that the
    endings ("ere", "erst", "ern", ...) are attached to, 2 is the past
    participle and 4 is the separable prefix, if there is one.

    Returns a dictionary mapping every inflected form to the list of
    {{de-verb form of}} lines that describe it. Forms that are spelled the
    same share one key, so their descriptions are grouped automatically.

    Raises KeyError if parameter 1 is missing.
    """
    stem = params[1]
    pastPtc = params.get(2, '')
    # A prefix parameter that is present but empty means "not separable".
    # Treating it like a missing one keeps a stray trailing space out of
    # the main-clause keys.
    sep = params.get(4, '')
    sepSuf = ' ' + sep if sep else ''

    head = '{{de-verb form of|' + self._head + '|'
    forms = {}

    def addTense(entries, tense):
        # Main-clause forms first (prefix detached and placed after the
        # verb), then the subordinate-clause forms with the prefix attached.
        for ending, who in entries:
            forms.setdefault(stem + ending + sepSuf, []).append(
                head + who + '|' + tense + '}}')
        if sep:
            for ending, who in entries:
                forms.setdefault(sep + stem + ending, []).append(
                    head + who + '|' + tense + '|a}}')

    # The first person singular has three alternative spellings.
    present = [('re', '1|s'), ('ere', '1|s'), ('er', '1|s'),
               ('erst', '2|s'), ('ert', '3|s'),
               ('ern', '1|p'), ('ert', '2|p'), ('ern', '3|p')]
    past = [('erte', '1|s'), ('ertest', '2|s'), ('erte', '3|s'),
            ('erten', '1|p'), ('ertet', '2|p'), ('erten', '3|p')]
    presSubj = [('ere', '1|s'), ('re', '1|s'),
                ('erest', '2|s'), ('rest', '2|s'),
                ('ere', '3|s'), ('re', '3|s'),
                ('ern', '1|p'),
                ('eret', '2|p'), ('ret', '2|p'),
                ('ern', '3|p')]

    addTense(present, 'g')
    addTense(past, 'v')
    addTense(presSubj, 'k1')
    # The past subjunctive uses the same forms as the past indicative.
    addTense(past, 'k2')

    # Imperative (main-clause form only)
    for ending in ('ere', 're', 'er'):
        forms.setdefault(stem + ending + sepSuf, []).append(head + 'i|s}}')
    forms.setdefault(stem + 'ert' + sepSuf, []).append(head + 'i|p}}')

    # Participles
    forms.setdefault(sep + stem + 'ernd', []).append(head + 'pr}}')
    # Without a past participle there is nothing to create; adding it anyway
    # would register the empty string (or the bare prefix) as a verb form.
    if pastPtc:
        forms.setdefault(sep + pastPtc, []).append(head + 'pp}}')
    return forms
def conjugatePpVerb(self, params):
    """Conjugate a German verb using {{de-conj-pp}}.

    These verbs are never separable, so no prefix handling is done.

    params is the dictionary of template parameters. Parameters 1 to 4 are
    required: 1 is the stem used for the present plural, the present
    subjunctive and the present participle, 2 the present singular stem,
    3 the past indicative stem and 4 the past subjunctive stem. 5 is the
    past participle. If parameter 7 is present, the second person singular
    present takes the ending "t" instead of "st".

    Returns a dictionary mapping every inflected form to the list of
    {{de-verb form of}} lines that describe it; identical forms share a key.
    No imperative forms are generated for these verbs.

    Raises KeyError if any of parameters 1 to 4 is missing.
    """
    pres2SgEnding = 't' if 7 in params else 'st'
    stem = params[1]
    stemA = params[2]
    stemB = params[3]
    stemC = params[4]
    pastPtc = params.get(5, '')

    head = '{{de-verb form of|' + self._head + '|'
    persons = ('1|s', '2|s', '3|s', '1|p', '2|p', '3|p')
    weakEndings = ('te', 'test', 'te', 'ten', 'tet', 'ten')
    forms = {}

    def addTense(words, tense):
        # words holds the six forms in the order given by persons.
        for word, who in zip(words, persons):
            forms.setdefault(word, []).append(head + who + '|' + tense + '}}')

    # Present indicative: 1st and 3rd person singular are the bare stem;
    # all three plural forms are generated.
    addTense([stemA, stemA + pres2SgEnding, stemA,
              stem + 'en', stem + 't', stem + 'en'], 'g')
    # Past indicative
    addTense([stemB + ending for ending in weakEndings], 'v')
    # Present subjunctive
    addTense([stem + ending
              for ending in ('e', 'est', 'e', 'en', 'et', 'en')], 'k1')
    # Past subjunctive
    addTense([stemC + ending for ending in weakEndings], 'k2')

    # Participles
    forms.setdefault(stem + 'end', []).append(head + 'pr}}')
    # Without a past participle there is nothing to create; adding it anyway
    # would register the empty string as a verb form.
    if pastPtc:
        forms.setdefault(pastPtc, []).append(head + 'pp}}')
    return forms
def conjugateStehenVerb(self, params):
    """Conjugate a German verb using {{de-conj-irr-stehen}}.

    All stems are hardcoded; the only variable parts are the prefixes.
    params is the dictionary of template parameters: 2 is the separable
    prefix and 3 the inseparable prefix. Either may be missing or empty.

    Returns a dictionary mapping every inflected form to the list of
    {{de-verb form of}} lines that describe it; identical forms share a key.
    Two sets of past subjunctive forms are generated, one from each of the
    two past subjunctive stems.
    """
    # A prefix parameter that is present but empty is treated like a missing
    # one. (To pass parameter 3 positionally, parameter 2 has to be given
    # even if it is empty.) Otherwise the main-clause keys would end in a
    # stray space and the participle would lose its "ge".
    sep = params.get(2, '')
    sepSuf = ' ' + sep if sep else ''
    insep = params.get(3, '')

    stem = insep + 'steh'
    stemA = insep + 'stand'
    stemC = insep + u'ständ'
    stemE = insep + u'stünd'
    # With an inseparable prefix the participle takes no "ge".
    pastPtc = insep + 'standen' if insep else 'gestanden'

    head = '{{de-verb form of|' + self._head + '|'
    persons = ('1|s', '2|s', '3|s', '1|p', '2|p', '3|p')
    subjEndings = ('e', 'est', 'e', 'en', 'et', 'en')
    forms = {}

    def addTense(base, endings, tense):
        # Main-clause forms first (prefix detached and placed after the
        # verb), then the subordinate-clause forms with the prefix attached.
        words = [base + ending for ending in endings]
        for word, who in zip(words, persons):
            forms.setdefault(word + sepSuf, []).append(
                head + who + '|' + tense + '}}')
        if sep:
            for word, who in zip(words, persons):
                forms.setdefault(sep + word, []).append(
                    head + who + '|' + tense + '|a}}')

    # Present indicative
    addTense(stem, ('e', 'st', 't', 'en', 't', 'en'), 'g')
    # Past indicative
    addTense(stemA, ('', 'est', '', 'en', 'et', 'en'), 'v')
    # Present subjunctive
    addTense(stem, subjEndings, 'k1')
    # Past subjunctive, in both variants
    addTense(stemC, subjEndings, 'k2')
    addTense(stemE, subjEndings, 'k2')

    # Imperative (main-clause form only)
    forms.setdefault(stem + 'e' + sepSuf, []).append(head + 'i|s}}')
    forms.setdefault(stem + 't' + sepSuf, []).append(head + 'i|p}}')

    # Participles
    forms.setdefault(sep + stem + 'end', []).append(head + 'pr}}')
    forms.setdefault(sep + pastPtc, []).append(head + 'pp}}')
    return forms
mewbot.py
[edit]#!/usr/bin/env python
#coding: utf-8
# Copyright CodeCat 2010
#also: Prince Kassad (sometime in 2010)
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This script is based on parts from
# http://en.wiktionary.org/wiki/User:SemperBlottoBot/verbs
import wikipedia, re, string, sys
from germanverbformbot import *
class MewBot:
    """Wrapper holding everything that is specific to this bot account.

    To reuse it for another bot, only the run method should need small
    adjustments; the rest of the class can stay as it is.
    """

    def __init__(self, list, simulation, force, verbose):
        """Store the entries to process and the three option flags."""
        self._list, self._simulation = list, simulation
        self._force, self._verbose = force, verbose

    def run(self):
        """Run a GermanVerbFormBot on every entry.

        If no entries were supplied, they are read from the "Requests"
        section of User:KassadBot first.
        """
        if not self._list:
            self._list = self.getList(u'User:KassadBot', u'Requests')
        for entry in self._list:
            GermanVerbFormBot(entry, 'Category:Requests for cleanup (MewBot)',
                              self._simulation, self._force, self._verbose).run()

    def getList(self, pageName, sectionName):
        """Return the entry names listed in one section of a wiki page.

        The section is read line by line; every line is expected to look
        like "# [[entry]]". Reading stops at the first line that does not.
        An empty list is returned if the page or the section cannot be found.
        """
        listPage = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), pageName)
        if not listPage.exists():
            wikipedia.output(u"Oh noes! Can't find list page!")
            return []
        text = listPage.get()
        found = getSections(text, sectionName, 2, False)
        if not found:
            wikipedia.output(u"Can't find the {0} section on the list page!".format(sectionName))
            return []
        # The first match carries the start and end offsets of the section.
        start, end = found[0][0], found[0][1]
        entries = []
        for line in text[start:end].strip().split('\n'):
            linked = re.search(u'# *\\[\\[:?(.+)\\]\\]', line, re.UNICODE)
            # We're done
            if not linked:
                break
            entries.append(linked.group(1))
        return entries
def main():
    """Parse the command line and run the bot.

    Arguments starting with "-" are bundles of one-letter options:
    f (force), s (simulation) and v (verbose). An unknown option is
    reported and nothing is run. Every other argument is taken as the
    name of an entry to process.
    """
    entries = []
    flags = {'f': False, 's': False, 'v': False}
    for arg in sys.argv[1:]:
        if arg[0] == '-':
            for opt in arg[1:]:
                if opt not in flags:
                    wikipedia.output("Unknown option: {0}".format(opt))
                    return
                flags[opt] = True
        else:
            # Entry names are decoded as UTF-8, falling back to Latin-1.
            try:
                arg = unicode(arg, 'utf-8')
            except UnicodeDecodeError:
                arg = unicode(arg, 'iso8859-1')
            entries.append(arg)
    bot = MewBot(entries, flags['s'], flags['f'], flags['v'])
    bot.run()
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # wikipedia.stopme() is called on the way out even if main() raised.
        wikipedia.stopme()
AutoFormat
[edit]This has been adapted from User:AutoFormat/code.
autoformat.py
[edit]#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot looks for entries tagged for autoformatting, does a number of tasks
No command line arguments.
"""
import wikipedia
import catlib
import sys
import re
import pickle
import time
import xmlreader
import socket
def safe(s):
    """Return the pickled form of s without its first character and last five.

    The result is interpolated into progress messages elsewhere in this file.

    NOTE(review): this presumably relies on the default text pickle of a
    unicode string under Python 2 ("V" + escaped text + "\\np0\\n."), so that
    the slice leaves only the escaped text -- confirm before running this
    under another pickle protocol or Python version.
    """
    return pickle.dumps(s)[1:-5]
def lkey(l):
    """Return the sort key for a language header.

    Square brackets around the name are ignored. The key is the name with a
    one-digit rank in front of it: 0 for Translingual, 1 for English, 2 for
    every ordinary language, and fixed ranks for a few headers that should
    not be language headers at all. An empty name gives an empty key.
    """
    name = l.strip('[]')
    if not name:
        return name

    # These two are matched exactly, including case.
    exact = {'Translingual': '0', 'English': '1'}
    if name in exact:
        return exact[name] + name

    # bad L2 headers, matched regardless of case
    misplaced = {'cyrillic alphabet': '0',
                 'arabic alphabet': '0',
                 'see also': '3',
                 'references': '4'}
    lowered = name.lower()
    if lowered in misplaced:
        return misplaced[lowered] + name

    # handle names like !Kung and 'Auhelawa: move non-alpha to the end of key
    if not name[0].isalpha():
        name = name[1:] + name[0]
    return '2' + name
# this needs to be done here, else we won't be able to use the variables in subroutines which is bad
# make sure we are logged in
site = wikipedia.getSite("en", "wiktionary")
# log in twice: once with sysop = True and once with sysop = False
site.forceLogin(sysop = True)
site.forceLogin(sysop = False)
# get our config pages, throw exceptions: we have to stop if we can't read these
print "read languages"
page = wikipedia.Page(site, "User:AutoFormat/Languages")
langtab = page.get()
print "read headers"
page = wikipedia.Page(site, "User:AutoFormat/Headers")
headtab = page.get()
print "read contexts"
page = wikipedia.Page(site, "User:AutoFormat/Contexts")
ctxtab = page.get()
# Lcodes: language code -> language name
# Ltocode: language name -> first code listed for that name
Lcodes = { }
Ltocode = { }
# a row of the languages page: "| code[,code...]||name"
relangtab = re.compile(r'\| (.*?)\|\|(.*)')
# i counts the codes found (a row may list several, separated by commas)
i = 0
for line in langtab.splitlines():
    mo = relangtab.match(line)
    if mo:
        for code in mo.group(1).split(','):
            Lcodes[code.strip()] = mo.group(2).strip()
            i += 1
        Ltocode[mo.group(2).strip()] = mo.group(1).split(',')[0].strip()
print "found %d language codes" % i
# treat a couple of other codes as Mandarin etc, since they are in cats:
Lcodes['zh-cn'] = 'Mandarin'
Lcodes['zh-tw'] = 'Mandarin'
Lcodes['nan-cn'] = 'Min Nan'
Lcodes['nan-tw'] = 'Min Nan'
Lcodes['yue-cn'] = 'Cantonese'
Lcodes['yue-hk'] = 'Cantonese'
# NOTE(review): looks like a map from script template names to script
# codes; its use is not visible in this part of the file -- confirm
Scripts = { 'ARchar' : 'Arab',
            'Cuneiform' : 'Xsux',
            'ELchar' : 'Grek',
            'FAchar' : 'fa-Arab',
            'HEchar' : 'Hebr',
            'JAchar' : 'Jpan',
            'KMchar' : 'Khmr',
            'LOchar' : 'Laoo',
            'RUchar' : 'Cyrl',
            'THchar' : 'Thai',
            'URchar' : 'ur-Arab',
            'ZHchar' : 'Hani',
            'ZHsim' : 'Hans',
            'ZHtra' : 'Hant' }
# NOTE(review): PSK, AH, Regex and Prex start out empty here; where they
# are filled and read is not visible in this part of the file
PSK = { }
from random import random
from math import log as ln
AH = set()
#newpages = set()
Regex = { }
Prex = {}
# work cache, record time last looked at entry
# each record is key: lc:word, pickled with safe(), value is integer time()
import shelve
cache = shelve.open("af-cache")
def prescreen():
    """Yield titles of random main-namespace pages, without ever finishing.

    Each round asks the API for ten random pages and yields their titles,
    skipping any title that contains a colon. If the request fails or the
    reply looks incomplete, the round is retried after a 30 second pause.
    """
    titlePattern = re.compile(r'title="(.+?)"')
    while True: # indef repeat
        wiki = wikipedia.getSite("en", "wiktionary")
        #we read the random pages to get pages to edit. it seems more efficient than
        #using the dump to me.
        print('(%d, reading random pages)' % now())
        try:
            reply = wiki.getUrl("/w/api.php?action=query&list=random&format=xml&rnnamespace=0" +
                                "&rnlimit=10", sysop = True)
        except wikipedia.NoPage:
            print("Can't get random pages from en.wikt!")
            time.sleep(30)
            continue
        if '</random>' not in reply:
            print("some bad return from random pages, end tag not found")
            time.sleep(30)
            continue
        for title in titlePattern.findall(reply):
            # other stray stuff in NS:0 is skipped
            if ':' not in title:
                yield title
# Wall-clock reference point, taken once when the module is loaded.
_NOW_START = time.time()

def now():
    """Return the whole number of seconds elapsed since the module was loaded.

    The value is only used to schedule and report events relative to each
    other, so it has to follow wall-clock time. time.clock() did that only
    on Windows: on Unix it measured processor time, which barely advances
    while the bot sleeps, and it no longer exists in Python 3.8 and later.
    """
    return int(time.time() - _NOW_START)
# share timer with main
naptime = 0
def rcpages(site):
    # generator which yields recentchanges, but not unpatrolled changes
    # also entries in category
    # in between, yields pages that satisfy the prescreen in random order
    #
    # NOTE(review): the indentation of this listing was lost; the nesting
    # below is reconstructed from the control flow. In particular, confirm
    # the level of the final "(%d, %d held)" print.
    global naptime
    # the site argument is replaced straight away by a fresh site object
    site = wikipedia.getSite("en", "wiktionary")
    cat = catlib.Category(site, "Category:Requests for autoformat")
    # titles that have already been yielded or put on hold
    seen = set()
    # now() values after which the category / recent changes are read again;
    # starting in the past makes both happen on the first pass
    nextcat = now() - 1
    nextrc = now() - 1
    # title -> now() value after which the page may be yielded
    hold = { }
    rcex = re.compile(r'title="(.+?)"')
    # one pass per random title; prescreen() never ends, so neither does this
    for title in prescreen():
        seen.add(title)
        print '(%d, from random pages)' % now()
        page = wikipedia.Page(site, title)
        yield page
        # nf: pages found on this pass (category members and new rc titles)
        # nd: held pages yielded on this pass
        nf = 0
        nd = 0
        # get our category, every 10-15 minutes or so
        if now() > nextcat:
            #cat.catlist(purge = True)
            #attn Kassad: raised priority of autoformat category - it seems too stuffed up to me. previous was 7
            for page in cat.articles():
                nf += 1
                if nf > 500: break # just munch the cat, not too hungry ;-)
                # if len(hold) > 100 and nf > 1: break # try to keep up, cat can wait? needed?
                print '(%d)' % now()
                seen.add(page.title())
                # a page yielded from the category no longer needs to be held
                if page.title() in hold: del hold[page.title()]
                yield page
            nextcat = now() + 740
        # recent changes
        #reducing duration
        if now() > nextrc:
            print '(%d, reading recent changes)' % now()
            try:
                rct = site.getUrl("/w/api.php?action=query&list=recentchanges&format=xml&rcprop=title" +
                    "&rclimit=5000&rcshow=patrolled|!bot&rctype=edit|new&rcnamespace=0", sysop = True)
            except wikipedia.NoPage:
                print "Can't get recentchanges from en.wikt!"
                rct = ''
                time.sleep(30)
                continue
            if '</recentchanges>' not in rct:
                print "some bad return from recentchanges, end tag not found"
                rct = ''
                time.sleep(30)
                continue
            nextrc = now() + 600
            # ht: delay given to the next new title; starts at 200 and grows
            # by 12 for every title put on hold
            ht = 200
            for title in rcex.findall(rct):
                if ':' in title: continue # other stray stuff in NS:0
                if title not in seen:
                    seen.add(title)
                    hold[title] = now() + ht
                    # scatter out into future ... (numbers fairly arbitrary, but work well)
                    ht += 12
                    if ht > 21 * 3600: ht /= 7 # ? if more than most of a day
                    nf += 1
                    print "found: [%s] hold until %d" % (safe(title), hold[title])
        # yield the held pages whose time has come, in title order
        pastime = now()
        for title in sorted(hold):
            # 10 on a pass is enough
            if nd > 9: break
            if hold[title] > pastime: continue
            print '(%d, rc held to %d)' % (now(), hold[title])
            del hold[title]
            nd += 1
            page = wikipedia.Page(site, title)
            yield page
        # nothing found and nothing delivered on this pass: take a nap
        if not nd and not nf and naptime > 5:
            naptime = min(naptime, 340) # max to keep timers running
            print "(%d, sleeping %d)" % (now(), naptime)
            # also rely on put throttle
            time.sleep(naptime)
        print '(%d, %d held)' % (now(), len(hold))
        continue
# now have some serious recursion fun!
# fuzzy returns string match score
# r is min required, calls may have neg r, may return value < r
def fuzzy(a, b, r):
    """Return a score for how well the strings a and b match.

    Identical strings score their length. Otherwise one point is earned for
    every character that can be paired up, working inwards from both ends
    and skipping characters that cannot be matched. The score is 0 if either
    string is empty or shorter than r.
    """
    if not a or not b:
        return 0
    if len(a) < r or len(b) < r:
        return 0
    if a == b:
        return len(a)
    # matching first or last characters are consumed directly
    if a[0] == b[0]:
        return 1 + fuzzy(a[1:], b[1:], r - 1)
    if a[-1] == b[-1]:
        return 1 + fuzzy(a[:-1], b[:-1], r - 1)

    # try with each char forward: look for b's first character in a ...
    at = a.find(b[0])
    scoreA = 1 + fuzzy(a[at + 1:], b[1:], r - 1) if at >= 0 else 0
    # ... and for a's first character in b
    at = b.find(a[0])
    scoreB = 1 + fuzzy(b[at + 1:], a[1:], r - 1) if at >= 0 else 0

    # no match either/or way, skip this char, one or both
    if scoreA and scoreB:
        skipped = 0
    elif scoreA:
        skipped = fuzzy(a[1:], b, r)
    elif scoreB:
        skipped = fuzzy(a, b[1:], r)
    else:
        skipped = fuzzy(a[1:], b[1:], r)
    return max(skipped, scoreA, scoreB)
def infline(title, lang, header):
    """Return a default {{infl}} line for an entry that lacks one.

    title is the page title, lang the language code and header the part of
    speech header, either plain or as a template (in which case the template
    name is used). The part of speech is lower-cased. Depending on the
    language and on the first character of the title, a script code, a
    cleanup category or an {{attention}} request is added.
    """
    pos = header.lower()
    if pos.startswith('{{'):
        pos = pos[2:-2].split('|')[0]

    if lang == 'en' and pos in ['verb', 'noun', 'adjective', 'adverb']:
        return "{{infl|en|" + pos + "}}[[Category:English "+ pos +"s that lack inflection template]]"

    first = ord(title[0:1])

    # Arabic:
    if 0x0600 <= first < 0x0780:
        return "{{infl|%s|%s|sc=Arab}}" % (lang, pos)

    # Han:
    # this is planes 1-2, needs closer check
    if 0x3400 <= first < 0xA000 or 0xd800 <= first < 0xdc00:
        hanLines = {
            'ko': "{{infl|%s|%s|sc=Hant}}{{attention|ko|may need inflection template}}",
            'ja': "{{infl|%s|%s|sc=Jpan}}{{attention|ja|needs inflection template}}",
            'vi': "{{infl|%s|%s|sc=Hant}}{{attention|vi|may need inflection template}}",
        }
        line = hanLines.get(lang, "{{infl|%s|%s|sc=Hani}}{{attention|zh|needs inflection template}}")
        return line % (lang, pos)

    # any other script
    if lang == 'ja':
        line = "{{infl|%s|%s}}{{attention|ja|needs inflection template}}"
    elif lang == 'ko':
        line = "{{infl|%s|%s}}{{attention|ko|may need inflection template}}"
    elif lang in ['zh', 'cmn', 'yue', 'nan']:
        line = "{{infl|%s|%s}}{{attention|zh|may need inflection template}}"
    else:
        line = "{{infl|%s|%s}}"
    return line % (lang, pos)
# qualifier words that cpar() accepts in front of a context label
MOD = [ 'chiefly', 'coarse', 'especially', 'extremely', 'frequently', 'generally', 'mainly', 'markedly',
        'mildly', 'mostly', 'often', 'particularly', 'primarily', 'sometimes', 'usually', 'very' ]
# match a [[wiki link]], capturing the text between the brackets
reunlink = re.compile(r'\[\[(.*?)\]\]')
# match a simple context, words but no odd punctuation etc
resimctx = re.compile(r'[-\w ]*$')
# prefixes that cpar() tests for with startswith()
PRETULIP = ('of ', 'by ')
def cpar(cstr, ctxs):
    """Convert a context string to template name(s).

    cstr is split at commas, semicolons and pipes. Each piece may start
    with any number of qualifier words from MOD, followed by a label that is
    looked up (lower-cased) in ctxs. The result is the qualifiers and
    template names joined by "|". An empty string is returned as soon as a
    piece cannot be converted.
    """
    parts = []
    for cs in re.sub(r'[,;\|]+', ',', cstr).split(','):
        cs = cs.strip(" '")
        if '[' in cs:
            cs = reunlink.sub(r'\1', cs)

        # handles n modifiers, does context? yes.
        word = cs.split(' ')[0].lower()
        while word in MOD:
            parts.append(word)
            cs = cs[len(word):].strip()
            word = cs.split(' ')[0].lower()

        label = cs.lower()
        if label in ctxs:
            parts.append(ctxs[label])
        elif cs.startswith(PRETULIP):
            # "of ..." / "by ..." needs the generic template in front
            if not parts:
                parts.append('context')
            parts.append(cs)
        elif parts and resimctx.match(cs):
            # an unknown but simple label is accepted after a known part
            parts.append(cs)
        else:
            return ''
    return '|'.join(parts).rstrip('|')
def ibsub(imo):
    """Return the replacement text for a match object.

    Group 1 of the match is a prefix that is kept, group 2 the text that
    becomes the template arguments and group 3 decides the template: {{i-c}}
    if it is a colon, {{i}} otherwise. Links in group 2 are unlinked and its
    commas are turned into pipes.
    """
    prefix, inner, closer = imo.group(1), imo.group(2), imo.group(3)
    # Piped links are not handled here; the calling pattern is expected to
    # keep pipes out of the match for now.
    args = re.sub(r',\s*', '|', reunlink.sub(r'\1', inner))
    template = 'i-c' if closer == ':' else 'i'
    return prefix + '{{' + template + '|' + args + '}}'
def sdif(a, b):
# returns -(a stuff) +(b stuff) when one change
i = 0
while a[i:i+1] and a[i:i+1] == b[i:i+1]: i += 1
an = a[i:]
bn = b[i:]
j = 1
while j < len(an) and an[-j:] == bn[-j:]: j += 1
j -= 1
# special case: improve on -}} {{ +| :
if j >= 3 and an.startswith('}} {{') and bn[:-j].endswith('|'):
an = a[i-3:]
bn = b[i-3:]
j -= 3
# return '-' + a[i-3:][:11] + ' +' + b[i-3:][:7] # gaa ...
if j: return '-' + an[:-j] + ' +' + bn[:-j]
else: return '-' + an + ' +' + bn
# okay, try that! not so pretty is it?
# sort language sections:
# Each of the next three patterns captures the text that follows "* " on a
# line of a translation table.
retransline = re.compile(r'\* \[*([^\]:\{\}]+?)\]*:') # match an already canonicalized line
retransreq = re.compile(r'\* \{\{trreq\|([^\}]+?)\}\}') # trans req template
retranstbc = re.compile(r'\* \{\{ttbc\|([^\}]+?)\}\}') # trans to be checked, allow here?
# The next three patterns are used by nlen() to drop markup before counting:
# the opening of a template up to its first pipe ("{{name|"),
redetemp = re.compile(r'\{\{\w*\|')
# any single brace, pipe or square bracket,
redechar = re.compile(r'[\{\}\|\[\]]')
# and an HTML comment
redecomm = re.compile(r'<!--.*?-->')
def nlen(s):
# simplest form:
# return 1 + len(s)/135 # +1 for each length of line that will probably wrap (WAG)
# this routine can be twaeked more if needed
# better:
s2 = redetemp.sub('', s)
s2 = redechar.sub('', s2)
s2 = redecomm.sub('', s2)
# dbg:
# if len(s2) >= 85: print "long line (%d): %s" % (1+len(s2)/85, safe(s2))
return 1 + len(s2)/85
# reduce text to "safe" for wiki as a template parameter:
# matches runs of braces, brackets, pipes and angle brackets
# (transort replaces each run with a space)
rewsafe = re.compile(r'[\{\}\[\]\|\<\>]+')
# match a see-only case:
# group 1 = trans-top parameter (gloss), group 2 = the [[link]] after "see",
# group 3 = everything that follows the link (re.S, so across lines)
reseeonly = re.compile(r"\{\{trans-top\|(.+?)\}\}\n+[ :']*[Ss]ee[ ':]*(\[\[.+?\]\])(.*)$", re.S)
def transort(tmo):
    """Regex substitution callback: sort one translations section by language.

    tmo -- match object whose group(0) is the text of one section, from
           {{trans-top...}} through {{trans-bottom}}

    Returns the replacement text, one of:
      * '{{trans-see|...}}' when the section only holds a "see [[x]]" pointer;
      * '{{rfc-tsort|<problem>}}' followed by the unchanged section when a
        line could not be parsed or a language occurs twice;
      * the section rebuilt with languages sorted by lkey and {{trans-mid}}
        placed to balance the two columns.
    """
    section = tmo.group(0)
    ts = { }    # language -> text of its line(s); '' holds the trans-top line
    tsk = { }   # language -> estimated number of display lines (from nlen)
    # take apart by language, treat header as "language" nil
    prob = ''
    prev = ''
    k = 0       # total estimated display lines
    for tline in section.splitlines():
        if tline.startswith('{{trans-top'):
            if '' in ts:
                prob = "trans-top found inside section, missing trans-bottom?"
                break
            ts[''] = tline
            tsk[''] = 0
            continue
        if tline == '{{trans-mid}}': continue
        if tline == '{{trans-bottom}}': continue
        if not tline: continue
        mo = retransline.match(tline)
        if not mo: mo = retransreq.match(tline)
        if not mo: mo = retranstbc.match(tline)
        if mo:
            lang = mo.group(1)
            if lang.startswith('{{'):
                prob = "unexpected template: " + lang
                break
            # map a language code to its name BEFORE the duplicate test;
            # otherwise e.g. a 'fr' line silently overwrites an existing
            # 'French' entry and that line is lost
            if lang in Lcodes:
                lang = Lcodes[lang]
            if lang in ts:
                prob = "duplicate language: " + lang
                break
            ts[lang] = tline
            nl = nlen(tline)
            tsk[lang] = nl
            k += nl
            prev = lang
            continue
        if tline.startswith('* '):
            prob = "unparsed language line: " + tline
            break
        # [tbd: treat ** as a sub language, eg key is "Chinese | Mandarin"]
        if tline.startswith('*:') or tline.startswith('**'): # allow both here
            # continuation line, stays attached to the preceding language
            ts[prev] += '\n' + tline
            nl = nlen(tline)
            tsk[prev] += nl
            k += nl
            continue
        if tline.startswith(': ') and not prev: # e.g. : ''see'' reference
            ts[prev] += '\n' + tline
            tsk[prev] += 1
            k += 1
            continue
        if tline.startswith('<!--') and not prev:
            ts[prev] += '\n' + tline
            # no addition to counts
            continue
        prob = "unknown line format: " + tline
        break
    # pick up see-only case before looking at prob:
    if not prev:
        # no languages found
        mo = reseeonly.match(section)
        if mo:
            print("matched see in trans section")
            gloss = mo.group(1).strip() # leaves ''s as an issue
            target = mo.group(2).strip()
            if '#' not in target and '|' not in target: target = target.strip('[]')
            # check remainder: only the closing templates may follow
            rest = mo.group(3)
            rest = rest.replace("{{trans-mid}}", '')
            rest = rest.replace("{{trans-bottom}}", '')
            if not rest.strip(" '\n"):
                # NOTE(review): unlike the other return paths this text has
                # no trailing newline -- confirm that is what callers expect
                if gloss == target: return "{{trans-see|" + target + "}}"
                else: return "{{trans-see|" + gloss + "|" + target + "}}"
            # otherwise something else is present: fall through
    if prob:
        print("in trans section, %s" % safe(prob))
        prob = rewsafe.sub(' ', prob) # wiki-safe ;-)
        return "{{rfc-tsort|" + prob + "}}\n" + section # rfc tag + unchanged
    # re-assemble, balance columns
    m = 0
    tsnew = ''
    for lang in sorted(ts, key=lkey):
        tsnew += ts[lang] + '\n'
        m += tsk[lang]
        # floor division keeps the half-way mark an integer under true division
        if k and m >= (k + 1) // 2:
            tsnew += '{{trans-mid}}\n'
            k = 0   # emit trans-mid once only
    if '{{trans-mid}}' not in tsnew: tsnew += '{{trans-mid}}\n' # blank section
    tsnew += '{{trans-bottom}}\n'
    return tsnew
def prokey(s):
    """Return the one-character sort key for a prolog line s.

    Simple prolog sort (it relies on sorted() being stable, true as of
    Python 2.3): LHS after RHS, unknown in the middle. Blank lines get
    '9'; a non-blank line with no known prefix is reported and gets '5'.
    """
    ranking = (
        (('{{was wotd',), '0'),                             # moved in monobook
        (('{{wiki', '{{commons', '{{inter'), '1'),          # sister templates
        (('{{zh-', '{{ja-'), '2'),                          # Chinese/Japanese floatright
        # images; both the older Image and the newer File names are accepted
        (('[[Image', '[[image', '[[File', '[[file'), '3'),
        # LHS:
        (('{{selfref',), '6'),
        (('{{also', '{{xsee', '{{xalso'), '7'),
    )
    for prefixes, key in ranking:
        if s.startswith(prefixes):
            return key
    if not s:
        return '9' # blank lines usually are at end, will be removed
    print("prolog sort: no key for %s" % safe(s))
    return '5'
def main():
global naptime
socket.setdefaulttimeout(30)
# regex precomp, force headers to canonical:
# first allows singleton =
rehead1 = re.compile(r'(={2,6})(.+?)={2,6}(.*)$')
rehead2 = re.compile(r'(={1,6})([^=<]+?)={1,6}(.*)$')
rehead3 = re.compile(r'(={1,6})([^=<]+?)=+(.*)$')
rehead4 = re.compile(r'(=+)([^=<]+)(.*)$')
realleq = re.compile(r'=+$')
# L2 headers
reL2head = re.compile(r'==?\s*([^=]+)={1,6}(.*)')
# lang= on bad headers, so allow singleton ='s:
reheader = re.compile(r'(={3,6})\s*(.+?)={2,6}(.*)')
reiwiki = re.compile(r'\[\[[-a-z]{2,11}:(.*)\]\]')
recat = re.compile(r'\[\[category:.*?\]\]', re.I)
retrans1 = re.compile(r'\* \[\[w:.+\|([^\]]+?)\]\]\s*:(.*)')
retrans2 = re.compile(r'\* \[\[([^\]]+?)\]\]\s*:(.*)')
retrans3 = re.compile(r'\* ([^:]+?):(.*)')
# the below should hopefully fix a bug that happened with Serbo-Croatian
retrans4 = re.compile(r'\* ([\w\-]+)(.*)') # missing :
retag = re.compile(r'\{\{rfc-auto(\|.*?|)}}')
regender = re.compile(r"''([mfcn])''")
reglossfix = re.compile(r'(.+)\(\d+\)$')
retopgloss = re.compile(r'\{\{top(\|.*?|)}}$')
recontext = re.compile(r"^# *\(''(.+?)''\):? ?(.*)$", re.M)
recontext2 = re.compile(r"^# *''\((.+?)\):?'' ?(.*)$", re.M)
recontext3 = re.compile(r"^# *\{\{italbrac\|([^}]+?)}}:? ?(.*)$", re.M)
repronn = re.compile(r'Pronunciation \d+')
# be careful to match and remove newline in these unless they happen to be at the very end:
rerfclevel = re.compile(r"^\{\{rfc-level\|.*\+.*\}\}\n?", re.M)
rerfcxphrase = re.compile(r"^\{\{rfc-xphrase\|.*\}\}\n?", re.M)
rerfcheader = re.compile(r"^\{\{rfc-header\|.*\}\}\n?", re.M)
rerfcsubst = re.compile(r"^\{\{rfc-subst\}\}\n?", re.M)
rerfcpronn = re.compile(r"^\{\{rfc-pron-n\|.*\}\}\n?", re.M)
# italbracs not on context/defn lines, template italbrac->i replacement separate
# limited forms ... nowilink with pipes, no templates, look for : in mo.g3
# look for gloss, etc, * lines to start ...
reibcomma = re.compile(r"^(\*\s*)\(''([^\)^'^\|^\{]+):?''\)(:?)")
reibcomma2 = re.compile(r"^(\*\s*)''\(([^\)^'^\|^\{]+):?\)''(:?)")
# match "stackable" format characters at start of lines, so we can have one space exactly
restack = re.compile(r"^([:#\*]+)\s*")
# regex table (dict, name = tuple of compiled object and replacement)
Regex['subst:PAGENAME'] = (re.compile(r'\{\{PAGENAME}}'), '{{subst:PAGENAME}}')
Regex['template -cattag +context'] = (re.compile(r'\{\{cattag\|'), '{{context|')
Regex['template -Unicode +unicode'] = (re.compile(r'\{\{Unicode\|'), '{{unicode|')
Regex['template -Wikipedia +wikipedia'] = (re.compile(r'\{\{Wikipedia([\|\}])'), r'{{wikipedia\1')
Regex['template -WP +wikipedia'] = (re.compile(r'\{\{WP([\|\}])'), r'{{wikipedia\1')
Regex['template -Acronym +acronym'] = (re.compile(r'\{\{Acronym([\|\}])'), r'{{acronym\1')
Regex['template -Initialism +initialism'] = (re.compile(r'\{\{Initialism([\|\}])'), r'{{initialism\1')
Regex['template -Abbreviation +abbreviation'] = (re.compile(r'\{\{Abbreviation([\|\}])'), r'{{abbreviation\1')
Regex['template -AHD +enPR'] = (re.compile(r'\{\{AHD([\|\}])'), r'{{enPR\1')
# translations
#Regex['template -trans-bot +trans-bottom'] = (re.compile(r'\{\{trans-bot\}\}'), '{{trans-bottom}}')
#Regex['template -trans-middle +trans-mid'] = (re.compile(r'\{\{trans-middle\}\}'), '{{trans-mid}}')
Regex['elided Translations to be checked header'] = (re.compile(
r'^={3,6}Translations to be checked={3,6}\n*\{\{checktrans', re.M), '{{checktrans')
Regex['elided Translations to be checked header and comment'] = (re.compile(
r'^={3,6}Translations to be checked={3,6}\n*<!--\s*Remove this section.*\n*\{\{checktrans', re.M),
'{{checktrans')
Regex['checktrans and trans-top to checktrans-top'] = (re.compile(
r'^\{\{checktrans\}\}\n*\{\{trans-top\|\w*lations to be \w*\}\}', re.M), '{{checktrans-top}}')
Regex['checktrans/top/mid/bottom to checktrans-top etc'] = (re.compile(
r'^\{\{checktrans\}\}\n*\{\{top\}\}(.*?)^\{\{mid\}\}(.*?)^\{\{bottom\}\}', re.M|re.S),
r'{{checktrans-top}}\1{{checktrans-mid}}\2{{checktrans-bottom}}')
Regex['template -ttbc-top +checktrans-top'] = (re.compile(r'\{\{ttbc-top\}\}'), '{{checktrans-top}}')
Regex['template -ttbc-mid +checktrans-mid'] = (re.compile(r'\{\{ttbc-mid\}\}'), '{{checktrans-mid}}')
Regex['template -ttbc-bottom +checktrans-bottom'] = (re.compile(r'\{\{ttbc-bottom\}\}'),
'{{checktrans-bottom}}')
Regex['template -trad +t'] = (re.compile(r'\{\{trad\|'), '{{t|')
Regex['template -trad- +t-'] = (re.compile(r'\{\{trad-\|'), '{{t-|')
Regex['un-indent {{also}} template'] = (re.compile(r'^:\{\{also\|', re.M), '{{also|')
# given name, preferred syntax
Regex['xx: to lang=xx in given name template'] = (
re.compile(r'(\{\{given name[^\}]*?\|)\|?([-a-z]{2,10}):\}\}'), r'\1lang=\2}}')
Regex['from language to from=language in given name template'] = (
re.compile(r'(\{\{given name[^\}]*?\|)from ([-a-zA-Z ]+)\|?([\}\|])'), r'\1from=\2\3')
# table format lines, row divs to one "-"
Regex['table |--* to |-'] = (re.compile(r'^\|--+', re.M), r'|-')
# stuff left from preload templates
# careful this first one starts with 3 {'s, check previous character? not for now
Regex['remove template subst detritus'] = (re.compile('\{\{\{[0-9a-z]+\|(.*?)\}\}\}'), r'\1')
Regex['remove template subst detritus #if etc'] = (re.compile('\{\{#\w+:\|\|?\}\}'), r'')
# temp for esbot leftovers:
Regex['remove esbot:catline'] = (re.compile('\{\{esbot:catline.*\{\{ending\}{5,5}'), r'')
# script code replacements, first a dict, then generate the two regex forms for each:
for sc in Scripts:
Regex['script template -'+sc+' +'+Scripts[sc]] = (re.compile(r'\{\{'+sc+r'\|'), '{{'+Scripts[sc]+'|')
Regex['script parameter -sc='+sc+' +sc='+Scripts[sc]] = (
re.compile(r'\|sc='+sc+r'([\}\|])'), '|sc='+Scripts[sc]+r'\1')
# whoa(!)
# see templates
Regex['template -see +also'] = (re.compile(r'\{\{see\|'), r'{{also|')
Regex['template -See +also'] = (re.compile(r'\{\{See\|'), r'{{also|')
Regex['template -see also +also'] = (re.compile(r'\{\{see also\|'), r'{{also|')
# fix Japanese sees, allow a line for kanjitab after header (do not use re.S)
Regex['Japanese see/also in section to ja-see-also'] = \
(re.compile(r'^(==Japanese==\n*.*\n*){\{(see|also)\|', re.M), \
r'\1{{ja-see-also|')
Regex['add language in front of {{t}}'] = (re.compile(r'^\*? *\{\{t(\+|-|)\|([a-z-]+)\|', re.M), \
r'* {{\2}}: {{t\1|\2|')
# (a few more general Regex below)
StarTemp = set([ 'Han ref', 'ja-readings', 'ethnologue', 'websters-online', 'pedialite',
'Hanja ref', 'Linguist List', 'IPA', 'SAMPA', 'enPR', 'ISO 639', 'R:1913' ])
restartemp = re.compile(r'\{\{(.+?)[\|\}]')
# trans lines gender templates regex, ordered list:
Trex = [ ]
# first replace ' cases with templates, look for leading space:
Trex.append((re.compile(r" ''([mfcn])''"), r' {{\1}}'))
Trex.append((re.compile(r" ''(pl|plural)''"), ' {{p}}'))
Trex.append((re.compile(r" ''(sg|sing|singular)''"), ' {{s}}'))
Trex.append((re.compile(r" ''m( and| or|,|/|) ?f''"), ' {{m|f}}'))
# now look for combinations:
Trex.append((re.compile(r"\{\{([mfcn])}},? \{\{([fcn])}},? \{\{([cnps])}}"), r'{{\1|\2|\3}}'))
Trex.append((re.compile(r"\{\{([mfcn])}},? \{\{([fcnps])}}"), r'{{\1|\2}}'))
# hmmm...
Trex.append((re.compile(r"\{\{t([\+\-]?)\|([^\|]*?)\|([^\|]*?)\|mf}}"), r'{{t\1|\2|\3|m|f}}'))
# match trans sections
retransect = re.compile(r"^\{\{trans-top.*?^\{\{trans-bottom\}\}\n", re.M|re.S)
# Pronunciate
# like Regex, but applied line by line only in pronunciation sections
# use ^ and $ as needed with re.M for prescreen
Prex['template enPR/IPA/SAMPA'] = \
(re.compile(r'^\*? ?([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
r'* {{enPR|\1}}, {{IPA|/\2/}}, {{SAMPA|/\3/}}')
Prex['template enPR/IPA/SAMPA (RP, UK, US)'] = \
(re.compile(r"^\*? ?\(''(RP|UK|US)''\):? *"
r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
r'* {{a|\1}} {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')
Prex['template enPR/IPA/SAMPA with {a}'] = \
(re.compile(r"^\*? ?(\{\{a\|[^\}]+\}\}):? *"
r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
r'* \1 {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')
Prex['+rhymes template'] = (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:English:-(?P<s>.+?)\|-(?P=s)\]\]"),
r'{{rhymes|\1}}')
# w/O "Rhymes:":
Prex['+rhymes template w/Rhymes: in link'] = \
(re.compile("^([\*:]+) *\[\[[Rr]hymes:English:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
r'\1 {{rhymes|\2}}')
Prex['+rhymes template (Finnish)'] = (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|-(?P=s)\]\]"),
r'{{rhymes|\1|lang=fi}}')
Prex['+rhymes template w/Rhymes: in link (Finnish)'] = \
(re.compile("^([\*:]+) *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
r'\1 {{rhymes|\2|lang=fi}}')
Prex['+rhymes template w/Rhymes: in link (French)'] = \
(re.compile("^([\*:]+) *\[\[[Rr]hymes:French:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
r'\1 {{rhymes|\2|lang=fr}}')
Prex['+rhymes template (Icelandic)'] = \
(re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Icelandic:-(?P<s>.+?)\|-(?P=s)\]\]"),
r'{{rhymes|\1|lang=is}}')
Prex['template -Rhymes +rhymes'] = (re.compile(r'\{\{Rhymes([\|\}])'), r'{{rhymes\1')
# multiple rhymes (assume language matches! ;-)
Prex['add additional rhyme to template'] = \
(re.compile(r'(\{\{rhymes\|[^\}]+)\}\} *(,|or|) *\[\[[Rr]hymes:[A-Za-z -]+:-(?P<s>.+?)\| ?-(?P=s)\]\]'),
r'\1|\3}}')
Prex["rm /'s from enPR template"] = (re.compile(r'\{\{enPR\|/([^ /\[\]\{\}]+?)/\}\}'), r'{{enPR|\1}}')
# RP, UK, and US in a wide variety of cases
Prex['(RP) to {{a|RP}}'] = (re.compile(r"^\*? ?[\(\[\{']+RP[\]\)\}:']+", re.M), r'* {{a|RP}}')
Prex['(UK) to {{a|UK}}'] = (re.compile(r"^\*? ?[\(\[\{']+UK[\]\)\}:']+", re.M), r'* {{a|UK}}')
Prex['(US) to {{a|US}}'] = (re.compile(r"^\*? ?[\(\[\{']+US[\]\)\}:']+", re.M), r'* {{a|US}}')
Prex['(italbrac RP) to {{a|RP}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*RP\]*\}\}:?", re.M), r'* {{a|RP}}')
Prex['(italbrac UK) to {{a|UK}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*UK\]*\}\}:?", re.M), r'* {{a|UK}}')
Prex['(italbrac US) to {{a|US}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*US\]*\}\}:?", re.M), r'* {{a|US}}')
Prex['IPA: [[WEAE]] to {{a|WEAE}} IPA:'] = \
(re.compile(r"^\*? ?IPA: [\(\[\{']+WEAE[\]\)\}:']+", re.M), r'* {{a|WEAE}} IPA:')
Prex['(GenAm) to {{a|GenAm}}'] = (re.compile(r"^\*? ?\[\[w:G[^\|]+\|GenAm\]\]", re.M), r'* {{a|GenAM}}')
Prex['(Canada) to {{a|Canada}}'] = (re.compile(r"^\*? ?[\(\[\{']+Canada[\]\)\}:']+", re.M), r'* {{a|Canada}}')
Prex['(Australia) to {{a|Australia}}'] = \
(re.compile(r"^\*? ?[\(\[\{']+Australia[\]\)\}:']+", re.M), r'* {{a|Australia}}')
Prex['(Aus) to {{a|Aus}}'] = (re.compile(r"^\*? ?[\(\[\{']+Aus[\]\)\}:']+", re.M), r'* {{a|Aus}}')
Prex['(GenAm|US) to {{a|GenAm}}'] = \
(re.compile('^' + re.escape("* (''[[General American|US]]'')"), re.M),
r'* {{a|GenAm}}')
Prex['(RecPr|UK) to {{a|RP}}'] = \
(re.compile('^' + re.escape("* (''[[Received Pronunciation|UK]]'')"), re.M),
r'* {{a|RP}}')
# untemplated SAMPA and IPA, several combinations, also for "AHD", allow an {{a}} template in front
Prex['template IPA'] = \
(re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
r"\[*(w:IPA\||)IPA\]*:? *([/\[][^\{\|\}/\]]+?[/\]])$", re.M),
r'* \1{{IPA|\3}}')
Prex['template IPA -IPAchar'] = \
(re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
r"\[*(w:IPA\||)IPA\]*:? *\{\{IPAchar\|([/\[][^\{\|\}/\]]+?[/\]])\}\}$", re.M),
r'* \1{{IPA|\3}}')
Prex['template SAMPA'] = \
(re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
r"\[*(w:SAMPA\||)SAMPA\]*:? *([/\[])(<tt>|)([^\|\}/]+?)(</tt>|)([/\]])$", re.M),
r'* \1{{SAMPA|\3\5\7}}')
Prex['template enPR (was AHD)'] = \
(re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)\[*(w:AHD\||)AHD\]*:? *([^ \{\|\}/]+?)$", re.M),
r'* \1{{enPR|\3}}')
Prex['template X-SAMPA'] = \
(re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
r"\[*(w:X-SAMPA\||)X-SAMPA\]*:? *([/\[])(<tt>|)([^\{\|\}/]+?)(</tt>|)([/\]])$", re.M),
r'* \1{{X-SAMPA|\3\5\7}}')
Prex['or/comma to multiple parameters in IPA template'] = \
(re.compile(r"\{\{IPA\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"), r'{{IPA|\1|\3}}')
Prex['or/comma to multiple parameters in enPR template'] = \
(re.compile(r"\{\{enPR\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"), r'{{enPR|\1|\3}}')
Prex['or/comma to multiple parameters in SAMPA template'] = \
(re.compile(r"\{\{SAMPA\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"), r'{{SAMPA|\1|\3}}')
# accent templates, try to cover the A-cai/Min Nan cases and others, up to 4
Prex['+accent template 1'] = (re.compile(r"^\* \(''"
r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r"''\):?", re.M), r'* {{a|\2}}')
Prex['+accent template 2'] = (re.compile(r"^\* \(''"
r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r"''\):?", re.M), r'* {{a|\2|\4}}')
Prex['+accent template 3'] = (re.compile(r"^\* \(''"
r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r"''\):?", re.M), r'* {{a|\2|\4|\6}}')
Prex['+accent template 4'] = (re.compile(r"^\* \(''"
r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
r"''\):?", re.M), r'* {{a|\2|\4|\6|\8}}')
# hyphenation ...
Prex['+hyphenation template'] = (re.compile(r"'*Hyphenation:?'*:? *([^ \{\}]+)$"), r'{{hyphenation|\1}}')
Prex['middot to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u00B7' + '(.+?\}\})'),
r'\1|\2')
Prex['hyphpt to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u2027' + '(.+?\}\})'),
r'\1|\2')
Prex['middot (HTML) to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)·(.+?\}\})'),
r'\1|\2')
# "blank" IPA/SAMPA/AHD, include new-line, so put these in general regex
Regex['replaced IPA // with {{rfp}}'] = (re.compile(r'^\* \[\[IPA\]\]:? *//\n', re.M), '{{rfp}}\n')
Regex['removed SAMPA //'] = (re.compile(r'^\* \[\[SAMPA\]\]:? *//\n', re.M), '')
Regex['removed AHD //'] = (re.compile(r'^\* \[\[AHD\]\]:? *//\n', re.M), '')
# IPA template fix to add lang=, capture all but }} without =
reIPAlang = re.compile(r'(\{\{IPA\|[^}=]+)\}\}')
# combine to single lines, lines are canonical
repronsing3 = re.compile(r"^\* \{\{enPR\|(.*?)\}\}\n\* \{\{IPA\|(.*?)\}\}\n\* \{\{SAMPA\|(.*?)\}\}", re.M)
repronsing3a = re.compile(r"^\* \{\{IPA\|(.*?)\}\}\n\* \{\{SAMPA\|(.*?)\}\}\n\* \{\{enPR\|(.*?)\}\}", re.M)
repronsing2 = re.compile(r"^\* \{\{IPA\|(.*?)\}\}\n\* \{\{SAMPA\|(.*?)\}\}", re.M)
Level = { }
L43 = { }
POS = { }
EOS = [ 'See also', 'References', 'External links', 'Anagrams', 'Dictionary notes', 'Trivia', 'Statistics']
TOS = [ 'Pronunciation', 'Alternative spellings', 'Alternative forms', 'Production' ]
HAN = ['Han character', 'Kanji', 'Hanzi', 'Hanza']
HT = ( '{{abbreviation', '{{initialism', '{{acronym', '{{numeral' )
NS = { }
Hfix = { }
reheadtab = re.compile(r'\| (.*?)\|\|\s*([1-5/]*)\s*\|\|(.*?)\|\|(.*?)\|\|(.*)')
i = 0
for line in headtab.splitlines():
mo = reheadtab.match(line)
if mo:
header = mo.group(1).strip()
if mo.group(2).strip() == '4/3':
L43[header] = True
Level[header] = 4
print "header %s is 4/3" % header
else: Level[header] = int(mo.group(2))
if mo.group(3).strip() == 'NS': ns = NS[header] = True
else: ns = False
if mo.group(4).strip() == 'POS': POS[header] = True
for variant in mo.group(5).split(','):
variant = variant.lower().strip()
if not variant: continue
Hfix[variant] = header
"""
if not ns:
if variant.endswith('s'): Hfix[variant[-1]] = header
else: Hfix[variant + 's'] = header
"""
Hfix[header.lower()] = header
if not ns:
if header.endswith('s'): Hfix[header.lower()[-1]] = header
else: Hfix[header.lower() + 's'] = header
i += 1
print "found %d headers" % i
# lots of possible ety sects, 1 to 24
for i in range(1, 25):
Hfix['etymology %d'%i] = 'Etymology %d'%i
Level['Etymology %d'%i] = 3
Contexts = { }
rectxtab = re.compile(r"\|\s*''(.*?)''\s*\|\|(.*)")
i = 0
for line in ctxtab.splitlines():
mo = rectxtab.match(line)
if mo:
m1 = mo.group(1).strip()
m2 = mo.group(2).strip()
if not m1 or not m2: continue
# only use first, table at top over-rides auto, templates over-ride redirects
if m1 not in Contexts: Contexts[m1] = m2
i += 1
print "found %d context templates" % i
# turn on/off for now
contextp = True
entries = 0
fixed = 0
# (specific stats)
# Set up set of all headers that are valid (at L3 or higher)
for header in Level:
AH.add(header)
# Sigh. True means prohibited from changing 4/3 levels
Connel = True
for page in rcpages(site):
naptime += 3
days = (time.time() - 1199145600) / 86400 # days since 1 Jan 08
if random() < days/370: Connel = False # some of the time, as they need to be checked
else: Connel = True
title = page.title()
print "page %s" % safe(title)
if ':' in title:
print "not in main namespace"
continue
if title.lower() == 'main page':
print "skip Main page ..."
continue
entries += 1
try:
text = page.get()
origtext = text
except wikipedia.NoPage:
print "Can't get %s from en.wikt" % safe(title)
text = ''
continue
except wikipedia.IsRedirectPage:
print "Redirect page %s" % safe(title)
text = ''
continue
except wikipedia.LockedPage:
print "Locked/protected page %s" % safe(title)
text = ''
continue
acts = set()
mo = retag.search(text)
if mo:
if mo.group(1).strip(' |'):
acts.add('rm tag:' + mo.group(1).strip(' |'))
else:
acts.add('rm tag')
text = retag.sub('', text)
# rfc level trickery
newtext = rerfclevel.sub('', text)
if newtext != text:
print 'took out rfc-level'
acts.add('rm rfc-level tag')
text = newtext
# same for xphrase
newtext = rerfcxphrase.sub('', text)
if newtext != text:
print 'took out rfc-xphrase'
acts.add('rm rfc-xphrase tag')
text = newtext
# same for header
newtext = rerfcheader.sub('', text)
if newtext != text:
print 'took out rfc-header'
acts.add('rm rfc-header tag')
text = newtext
# same for subst
newtext = rerfcsubst.sub('', text)
if newtext != text:
print 'took out rfc-subst'
acts.add('rm rfc-subst tag')
text = newtext
# same for pron-n
newtext = rerfcpronn.sub('', text)
if newtext != text:
print 'took out rfc-pron-n'
acts.add('rm rfc-pron-n tag')
text = newtext
if '{{rfc' in text: rfc = True
#elif '{{rfc|' in text: rfc = True
#elif '{{rfc-' in text: rfc = True
else: rfc = False
rfcact = ''
# overall regex, using table
for rx in Regex:
newtext = Regex[rx][0].sub(Regex[rx][1], text)
if newtext != text:
acts.add(rx)
text = newtext
# report multiple blank lines (force save), will be taken out by parsing
if '\n\n\n\n' in text:
# 3 or more, not just 2
acts.add("remove multiple blank lines")
# categories found in the entry or implied by context and perhaps inflection templates
catseen = set()
# now parse. take the entry apart into languages (ha!)
curr = '*prolog'
last = ''
Lsect = { '*prolog':[ ], '*iwiki':[ ] }
Lcats = { }
waslinked = [ ]
divs = 0
header = ''
for line in text.splitlines():
# canonical headers first. some later code is redundant, but so what? it does "rest"
if line and line.startswith('='):
mo = rehead1.match(line)
if not mo: mo = rehead2.match(line)
if not mo: mo = rehead3.match(line)
if not mo: mo = rehead4.match(line)
# must match 4 or else what?! (all eq = is the answer to this question!)
if not mo:
mo = realleq.match(line)
if mo: acts.add("remove line of only ='s")
else: acts.add('remove bogus = line')
continue
oline = line
level = len(mo.group(1))
if not mo.group(2).strip():
acts.add('removed nil header') # !!!
line = ''
else: line = '='*level + mo.group(2).strip() + '='*level + mo.group(3)
if line != oline: acts.add('format headers')
# L2 headers
mo = reL2head.match(line)
if mo:
header = mo.group(1).strip()
hf = reunlink.sub(r'\1', header)
if hf != header:
if '|' in hf: hf = hf.split('|')[1]
if hf not in Level: acts.add('unlink language header ' + hf)
header = hf
# validate language [needs to be fixed for case before first lang section!]
if header.capitalize() in Level:
"""
if not rfc:
text = '{{rfc-level|' + header + ' as level 2 header}}\n' + text
rfcact = 'add rfc-level tag for L1/2 header ' + header
rfc = True
else:
print "(no edit, bad L2 header and rfc)"
rfcact = 'bad L1/2 header ' + header
"""
# try fixing, move to min level for this header:
level = Level[header.capitalize()]
acts.add('L1/2 header ' + header + ' to L' + str(level))
# header + anything else, will get moved later
Lsect[curr].append('='*level + header + '='*level + mo.group(2))
continue # with current language section
# subst code template
if header.startswith('{{'):
if header[2:-2] in Lcodes:
hf = Lcodes[header[2:-2]]
acts.add('L2 header -' + header + ' +' + hf)
header = hf
# check sort order
if header and last and lkey(header) < lkey(last):
acts.add(last + '/' + header + ' sorted into order')
last = header
if header not in Lsect:
Lsect[header] = [ ]
Lcats[header] = [ ]
else:
acts.add('merged ' + header + ' sections')
curr = header
if mo.group(2).strip():
acts.add('stuff after L2 header moved')
Lsect[curr].append(mo.group(2).strip())
continue
# look for iwiki
mo = reiwiki.match(line)
if mo and mo.group(1) == title:
Lsect['*iwiki'].append(line)
continue
# wiki format + one space
line = restack.sub(r'\1 ', line)
# trailing spaces
if len(line) > 2 and line.startswith('=') and line.endswith(' '): acts.add('rm spaces after header')
line = line.rstrip()
# take out dividers
if line.startswith('----'):
if line == '----': divs += 1
continue
# other lines
Lsect[curr].append(line)
# any language sections?
if len(Lsect) == 2:
# no, tag if not tagged
if ( 'nolanguage/box' not in text and '{{wikify' not in text and
'{{delete' not in text and '{{only in' not in text ):
text = '{{subst:nolanguage}}\n' + text
rfcact = 'tagged nolanguage'
rfc = True
else:
print "(no edit, tagged nolanguage, wikify or delete)"
continue # next entry
#ATTN
if '{{list' in text:
print "(wafflebread debug: skipped)"
continue
# each section
for lang in Lsect:
if lang.startswith('*'): continue
if lang in Ltocode: lcode = Ltocode[lang]
else: lcode = ''
# find Etymologies first
etys = [ ]
etycount = 0
fh = True
for i, line in enumerate(Lsect[lang]):
# look for ety headers, and Pronunciation first at L4
mo = reheader.match(line)
if mo:
level = len(mo.group(1))
header = mo.group(2).strip()
# rest = mo.group(3)
# special case pronunciation, occurs with some frequency
if fh and level != 3 and fuzzy(header.lower(), 'pronunciation', 11) >= 11 and len(header) < 15:
acts.add('Pronunciation changed to level 3')
Lsect[lang][i] = '===' + header + '==='
# and leave fh set:
continue
# just do fuzzy!
if fuzzy(header.lower(), 'etymology', 7) >= 7 and len(header) < 20:
if level != 3:
if fh:
# first header, okay to fix!
acts.add('Etymology changed to level 3')
# and leave fh set:
etycount += 1
etys.append(i)
continue
elif not rfc:
Lsect[lang][i] = line + '{{rfc-level|Etymology not at level 3|lang=%s}}'%lcode
acts.add('+{{rfc-level|Etymology not at level 3}}')
rfc = True
continue
else:
print "(ety not at L3 and already rfc)"
continue
etycount += 1
etys.append(i)
fh = False
# then fix/rewrite the ety headers, use sub to handle rest, report any changes (spacing an issue):
if etycount:
for i in range(etycount):
line = Lsect[lang][etys[i]]
# print 'ety check replace ' + line
if etycount > 1: newline = reheader.sub(r'===Etymology %d===\3' % (i+1), line)
else: newline = reheader.sub(r'===Etymology===\3', line)
if newline.strip('= ') != line.strip('= '):
acts.add('header -' + line.strip('= ') + ' +' + newline.strip('= '))
Lsect[lang][etys[i]] = newline
# sigh, think that's it? Sweet, if true...
# general format
newlines = [ ]
inPos = inTrans = inPro = inext = defnext = False
npos = 0
ety = nety = 0
levelact = ''
rfctag = ''
header = ''
for line in Lsect[lang]:
# minor spacing on stackable wiktext ...
# already done line = restack.sub(r'\1 ', line)
# move cats, may be something else on the line too, or multicats ...
# first we need a cat-present predicate
catp = False
for cat in recat.findall(line):
ocat = cat
catp = True
catname = cat[11:-2].split('|')[0]
catname = re.sub('_', ' ', catname).strip()
cf = cat.find('|')
if cf > 0: cat = '[[Category:' + catname + cat[cf:]
else: cat = '[[Category:' + catname + ']]'
# we have a canonical cat! is it a novel cat?
if cat in catseen:
acts.add('rm dup cat [[:' + cat[2:])
continue
catseen.add(cat)
# rm bad cats from substs left around, see how this works
if '{{{' in cat:
acts.add('rm bad cat [[:' + cat[2:])
continue
if cat != ocat: acts.add('canonical cats')
# see if it belongs in a different sect
catmove = False
if ':' in catname:
catcode = catname.split(':')[0]
if catcode in Lcodes:
catlang = Lcodes[catcode]
if catlang != lang and catlang in Lcats:
acts.add('category ' + catname + ' moved to ' + catlang + ' section')
Lcats[catlang].append(cat)
catmove = True
elif not catname.lstrip(' 01').startswith(lang) and not catname.endswith('derivations') and not catname.endswith('fiction') and not catname.endswith('mythology'):
for other in Lcats:
if other == lang: continue
if catname.lstrip(' 01').startswith(other+' '):
acts.add('category ' + catname + ' moved to ' + other + ' section')
Lcats[other].append(cat)
catmove = True
break
# not moved
if not catmove: Lcats[lang].append(cat)
if catp:
line = recat.sub('', line).strip()
if not line: continue
# headers
mo = reheader.match(line)
if mo:
# hit header with no infl/defn line in previous section?
if inext:
acts.add('added inflection line for %s/%s' % (lang, header))
newlines.append(infline(title, lcode, header))
newlines.append('')
inext = False
defnext = True
if defnext and header not in HAN:
newlines.append('# {{defn|%s}}' % lang)
acts.add('no definition line for %s/%s added {defn}' % (lang, header))
level = len(mo.group(1))
header = mo.group(2).strip()
rest = mo.group(3)
# unlink header
hf = reunlink.sub(r'\1', header)
if hf != header:
if hf.find('|') > 0: hf = hf.split('|')[1]
acts.add('header -' + header + ' +' + hf)
header = hf
# fix header
if header.lower() in Hfix:
hf = Hfix[header.lower()]
if hf != header:
acts.add('header -' + header + ' +' + hf)
header = hf
# try a fuzzy!
if header.lower() not in Hfix and not header.startswith('{{'):
high = 0
replac = ''
hf = header.strip('[]{}').lower()
for val in sorted(Hfix):
# first character must match
if hf[0] != val[0]: continue
rawsc = fuzzy(hf, val, len(val) - 4)
print safe('fuzzy "%s" "%s" score %d' % (hf, val, rawsc))
if rawsc > high and rawsc > max(max(len(hf), len(val)) - 3, 5):
high = rawsc
replac = val
print safe('fuzzy for %s: %s score %d' % (hf, replac, high))
if high:
hf = Hfix[replac]
acts.add('header -' + header + ' +' + hf)
header = hf
# tag Transitive and Intransitive verb, and Reflexive
if header.lower() in ('transitive verb', 'intransitive verb', 'reflexive verb') and not rfc:
rfctag = '{{rfc-trverb|' + header + '}}'
rfc = True
# print "trans/intrans header: %s" % safe(header)
# tag X phrase
if header.endswith(' phrase') and not rfc and not header.lower() in ('prepositional phrase'):
rfctag = '{{rfc-xphrase|' + header + '}}'
rfc = True
# print "X phrase header: %s" % safe(header)
# tag Pronunciation N headers, preventing the level errors later
if repronn.match(header) and not rfc:
# not sure if we need the header in the template, but follows the pattern (with a |)
rfctag = '{{rfc-pron-n|' + header + '}}'
rfc = True
# rfc unrecognized, ignore templates for now, use NS later
if header.lower() not in Hfix and not rfc and not header.startswith('{{'):
rfctag = '{{rfc-header|' + header + '}}'
rfc = True
# print "unknown header: %s" % safe(header)
# min level, set and comp for nested ety
if level == 3 and header.startswith("Etymology") and etycount > 1:
ety = 1
nety += 1
npos = 0
push = False
else:
if ety:
# if we are in the last ety sect, and see end of section things at L3:
if level < 4 and nety == etycount and header in EOS: inPos = ety = 0
# and ... independent of connel flag, because we always push ;-)
if level < 4 and nety == etycount and header in L43: inPos = ety = 0
# push POS (or level 3?) sections down in ety, push flag because of Connel fix
# may be a good idea anyway ... yes, but if we rfc, stop
if ety and not rfc:
if (header in POS and header not in HAN or header in TOS) and level == 3:
level = 4
acts.add('header in ety sect ' + header + ' to L' + str(level))
if header == 'Pronunciation':
rfctag = '{{rfc-level|check placement of Pronunciation}}'
push = True
elif header in POS and header not in HAN or header in TOS:
# at correct level! (or too deep already)
push = False
elif push and header in Level and (level == 4 or level < Level[header] + ety):
level += 1
acts.add('header in ety sect ' + header + ' to L' + str(level))
elif level < 4: push = False
# code to shift header levels (general case in POS), disabled per Connel, 18.4.7
if inPos and header in L43:
if npos < 2 and level < 4 + ety:
if not Connel:
level = 4 + ety
acts.add('header ' + header + ' to L' + str(level))
else: levelact = ' (AutoFormat would have corrected level of ' + header +')'
elif inPos and header in Level:
if level < Level[header] + ety:
if not Connel:
level = Level[header] + ety
acts.add('header ' + header + ' to L' + str(level))
else: levelact = ' (AutoFormat would have corrected level of ' + header +')'
# now tag remaining problems if any, various cases
# should all contain "+" for the re-visit trick ...
if not rfc:
if level == 4 + ety and not inPos and header in POS and header not in NS:
rfctag = '{{rfc-level|' + header + ' at L4+ not in L3 Ety section' + levelact + '}}'
elif level == 4 + ety and not inPos and header in Level and header not in NS:
rfctag = '{{rfc-level|' + header + ' at L4+ not in L3 POS section' + levelact + '}}'
elif level == 3 + ety and header.startswith('Translation'):
rfctag = '{{rfc-level|' + header + ' at L3+' + levelact + '}}'
elif level == 5 + ety and not inTrans and header.startswith('Translations to'):
rfctag = '{{rfc-level|' + header + ' at L5+, not in Translations' + levelact + '}}'
# blank line
newlines.append('')
# header + anything else that wasn't blank
newlines.append('='*level + header + '='*level)
if rest.strip():
if not rest.startswith('{{rfc-'): acts.add('moved stuff after ' + header + ' header')
newlines.append(rest.strip())
# Usage notes can be anywhere (see ELE)
if 'rfc-level|Usage notes' in rfctag: rfctag = ''
# suppress the "AF would have" now, just don't tag:
if "AutoFormat would have" in rfctag: rfctag = ''
if rfctag:
if lcode: rfctag = rfctag[:-2] + '|lang=%s}}'%lcode
acts.add('+' + rfctag)
if 'check placement' not in rfctag: rfc = True
newlines.append(rfctag)
rfctag = ''
# set flags:
inext = defnext = False
if level < 4 + ety and (header in POS or header.startswith(HT)):
inext = inPos = True
npos += 1
elif level < 4 + ety: inPos = False
inTrans = (header == 'Translations')
tt = False
inPro = (header == 'Pronunciation')
continue
# look for inflection line
if inext:
if line.startswith('{{') and not line.startswith('{{wikipedia') or line.startswith("'''") or \
fuzzy(line, title, len(title) - 1) > len(title) - 1:
if line == title:
acts.add('replace unformatted headword')
continue
inext = False
defnext = True
if line and line.startswith('#'):
acts.add('added inflection line for %s/%s' % (lang, header))
newlines.append(infline(title, lcode, header))
defnext = True
inext = False
# and also do next case for defnext
# elide blanks above inflection line
if not line: continue
# look for definition lines
if defnext and line.startswith('#'):
newlines.append('')
defnext = False
# # used where it shouldn't be
if line.startswith('#') and header not in POS:
if header in TOS or header in EOS or (header in Level and Level[header] == 4):
line = '*' + line[1:]
acts.add("-# +* in %s section" % header)
# serious stuff ...
if line.startswith('# '):
# look for context tag
if lang in Ltocode:
ctxn = 1
mo = recontext.match(line)
if not mo:
ctxn = 2
mo = recontext2.match(line)
if not mo:
ctxn = 3
mo = recontext3.match(line)
if mo:
print "match context tag %s" % safe(mo.group(1))
tname = cpar(mo.group(1), Contexts)
if mo and tname:
if lang != 'English': tname += '|lang=' + Ltocode[lang]
if contextp and ctxn == 1:
acts.add("-(''" + mo.group(1) + "'') +{{" + tname + "}}")
line = recontext.sub(r'# {{' + tname + r'}} \2', line)
elif contextp and ctxn == 2:
acts.add("-''(" + mo.group(1) + ")'' +{{" + tname + "}}")
line = recontext2.sub(r'# {{' + tname + r'}} \2', line)
elif contextp and ctxn == 3:
acts.add("-{{italbrac|" + mo.group(1) + "}} +{{" + tname + "}}")
line = recontext3.sub(r'# {{' + tname + r'}} \2', line)
else: print "would have replaced %s with %s" % (safe(mo.group(1)), safe(tname))
# elide cats that correspond
for catname in tname.split('|'):
if catname == 'context' or catname.startswith('lang='): continue
catname = catname[0].upper() + catname[1:]
# code is prefix ...
if lang != 'English': catname = Ltocode[lang] + ':' + catname
if contextp:
catseen.add('[[Category:' + catname + ']]')
# catseen.add('[[Category:' + catname + 's]]')
print "added catseen %s" % safe(catname)
# wikilinking?
"""
# (remember to correct for spacing)
elif not line.startswith('#') and not inTrans and "''" in line:
# look for italbrac cases not on defn lines
newl = reibcomma.sub(ibsub, line)
newl = reibcomma2.sub(ibsub, newl)
if newl != line:
# acts.add('-' + line + ' +' + newl)
# acts.add('template i')
# in pronunciation, use a, anywhere else, we want i-c if at start of * line
if inPro:
newl = re.sub(r'\{\{(i|i-c)\|', '{{a|', newl)
else:
newl = re.sub(r'\{\{i\|', '{{i-c|', newl)
acts.add(sdif(line, newl))
line = newl
# think that will work?
"""
# translations lines
# stopgap check: (should be improved, tsort knows haow to handle this)
if '{{ttbc|' in line: inTrans = False
if inTrans:
# special indent rule, we know there is a previous line
if line.startswith(': ') and newlines[-1:][0].startswith('*'):
acts.add('-: +*: in trans')
line = '*' + line
# similar rule for :*, we leave ** alone (is correct for grouped language)
# may have intended **, but this is better than leaving it :*
if line.startswith(':* ') and newlines[-1:][0].startswith('*'):
acts.add('-:* +*: in trans')
line = '*:' + line[2:]
was = False
mo = retrans1.match(line)
if not mo: mo = retrans2.match(line)
if mo: was = True
if not mo: mo = retrans3.match(line)
if not mo:
mo = retrans4.match(line)
if mo: # missing ':'
tlang = mo.group(1).strip()
acts.add("added : after %s in translations" % tlang)
if mo:
tlang = mo.group(1).strip()
if was and tlang.find('|') > 0: tlang = tlang.split('|')[1]
trest = mo.group(2).strip()
if tlang.startswith('{{') and tlang[2:-2] in Lcodes:
acts.add('subst %s in trans' % tlang)
tlang = Lcodes[tlang[2:-2]]
was = False
if was:
acts.add('trans unlink ' + tlang)
# conform gender specification templates
# tr = regender.sub(r'{{\1}}', trest)
tr = trest
for rx in Trex:
tr = rx[0].sub(rx[1], tr)
if tr != trest:
#acts.add('gender -' + trest + ' +' + tr)
acts.add('gender ' + sdif(trest, tr))
trest = tr
if trest: line = '* ' + tlang + ': ' + trest
else: line = '* ' + tlang + ':'
# convert templates
# has to be a non-blank previous line, we are in trans section
if line == '{{rfc-trans}}': inTrans = False
if line == '{{checktrans}}': inTrans = False
if line == '{{checktrans-top}}': inTrans = False
if line == '{{ttbc-top}}': inTrans = False
mo = retopgloss.match(line)
if mo:
if mo.group(1):
gloss = mo.group(1)[1:]
else:
prev = newlines[-1:][0]
while not prev:
newlines = newlines[:-1]
prev = newlines[-1:][0]
if prev.startswith(';'): gloss = prev[1:]
elif prev.startswith("'''") and prev.endswith("'''"): gloss = prev[3:-3]
else: gloss = ''
if gloss: newlines = newlines[:-1]
if gloss:
gloss = reglossfix.sub(r'\1', gloss).strip()
prev = line
line = '{{trans-top|' + gloss + '}}'
# <- else: line = '{{trans-top}}'
acts.add('-' + prev + ' +' + line)
tt = True
if tt and line == '{{mid}}':
line = '{{trans-mid}}'
if tt and line == '{{bottom}}':
newlines.append('{{trans-bottom}}')
# add blank line
line = ''
tt = False
# end of trans
# templates that should have * outside them
mo = restartemp.match(line)
if mo and mo.group(1) in StarTemp:
line = '* ' + line
acts.add('* before ' + mo.group(1))
# pronunciation specific
if inPro:
refire = True
while refire:
refire = False
for rx in Prex:
if "enPR" in rx and lcode != "en": continue
line, k = Prex[rx][0].subn(Prex[rx][1], line)
if k:
acts.add(rx)
refire = True # fire ruleset again
if 'IPA' in line and lcode and lcode != 'en' and '|lang=' not in line:
line, k = reIPAlang.subn(r'\1|lang=' + lcode + '}}', line)
if k: acts.add('added lang=' + lcode + ' to IPA')
if line == '{{rfp}}' and lcode and lcode != 'en':
line = '{{rfp|lang=' + lcode + '}}'
acts.add('added lang=' + lcode + ' to rfp')
# move {{also}} to prolog, we are in a language section
if line.startswith("{{also|"):
Lsect['*prolog'].append(line)
acts.add("moved {{also}} to prolog")
continue
# all else
newlines.append(line)
# at end with no infl / defn line in previous section?
if inext:
acts.add('added inflection line for %s/%s' % (lang, header))
newlines.append(infline(title, lcode, header))
newlines.append('')
inext = False
defnext = True
if defnext and (header not in HAN or npos == 1):
newlines.append('# {{defn|%s}}' % lang)
acts.add('no definition line for %s/%s added {defn}' % (lang, header))
# done with sect
Lsect[lang] = newlines
# reassemble ...
newtext = ''
prior = False
# sort prolog, and add to newtext
if len(Lsect) > 2:
pcopy = sorted(Lsect['*prolog'], key=prokey) # shallow copy, sorted
if pcopy != Lsect['*prolog']: acts.add('sorted prolog')
else: pcopy = Lsect['*prolog'] # no language sections, leave "prolog" alone
for line in pcopy:
# no blank lines
if line: newtext += line + '\n'
if line.startswith('=') and not rfc:
newtext += '{{rfc-level|header line in prolog, before first L2 header}}\n'
acts.add('tagged header before first L2 header')
del Lsect['*prolog']
blank = True # not really, this is to suppress blank before 1st L2 header
for lang in sorted(Lsect, key=lkey):
if lang == '*iwiki': continue
if prior:
if not blank: newtext += '\n'
newtext += '----\n\n'
divs -= 1
prior = True
if lang not in waslinked: newtext += '==' + lang + '==\n'
else: newtext += '==[[' + lang + ']]==\n'
blank = False
for line in Lsect[lang]:
# no dup blank lines
if line or not blank: newtext += line + '\n'
if line: blank = False
else: blank = True
if Lcats[lang]:
if not blank: newtext += '\n'
# (note lkey is a different function, but does strip brackets, so works ...)
for cat in sorted(Lcats[lang], key=lkey): newtext += cat + '\n'
blank = False
del Lsect[lang]
# residual tag(s):
if ('{{{' in newtext and '}}}' in newtext) or '{{#' in newtext:
acts.add('+{{rfc-subst}} syntax tag')
newtext += '{{rfc-subst}}\n\n' # force newline even if at end
blank = True
# add the iwikis
if not blank: newtext += '\n'
for line in Lsect['*iwiki']:
# no blank lines
if line: newtext += line + '\n'
if divs != 0: acts.add("fixed ----'s")
# rfc-level, etc trickery
for rfname in ('level', 'xphrase', 'header', 'subst', 'pron-n'):
if 'rm rfc-' + rfname + ' tag' in acts:
for ac in sorted(acts):
if ac.startswith('+{{rfc-' + rfname):
acts.remove('rm rfc-' + rfname + ' tag')
acts.remove(ac)
print 'elided -' + rfname + ' +' + rfname
break
# sort translations if any, if not tagged already:
if "{{trans-top" in newtext and "{{rfc-tsort" not in newtext:
new2 = retransect.sub(transort, newtext)
if new2 != newtext:
if "{{trans-see" in new2 and "{{trans-see" not in newtext: acts.add("+trans-see template")
if "{{rfc-tsort" not in new2: acts.add("sorted/rebalanced translations")
else: acts.add("tagged translations table problem")
newtext = new2
# do some combining of pron lines, now that we've done the rulesets:
newtext, k = repronsing3.subn(r"* {{enPR|\1}}, {{IPA|\2}}, {{SAMPA|\3}}", newtext)
if k: acts.add("combined enPR, IPA, SAMPA on one line")
# variant order
newtext, k = repronsing3a.subn(r"* {{enPR|\3}}, {{IPA|\1}}, {{SAMPA|\2}}", newtext)
if k: acts.add("combined enPR, IPA, SAMPA on one line")
newtext, k = repronsing2.subn(r"* {{IPA|\1}}, {{SAMPA|\2}}", newtext)
if k: acts.add("combined IPA and SAMPA on one line")
# if page isn't "countable", see if we can add a link in a form-of template
#looks like this isn't quite supported anymore. commented out.
#if '[[' not in newtext:
#for rx in Frex:
# newtext, k = Frex[rx][0].subn(Frex[rx][1], newtext)
# if k:
# acts.add(rx)
# break # only need one
#if '[[' not in newtext:
# print "page still not counted in stats"
#newtext += '{{count page|[[Wiktionary:Page count]]}}'
#acts.add("+{{count page}} for statistics")
# do minor spacing 1% of the time that there is nothing else to do
if not acts and random() < 0.01 and newtext.rstrip(' \n') != text.rstrip(' \n'):
acts.add('minor spacing')
# if we added a major rfc, just do that, dump the rest of the work!!
if rfcact:
acts = set()
acts.add(rfcact)
newtext = text
act = ', '.join(sorted(acts))
# some change, write it (even just rm tag)
if act:
fixed += 1
naptime /= 2
print "format %s: %s" % (safe(title), safe(act))
# try to fix the entry
try:
wikipedia.setAction(act)
page.put(newtext)
except wikipedia.PageNotSaved:
print "failed to save page"
# other action?
except socket.timeout:
print "socket timeout, maybe not saving page"
except socket.error:
print "socket error, maybe not saving page"
except Exception, e:
print "some other error saving page, no retry"
print str(e)
break
# put throttle will do: if not saved: time.sleep(30)
#retries -= 1
# end loop
print "entries fixed %d" % fixed
# done
if __name__ == "__main__":
    # Script entry point: only runs when this file is executed directly,
    # not when it is imported as a module.
    try:
        main()
    finally:
        # Runs whether main() returns normally or raises, so the framework
        # always gets its shutdown call.
        # NOTE(review): presumably stopme() releases this process's
        # pywikipedia throttle registration -- confirm against the framework.
        wikipedia.stopme()