This comes with several very important caveats:
- I am a professional software engineer; this is what I do. However, this code was written for my own use: it is not warranted, and it carries no implication of merchantability or fitness for use.
- Like everything else in the Wiktionary, this is under the GFDL. The GFDL is not compatible with the GPL; this document is not licensed under the GPL as software. (!)
- At any given moment, this code may not represent what is being run; I have no intention of updating this page every time I make a change.
technical notes
I don't have my ego attached to code I write; I routinely dump code that has gotten too complex, and re-write it. On the other hand, even if something is sloppy, if it is tested and works, I leave it alone.
- Some of the comments may be snarky.
- The comments are often (usually) written to remind me of something, not to explicate the code.
- Since I modify this regularly, there is code that is not reached or otherwise redundant.
- The pre-parsing should go deeper; a fairly major restructuring would be helpful at some point soon.
- There are a small number of known (to me ;-) bugs that I handle by monitoring the edits done, having not yet fixed them. (Like handling multi-line comments.)
- The module AF uses is heavily modified from the distro; however the interface is the same. In the presence of network problems/failures/outages AF may abort when the modified version would have recovered. The exceptions thrown are the same, but under differing conditions.
- On Linux, the clock timing works, but will display ugly large values.
- The code to handle headers is largely hacked to implement the "Connel" flag ....
- The code that handles Etymology headers is based on the current WT:ELE; there is no problem changing it when we figure out how Etymology and Pronunciation are supposed to play nicely together in the general case.
- The bot must have a sysop account as well, in order to read patrolled flags in RC; "enhanced" RC mode must be turned off.
- prescreen
Reads the XML dump, uses simple regex to find entries that may need attention, and builds a random index
- rcpages
Generator called by the main routine. Calls prescreen, then cycles through reading Recent Changes, looking at the request category, and yielding pages found.
- main
Reads configuration pages, builds tables to be used. Loops on rcpages generator, for each entry:
- runs regex on the entire text
- breaks entry into language sections, plus prolog (above first section), and iwikis
- in each language section:
- looks for and fixes Etymology headers
- herds cats
- fixes bad headers
- fixes linking in trans tables
- fixes top to trans-top
- subst's (replaces) language code template
- etc
- then reassembles the entry, removing multiple blank lines, adding ---- rules, and so on
- checks the actions performed
- if any resulting action, rewrites the page
# -*- coding: utf-8 -*-
"""
This bot looks for entries tagged for autoformatting, and does a number of tasks.

No command line arguments.
"""
import wikipedia
import catlib
import sys
import re
import pickle
import time
import xmlreader
import socket
from mwapi import getwikitext, getedit
def safe(s):
    """Return a pickle-escaped rendering of *s*, used for printing and as a cache key.

    *s* is pickled with the text protocol and the result is trimmed: the
    leading opcode character and the five trailing characters (newline,
    memo entry ``p0``, newline, stop ``.``) are sliced off, leaving only the
    escaped payload.
    """
    # Protocol 0 is requested explicitly: the [1:-5] slice only fits the
    # text layout, and the interpreter default is not protocol 0 everywhere.
    return pickle.dumps(s, 0)[1:-5]
def lkey(l):
    """Return the sort key for a language name (or stray L2 header) *l*.

    Surrounding square brackets are ignored. The key is the name with a
    one-character rank in front: '0' Translingual and the alphabet headers,
    '1' English, '2' every other language, '3' "see also", '4' "references".
    An empty name is returned unchanged.
    """
    name = l.strip('[]')
    if not name:
        return name

    # exact-case names that sort ahead of everything else
    leading = {'Translingual': '0', 'English': '1'}
    if name in leading:
        return leading[name] + name

    # bad L2 headers, matched without regard to case
    stray = {
        'cyrillic alphabet': '0',
        'arabic alphabet': '0',
        'see also': '3',
        'references': '4',
    }
    folded = name.lower()
    if folded in stray:
        return stray[folded] + name

    # names like !Kung and 'Auhelawa: the non-alpha first character goes to the end of the key
    if not name[0].isalpha():
        name = name[1:] + name[0]
    return '2' + name
# old script template name -> script code that replaces it
Scripts = {
    'ARchar': 'Arab',
    'Cuneiform': 'Xsux',
    'ELchar': 'Grek',
    'FAchar': 'fa-Arab',
    'HEchar': 'Hebr',
    'JAchar': 'Jpan',
    'KMchar': 'Khmr',
    'LOchar': 'Laoo',
    'RUchar': 'Cyrl',
    'THchar': 'Thai',
    'URchar': 'ur-Arab',
    'ZHchar': 'Hani',
    'ZHsim': 'Hans',
    'ZHtra': 'Hant',
}
# pool of prescreened entries waiting to be returned:
# key is a random float, value is a (title, reason) tuple
PSK = { }

from random import random
from math import log as ln

# header names the prescreen accepts without tagging the entry
AH = set()
#newpages = set()

# regex tables, filled in elsewhere: name -> tuple whose first element is a compiled pattern
Regex = { }
Prex = {}

# work cache, record time last looked at entry
# each record is key: lc:word, pickled with safe(), value is integer time()
import shelve
# the assignment was garbled to `cache ="af-cache")`; the shelve import above
# and the cache.sync() calls in prescreen show it opens the shelf by name
cache = shelve.open("af-cache")
# prescreen: generator over the XML dump at ../hancheck/en-wikt.xml.
# Each dump entry is tested against the srx patterns built below, the accepted
# header set AH, the Regex and Prex tables, and a few ad hoc checks. Entries that
# are tagged are pooled in PSK under a random float key; on every tenth tag the
# pooled entry with the smallest key is yielded as (title, reason, key), and the
# remainder of the pool is yielded in key order when the dump has been read.
# NOTE(review): this copy of the source has lost its indentation and a few lines
# look truncated; they are flagged below and reproduced unchanged.
def prescreen():
while True: # indef repeat
# cis counts cache writes; the shelve is synced on every 20th write
cis = 0
# get XML dump
dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")
# srx: reason text -> compiled pattern; a hit on any pattern tags the entry
srx = { }
srx['lcode header'] = re.compile(r'^== *\{\{.{2,3}\}\} *==', re.M)
srx['lcode trans'] = re.compile(r'^\* *\{\{.{2,3}\}\} *:', re.M)
srx['top template w/param'] = re.compile(r'\{\{top\|')
srx['top template w/semi gloss'] = re.compile(r'^;.*\n\{\{top\}', re.M)
srx['top template w/qbold gloss'] = re.compile(r"'''\n\{\{top\}")
# srx['gender'] = re.compile(r"^\*.*:.*''[fmcn]''", re.M)
# NOTE(review): the triple-quote on the next line opens a string whose closing
# quote is not visible in this copy; where the disabled block ends is unclear
""" block covered by regex
srx['Wikipedia'] = re.compile(r"\{\{Wikipedia")
srx['Unicode'] = re.compile(r"\{\{Unicode")
srx['Acronym'] = re.compile(r"\{\{Acronym")
srx['Initialism'] = re.compile(r"\{\{Initialism")
srx['Abbreviation'] = re.compile(r"\{\{Abbreviation")
srx['cattag'] = re.compile(r"\{\{cattag")
srx['trad'] = re.compile(r"\{\{trad-?\|")
srx['rest after header'] = re.compile('^=+[^=\n]+=+[^=\n]+$', re.M)
srx['Pronounciation'] = re.compile('Pronounciation')
srx['categorized'] = re.compile('[Cc]ategori[sz]ed')
# srx['etymology with parens'] = re.compile('Etymology ?\(')
# srx['etymology at L4'] = re.compile('^==== ?Etymology', re.M)
# srx['also see'] = re.compile('= ?Also see')
# srx['indented see'] = re.compile(r'^:\{\{see\|', re.M)
# srx['indented Cyrillic'] = re.compile(r'^:Cyrillic', re.M)
# srx['indented Roman'] = re.compile(r'^:Roman', re.M)
srx['Han ref w/o *'] = re.compile(r'^\{\{Han ref\|', re.M)
# srx['AHD'] = re.compile(r'\{\{AHD')
# next really needs something re lang sects, try it for now, sorta works
# srx['maybe un-numbered ety'] = re.compile(r'^=== ?Etymology ?===.*Etymology', re.M|re.S)
# srx['PAGENAME'] = re.compile('\{PAGENAME')
srx['-----'] = re.compile('-----')
# header case problems ...
srx['lc header'] = re.compile(r'^={1,6} ?[a-z][-a-zA-Z ]*=+$', re.M)
srx['non sc header'] = re.compile(r'^={3,6} ?[A-Z][-a-z ]*[A-Z][-a-zA-Z ]*=+$', re.M)
# contexts
srx['context tag'] = re.compile(r"^# *\(''.+?''\)", re.M)
srx['context tag 2'] = re.compile(r"^# *''\(.+?\)''", re.M)
srx['context italbrac'] = re.compile(r"^# *\{\{italbrac", re.M)
# hunt down trans fixes, several cases ... ;-) regex will do
# srx['translations to be checked'] = re.compile(r'ranslations to be')
# re-work rfc level tags (maybe? try 25.2.8?)
# srx['rfc level'] = re.compile(r'^\{\{rfc-level.*\+',re.M)
# X phrase is pretty much gone anyway
# srx['X phrase'] = re.compile(r'^={3,5} *[-a-zA-Z ]* phrase *=+$', re.M)
# and so on
# srx['gender pls'] = re.compile(r'\{\{[mfnc]\.?pl\.?}}')
# srx['gender comb'] = re.compile(r'\{\{[mfnc]}} \{\{[ps]}}')
# srx['gender f/m.'] = re.compile(r'\{\{[mf]\.}}')
# srx['gender pl.'] = re.compile(r'\{\{pl\.}}')
# scripts: covered by regex
# for s in Scripts:
# srx[s + ' script template'] = re.compile(r'[\{=]' + s + r'[\{\|]')
# srx['given name lang'] = re.compile(r':}}') # !? probably enough / now automatic
# canonical cats, headers, did all these 19.8.8, put these back when we get a new dump
srx['canonical cat'] = re.compile(r'category:')
srx['canonical cat space'] = re.compile(r'ory: ')
srx['canonical head'] = re.compile(r'== ')
srx['multiple blanks'] = re.compile(r'\n\n\n\n') # 3 or more, not just 2
srx['template detritus'] = re.compile(r'\{\{\{') # should never be in an entry
srx['IPA star'] = re.compile(r'^\{\{IPA\|', re.M)
# reah captures the text of level 3-6 headers; rel2 and rehr count L2 headers and ---- rules
reah = re.compile(r'^={3,6} *([-a-zA-Z ]+) *=+$', re.M)
rel2 = re.compile(r'^==[^=]', re.M)
rehr = re.compile(r'^----', re.M)
# counts: reason -> number of entries tagged for it, printed at end of dump
counts = { }
entries = 0
tags = 0
tran = 0 # tagged at random
piscine = set()
# skip a few others besides the level 3-6 headers
AH.add('Min Nan')
for entry in dump.parse():
text = entry.text
title = entry.title
# titles containing ':' and texts starting with '#' are not screened
if ':' in title: continue
if text and text.startswith('#'): continue
entries += 1
if entries % 1000 == 0: print "prescreen: %d entries, %d tagged" % (entries, tags)
ckey = safe(title) # must be string for bsd dbm
if ckey in cache:
last = cache[ckey]
# NOTE(review): the body of the next 'if' is not visible in this copy;
# presumably the entry is skipped when it was handled within 35 days - confirm
if last > time.time() - (35 * 24 * 3600):
# print "prescreen: %s (35 day cache)" % safe(title)
# screen entries:
tag = False
for reason in srx:
if srx[reason].search(text):
tag = True
if not tag and '{{rfc' not in text:
for mo in reah.finditer(text):
# NOTE(review): right-hand side missing in this copy; presumably the header
# text captured by reah - confirm
h =
if h not in AH:
if h not in piscine:
print "prescreen: header %s is tagged" % safe(h)
reason = "unknown header"
tag = True
if not tag and '=Pronunciation ' in text and '{{rfc-pron-n' not in text:
reason = 'Pronunciation n header'
tag = True
# entries with no wikilink at all are tagged about one time in 17
if not tag and '[[' not in text and random() * 17.0 < 1.0:
reason = 'no [[ in text'
tag = True
# some exceptions, various that may not be usefully fixed:
if tag and reason == 'translations to be checked' and '=Translations=' not in text:
# some cases we don't want ... should be fixed in main routine
tag = False
# overall regex prescreen, using table for format
if not tag:
for rx in Regex:
# skip a trans case, see above
if rx.startswith('elided Translations to be checked') and '=Translations=' not in text: continue
if Regex[rx][0].search(text):
reason = 'regex ' + rx
# but slow this down by canonical factor, too many for now
if '+also' in reason and random() * 17.0 > 1.0: continue
tag = True
# Pron section regex, may cause some false hits in other sections
if not tag:
for rx in Prex:
if Prex[rx][0].search(text):
reason = 'pron regex ' + rx
tag = True
# look for L2 header and horiz rules mismatch
# L2 header match is not perfect (requires canonical form), but that just means we will
# be looking at entries that may need looking at
if not tag and len(rel2.findall(text)) - 1 != len(rehr.findall(text)) and '{{only in' not in text:
reason = 'horiz rules'
tag = True
# tag at random, limiting to something proportionate to other tags
if not tag and tran < (tags/2)+100 and random() < 0.007:
tran += 1
reason = 'at random'
tag = True
if not tag: continue
# the random key r fixes the order in which pooled entries are later returned
r = random()
# collisions don't matter much, but easy to fix (careful about significance! hence *):
while r in PSK:
print "prescreen: collision, r bumped %f to %f" % (r, (r+0.0001)*1.0001)
r = (r+0.0001)*1.0001
# (debug) if 'accent' in reason:
# print "prescreen: %s, %s (%.4f)" % (safe(title), reason, r)
if reason not in counts: counts[reason] = 0
counts[reason] += 1
PSK[r] = (title, reason)
tags += 1
# some (1/10) of the time we return an entry, else just pool
if tags % 10 != 0: continue
# yield best/minimum
m = min(PSK.keys())
title, reason = PSK[m]
del PSK[m]
# print "prescreen return: %s, %s (%.4f)" % (safe(title), reason, m)
ckey = safe(title) # must be string for bsd dbm
cache[ckey] = time.time() # entry has been fixed for now
cis += 1
if cis % 20 == 0: cache.sync()
yield title, reason, m
# end of file:
for r in sorted(counts):
print 'prescreen: count for %s is %d' % (r, counts[r])
# return/yield the rest
for m in sorted(PSK.keys()):
title, reason = PSK[m]
del PSK[m]
# print "prescreen return: %s, %s (%.4f)" % (safe(title), reason, m)
ckey = safe(title) # must be string for bsd dbm
cache[ckey] = time.time() # entry has been fixed for now
cis += 1
if cis % 20 == 0: cache.sync()
yield title, reason, m
def now():
    """Return the current wall-clock time as whole seconds.

    Callers only compare these values with each other and add offsets in
    seconds, so the epoch does not matter. time.time() replaces
    time.clock(), which counts processor time rather than elapsed time on
    Unix and no longer exists in Python 3.8 and later.
    """
    return int(time.time())

# share timer with main
naptime = 0
# rcpages: generator of wikipedia.Page objects.
# Yields a page for every (title, reason, key) produced by prescreen(), pages from
# Category:Requests for autoformat (at most 7 per pass), and pages for titles found
# in a recentchanges API query once their hold time has passed (at most 10 per pass).
# NOTE(review): this copy of the source has lost its indentation and some lines
# appear to be missing; they are flagged below and the code is reproduced unchanged.
def rcpages(site):
# generator which yields recentchanges, but not unpatrolled changes
# also entries in category
# in between, yields pages that satisfy the prescreen in random order
global naptime
# NOTE(review): this rebinds 'site', so the argument passed by the caller is not used
site = wikipedia.getSite("en", "wiktionary")
cat = catlib.Category(site, "Category:Requests for autoformat")
seen = set()
# next times (in now() seconds) at which the category and recent changes are read
nextcat = now() - 1
nextrc = now() - 1
# hold: title -> now() value after which the title may be yielded
hold = { }
rcex = re.compile(r'title="(.+?)"')
for title, reason, m in prescreen():
print '(%d, from prescreen %s, %.4f)' % (now(), reason, m)
page = wikipedia.Page(site, title)
yield page
# nf counts pages/titles found this pass, nd counts held titles released this pass
nf = 0
nd = 0
# get our category, every 10-15 minutes or so
if now() > nextcat:
cat.catlist(purge = True)
for page in cat.articles():
nf += 1
if nf > 7: break # just munch the cat, not too hungry ;-)
# if len(hold) > 100 and nf > 1: break # try to keep up, cat can wait? needed?
print '(%d)' % now()
if page.title() in hold: del hold[page.title()]
yield page
nextcat = now() + 740
# recent changes
if now() > nextrc:
print '(%d, reading recent changes)' % now()
# NOTE(review): the 'except' further down has no visible 'try:' in this copy
rct = site.getUrl("/w/api.php?action=query&list=recentchanges&format=xml&rcprop=title" +
"&rclimit=5000&rcshow=patrolled|!bot&rctype=edit|new&rcnamespace=0", sysop = True)
except wikipedia.NoPage:
print "Can't get recentchanges from en.wikt!"
rct = ''
# a reply without the closing tag is discarded
if '</recentchanges>' not in rct:
print "some bad return from recentchanges, end tag not found"
rct = ''
nextrc = now() + 1400
# ht: hold delay in seconds for the next new title, starts at 480 and grows by 34 per title
ht = 480
for title in rcex.findall(rct):
if ':' in title: continue # other stray stuff in NS:0
if title not in seen:
hold[title] = now() + ht
# scatter out into future ... (numbers fairly arbitrary, but work well)
ht += 34
if ht > 21 * 3600: ht /= 7 # ? if more than most of a day
nf += 1
print "found: [%s] hold until %d" % (safe(title), hold[title])
pastime = now()
for title in sorted(hold):
# 10 on a pass is enough
if nd > 9: break
if hold[title] > pastime: continue
print '(%d, rc held to %d)' % (now(), hold[title])
del hold[title]
nd += 1
page = wikipedia.Page(site, title)
yield page
# nothing found and nothing released: idle for up to 340 seconds
# NOTE(review): no sleep call is visible after the message below in this copy
if not nd and not nf and naptime > 5:
naptime = min(naptime, 340) # max to keep timers running
print "(%d, sleeping %d)" % (now(), naptime)
# also rely on put throttle
print '(%d, %d held)' % (now(), len(hold))
# recursive fuzzy string comparison
def fuzzy(a, b, r):
    """Return a match score for strings *a* and *b*.

    *r* is the minimum score required; a string shorter than *r* scores 0.
    Recursive calls may pass a negative *r*, and the value returned may be
    lower than *r*. Equal strings score their length.
    """
    if not a or not b:
        return 0
    if len(a) < r or len(b) < r:
        return 0
    if a == b:
        return len(a)

    # matching first or last character: count it and compare what is left
    if a[0] == b[0]:
        return 1 + fuzzy(a[1:], b[1:], r - 1)
    if a[-1] == b[-1]:
        return 1 + fuzzy(a[:-1], b[:-1], r - 1)

    # look for the first character of each string further along in the other
    at = a.find(b[0])
    score_a = 1 + fuzzy(a[at + 1:], b[1:], r - 1) if at >= 0 else 0
    bt = b.find(a[0])
    score_b = 1 + fuzzy(b[bt + 1:], a[1:], r - 1) if bt >= 0 else 0

    # where a forward match failed, also try skipping a character in one or both strings
    if score_a and score_b:
        skipped = 0
    elif score_a:
        skipped = fuzzy(a[1:], b, r)
    elif score_b:
        skipped = fuzzy(a, b[1:], r)
    else:
        skipped = fuzzy(a[1:], b[1:], r)
    return max(skipped, score_a, score_b)
def infline(title, lang, header):
    """Build a default {{infl}} inflection line for an entry.

    title:  entry title; its first character selects a script where one is recognized
    lang:   language code placed in the template
    header: part-of-speech header, plain text or a {{template|...}} form

    Returns the wikitext for the line, with an attention template or
    category appended for English, Han-script, Japanese, Korean and
    Chinese entries.
    """
    pos = header.lower()
    if pos.startswith('{{'):
        # templated header: keep the template name only
        pos = pos[2:-2].split('|')[0]

    if lang == 'en' and pos in ('verb', 'noun', 'adjective', 'adverb'):
        return "{{infl|en|" + pos + "}}[[Category:English " + pos + "s that lack inflection template]]"

    plain = "{{infl|%s|%s}}" % (lang, pos)
    first = ord(title[0:1])

    # Arabic:
    if 0x0600 <= first < 0x0780:
        return "{{infl|%s|%s|sc=Arab}}" % (lang, pos)

    # Han:
    # this is planes 1-2, needs closer check
    if 0x3400 <= first < 0xA000 or 0xd800 <= first < 0xdc00:
        by_lang = {
            'ko': ('Hant', '{{ko-attention|may need inflection template}}'),
            'ja': ('Jpan', '{{ja-attention|needs inflection template}}'),
            'vi': ('Hant', '{{vi-attention|may need inflection template}}'),
        }
        script, note = by_lang.get(lang, ('Hani', '{{zh-attention|needs inflection template}}'))
        return "{{infl|%s|%s|sc=%s}}%s" % (lang, pos, script, note)

    # other scripts: only the attention tag depends on the language
    if lang == 'ja':
        return plain + "{{ja-attention|needs inflection template}}"
    if lang == 'ko':
        return plain + "{{ko-attention|may need inflection template}}"
    if lang in ('zh', 'cmn', 'yue', 'nan'):
        return plain + "{{zh-attention|may need inflection template}}"
    return plain
# modifier words that may come in front of a context label
MOD = [ 'chiefly', 'coarse', 'especially', 'extremely', 'frequently', 'generally', 'mainly', 'markedly',
        'mildly', 'mostly', 'often', 'particularly', 'primarily', 'sometimes', 'usually', 'very' ]

# strips the brackets from a wikilink, keeping the linked text
reunlink = re.compile(r'\[\[(.*?)\]\]')

# match a simple context, words but no odd punctuation etc
resimctx = re.compile(r'[-\w ]*$')

# leading words that mark free-text qualifiers ("of ...", "by ...")
PRETULIP = ('of ', 'by ')

def cpar(cstr, ctxs):
    """Convert a context string to template name(s).

    cstr: text of a context tag, labels separated by commas, semicolons or pipes
    ctxs: dict mapping lower-case context labels to template names

    Returns the pipe-joined template text, or '' as soon as one label
    cannot be converted.
    """
    tname = ''
    normalized = re.sub(r'[,;\|]+', ',', cstr)
    for label in normalized.split(','):
        label = label.strip(" '")
        if '[' in label:
            label = reunlink.sub(r'\1', label)

        # peel off any number of leading modifiers
        while True:
            lead = label.split(' ')[0].lower()
            if lead not in MOD:
                break
            tname += lead + '|'
            label = label[len(lead):].strip()

        folded = label.lower()
        if folded in ctxs:
            tname += ctxs[folded] + '|'
        elif label.startswith(PRETULIP):
            # free-text qualifier: needs the generic template when nothing precedes it
            if not tname:
                tname = 'context|'
            tname += label + '|'
        elif tname and resimctx.match(label):
            # unknown but simple label, accepted only after something recognized
            tname += label + '|'
        else:
            return ''
    return tname.rstrip('|')
def ibsub(imo):
    """Replacement callback: turn an italic-bracket qualifier into an {{i}} template.

    imo is a regex match object with three groups: the line prefix, the
    qualifier text, and an optional trailing colon.
    NOTE(review): presumably the match comes from the reibcomma patterns
    compiled in main, which capture exactly these three groups - confirm.

    Returns the prefix followed by {{i-c|...}} when the colon is present,
    else by {{i|...}}. Wikilinks in the text are unlinked and commas become
    parameter separators.
    """
    # the three group lookups were garbled to bare `pref =`, `istr =` and
    # `if == ':'`; they are restored in group order
    pref = imo.group(1)  # some prefix captured
    istr = imo.group(2)
    s = reunlink.sub(r'\1', istr)
    # not general enough, bar pipes in match for now in re precomp
    #if s != istr and '|' in s: s = s.split('|')[1]
    s = re.sub(r',\s*', '|', s)
    if imo.group(3) == ':':
        return pref + '{{i-c|' + s + '}}'
    return pref + '{{i|' + s + '}}'
def sdif(a, b):
    """Summarize a single change between strings *a* and *b*.

    Returns '-' + the part of *a* that differs + ' +' + the part of *b*
    that differs, after trimming the common prefix and the common suffix.
    """
    # length of the common prefix
    start = 0
    while a[start:start + 1] and a[start:start + 1] == b[start:start + 1]:
        start += 1
    old_tail, new_tail = a[start:], b[start:]

    # length of the common suffix of the two tails; stops short of all of old_tail
    shared = 0
    while shared + 1 < len(old_tail) and old_tail[-(shared + 1):] == new_tail[-(shared + 1):]:
        shared += 1

    # special case: improve on -}} {{ +| by backing up three characters
    # so the start of the template is shown as well
    if shared >= 3 and old_tail.startswith('}} {{') and new_tail[:-shared].endswith('|'):
        old_tail, new_tail = a[start - 3:], b[start - 3:]
        shared -= 3

    if not shared:
        return '-' + old_tail + ' +' + new_tail
    return '-' + old_tail[:-shared] + ' +' + new_tail[:-shared]
# sort language sections:
# patterns for the start of one line of a translation table; each captures the
# language name (or the template parameter) in group 1
retransline = re.compile(r'\* \[*([^\]:\{\}]+?)\]*:') # match an already canonicalized line
retransreq = re.compile(r'\* \{\{trreq\|([^\}]+?)\}\}') # trans req template
retranstbc = re.compile(r'\* \{\{ttbc\|([^\}]+?)\}\}') # trans to be checked, allow here?
# markup removed by nlen before measuring a line:
# template openers like "{{name|", leftover brace/bracket/pipe characters, HTML comments
redetemp = re.compile(r'\{\{\w*\|')
redechar = re.compile(r'[\{\}\|\[\]]')
redecomm = re.compile(r'<!--.*?-->')

def nlen(s):
    """Return the estimated number of display lines taken by wikitext line *s*.

    The markup matched by redetemp, redechar and redecomm is removed first;
    the result is 1 plus one more for each full 85 characters that remain
    (a guess at where the line will wrap). Always an int.
    """
    # simplest form:
    # return 1 + len(s)/135 # +1 for each length of line that will probably wrap (WAG)
    # this routine can be tweaked more if needed
    visible = redetemp.sub('', s)
    visible = redechar.sub('', visible)
    visible = redecomm.sub('', visible)
    # floor division keeps the count an integer on every Python version
    return 1 + len(visible) // 85
# reduce text to "safe" for wiki as a template parameter:
# matches runs of brace, bracket, pipe and angle-bracket characters
rewsafe = re.compile(r'[\{\}\[\]\|\<\>]+')
# match a see-only case:
# group 1 is the trans-top gloss, group 2 the [[link]] after "see", group 3 the rest of the section
reseeonly = re.compile(r"\{\{trans-top\|(.+?)\}\}\n+[ :']*[Ss]ee[ ':]*(\[\[.+?\]\])(.*)$", re.S)
# transort: rebuild one translation section with its languages in lkey order.
# tmo is a regex match object.
# NOTE(review): presumably tmo matches a whole {{trans-top}} ... {{trans-bottom}} section - confirm.
# Returns the re-assembled section with {{trans-mid}} placed near the half-way
# point by estimated line count, or a {{trans-see}} template for a section that
# only points elsewhere, or text starting with an {{rfc-tsort}} tag when a
# problem was found.
# NOTE(review): this copy of the source has lost its indentation and several
# expressions are truncated; they are flagged below and reproduced unchanged.
# The per-case 'continue' statements the logic seems to need are not visible either.
def transort(tmo):
# ts: language -> text of its line(s); tsk: language -> estimated display lines (nlen)
ts = { }
tsk = { }
# take apart by language, treat header as "language" nil
# prob holds the latest problem message, prev the language of the preceding line,
# k the running total of estimated display lines
prob = ''
prev = ''
k = 0
# NOTE(review): the iterable of this loop is missing in this copy
for tline in
if tline.startswith('{{trans-top'):
if '' in ts:
prob = "trans-top found inside section, missing trans-bottom?"
ts[''] = tline
tsk[''] = 0
if tline == '{{trans-mid}}': continue
if tline == '{{trans-bottom}}': continue
if not tline: continue
mo = retransline.match(tline)
if not mo: mo = retransreq.match(tline)
if not mo: mo = retranstbc.match(tline)
if mo:
# NOTE(review): right-hand side missing in this copy; presumably the language captured by the pattern
lang =
if lang in ts:
prob = "duplicate language: " + lang
if lang.startswith('{{'):
prob = "unexpected template: " + lang
ts[lang] = tline
nl = nlen(tline)
tsk[lang] = nl
k += nl
prev = lang
if tline.startswith('* '):
prob = "unparsed language line: " + tline
# [tbd: treat ** as a sub language, eg key is "Chinese | Mandarin"]
# continuation lines are appended to the preceding language
if tline.startswith('*:') or tline.startswith('**'): # allow both here
ts[prev] += '\n' + tline
nl = nlen(tline)
tsk[prev] += nl
k += nl
if tline.startswith(': ') and not prev: # e.g. : ''see'' reference
ts[prev] += '\n' + tline
tsk[prev] += 1
k += 1
if tline.startswith('<!--') and not prev:
ts[prev] += '\n' + tline
# no addition to counts
prob = "unknown line format: " + tline
# blank section or nothing worth sorting, do nothing? um, format it default
# if not k: return
# pick up see-only case before looking at prob:
if not prev:
# no languages found
# NOTE(review): the argument of this call is missing in this copy
mo = reseeonly.match(
if mo:
print "matched see in trans section"
# NOTE(review): the next three right-hand sides are missing; presumably groups 1-3 of reseeonly
gloss = # leaves ''s as an issue
target =
if '#' not in target and '|' not in target: target = target.strip('[]')
rest =
# check remainder
rest = rest.replace("{{trans-mid}}", '')
rest = rest.replace("{{trans-bottom}}", '')
if not rest.strip(" '\n"):
if gloss == target: return "{{trans-see|" + target + "}}"
else: return "{{trans-see|" + gloss + "|" + target + "}}"
if prob:
print "in trans section,", safe(prob)
prob = rewsafe.sub(' ', prob) # wiki-safe ;-)
# NOTE(review): the final operand is missing; the comment says the unchanged text follows the tag
return "{{rfc-tsort|" + prob + "}}\n" + # rfc tag + unchanged
# re-assemble, balance columns
# m is the running line total; trans-mid goes in once half of k is reached, then k is zeroed
m = 0
tsnew = ''
for lang in sorted(ts, key=lkey):
tsnew += ts[lang] + '\n'
m += tsk[lang]
if k and m >= (k + 1) / 2:
tsnew += '{{trans-mid}}\n'
k = 0
# if not m: tsnew += '{{trans-mid}}\n'
if '{{trans-mid}}' not in tsnew: tsnew += '{{trans-mid}}\n' # better test? should be the same as not m
tsnew += '{{trans-bottom}}\n'
return tsnew
def prokey(s):
    """Return the sort key for one line of an entry's prolog.

    Keys are single characters compared as strings: recognized right-hand
    side items get '0'-'3', recognized left-hand side items '6'-'7',
    blank lines '9', and anything else '5' after a message is printed.
    Relies on sorted() being stable (true as of Python 2.3).
    """
    # simple prolog sort, LHS after RHS, unknown in the middle
    ranked = (
        ('{{was wotd', '0'),  # moved in monobook
        ('{{wiki', '1'),      # sister templates
        ('{{commons', '1'),   # sister templates
        ('{{inter', '1'),     # sister templates
        ('{{zh-', '2'),       # Chinese floatright
        ('{{ja-', '2'),       # Japanese floatright
        ('[[Image', '3'),     # images
        ('[[image', '3'),     # images
        # LHS:
        ('{{selfref', '6'),
        ('{{also', '7'),
        ('{{xsee', '7'),
        ('{{xalso', '7'),
    )
    for prefix, key in ranked:
        if s.startswith(prefix):
            return key

    if not s:
        return '9'  # blank lines usually are at end, will be removed
    print("prolog sort: no key for %s" % safe(s))
    return '5'
def main():
global naptime
# regex precomp, force headers to canonical:
# first allows singleton =
rehead1 = re.compile(r'(={2,6})(.+?)={2,6}(.*)$')
rehead2 = re.compile(r'(={1,6})([^=<]+?)={1,6}(.*)$')
rehead3 = re.compile(r'(={1,6})([^=<]+?)=+(.*)$')
rehead4 = re.compile(r'(=+)([^=<]+)(.*)$')
realleq = re.compile(r'=+$')
# L2 headers
reL2head = re.compile(r'==?\s*([^=]+)={1,6}(.*)')
# lang= on bad headers, so allow singleton ='s:
reheader = re.compile(r'(={3,6})\s*(.+?)={2,6}(.*)')
reiwiki = re.compile(r'\[\[[-a-z]{2,11}:(.*)\]\]')
recat = re.compile(r'\[\[category:.*?\]\]', re.I)
retrans1 = re.compile(r'\* \[\[w:.+\|([^\]]+?)\]\]\s*:(.*)')
retrans2 = re.compile(r'\* \[\[([^\]]+?)\]\]\s*:(.*)')
retrans3 = re.compile(r'\* ([^:]+?):(.*)')
retrans4 = re.compile(r'\* (\w+)(.*)') # missing :
retag = re.compile(r'\{\{rfc-auto(\|.*?|)}}')
regender = re.compile(r"''([mfcn])''")
reglossfix = re.compile(r'(.+)\(\d+\)$')
retopgloss = re.compile(r'\{\{top(\|.*?|)}}$')
recontext = re.compile(r"^# *\(''(.+?)''\):? ?(.*)$", re.M)
recontext2 = re.compile(r"^# *''\((.+?)\):?'' ?(.*)$", re.M)
recontext3 = re.compile(r"^# *\{\{italbrac\|([^}]+?)}}:? ?(.*)$", re.M)
repronn = re.compile(r'Pronunciation \d+')
# be careful to match and remove newline in these unless they happen to be at the very end:
rerfclevel = re.compile(r"^\{\{rfc-level\|.*\+.*\}\}\n?", re.M)
rerfcxphrase = re.compile(r"^\{\{rfc-xphrase\|.*\}\}\n?", re.M)
rerfcheader = re.compile(r"^\{\{rfc-header\|.*\}\}\n?", re.M)
rerfcsubst = re.compile(r"^\{\{rfc-subst\}\}\n?", re.M)
rerfcpronn = re.compile(r"^\{\{rfc-pron-n\|.*\}\}\n?", re.M)
# italbracs not on context/defn lines, template italbrac->i replacement separate
# limited forms ... nowilink with pipes, no templates, look for : in mo.g3
# look for gloss, etc, * lines to start ...
reibcomma = re.compile(r"^(\*\s*)\(''([^\)^'^\|^\{]+):?''\)(:?)")
reibcomma2 = re.compile(r"^(\*\s*)''\(([^\)^'^\|^\{]+):?\)''(:?)")
# match "stackable" format characters at start of lines, so we can have one space exactly
restack = re.compile(r"^([:#\*]+)\s*")
# regex table (dict, name = tuple of compiled object and replacement)
Regex['subst:PAGENAME'] = (re.compile(r'\{\{PAGENAME}}'), '{{subst:PAGENAME}}')
Regex['template -cattag +context'] = (re.compile(r'\{\{cattag\|'), '{{context|')
Regex['template -Unicode +unicode'] = (re.compile(r'\{\{Unicode\|'), '{{unicode|')
Regex['template -Wikipedia +wikipedia'] = (re.compile(r'\{\{Wikipedia([\|\}])'), r'{{wikipedia\1')
Regex['template -WP +wikipedia'] = (re.compile(r'\{\{WP([\|\}])'), r'{{wikipedia\1')
Regex['template -Acronym +acronym'] = (re.compile(r'\{\{Acronym([\|\}])'), r'{{acronym\1')
Regex['template -Initialism +initialism'] = (re.compile(r'\{\{Initialism([\|\}])'), r'{{initialism\1')
Regex['template -Abbreviation +abbreviation'] = (re.compile(r'\{\{Abbreviation([\|\}])'), r'{{abbreviation\1')
Regex['template -AHD +enPR'] = (re.compile(r'\{\{AHD([\|\}])'), r'{{enPR\1')
# translations
Regex['template -trans-bot +trans-bottom'] = (re.compile(r'\{\{trans-bot\}\}'), '{{trans-bottom}}')
Regex['template -trans-middle +trans-mid'] = (re.compile(r'\{\{trans-middle\}\}'), '{{trans-mid}}')
Regex['elided Translations to be checked header'] = (re.compile(
r'^={3,6}Translations to be checked={3,6}\n*\{\{checktrans', re.M), '{{checktrans')
Regex['elided Translations to be checked header and comment'] = (re.compile(
r'^={3,6}Translations to be checked={3,6}\n*<!--\s*Remove this section.*\n*\{\{checktrans', re.M),
Regex['checktrans and trans-top to checktrans-top'] = (re.compile(
r'^\{\{checktrans\}\}\n*\{\{trans-top\|\w*lations to be \w*\}\}', re.M), '{{checktrans-top}}')
Regex['checktrans/top/mid/bottom to checktrans-top etc'] = (re.compile(
r'^\{\{checktrans\}\}\n*\{\{top\}\}(.*?)^\{\{mid\}\}(.*?)^\{\{bottom\}\}', re.M|re.S),
Regex['template -ttbc-top +checktrans-top'] = (re.compile(r'\{\{ttbc-top\}\}'), '{{checktrans-top}}')
Regex['template -ttbc-mid +checktrans-mid'] = (re.compile(r'\{\{ttbc-mid\}\}'), '{{checktrans-mid}}')
Regex['template -ttbc-bottom +checktrans-bottom'] = (re.compile(r'\{\{ttbc-bottom\}\}'),
Regex['template -trad +t'] = (re.compile(r'\{\{trad\|'), '{{t|')
Regex['template -trad- +t-'] = (re.compile(r'\{\{trad-\|'), '{{t-|')
Regex['un-indent {{see}} template'] = (re.compile(r'^:\{\{see\|', re.M), '{{see|')
Regex['template -cpl +{{c|p}}'] = (re.compile(r'\{\{c\.?pl\.?}}'), '{{c|p}}')
Regex['template -fpl +{{f|p}}'] = (re.compile(r'\{\{f\.?pl\.?}}'), '{{f|p}}')
Regex['template -pl. +{{p}}'] = (re.compile(r'\{\{pl\.}}'), '{{p}}')
Regex['template -m. +{{m}}'] = (re.compile(r'\{\{m\.}}'), '{{m}}')
Regex['template -f. +{{f}}'] = (re.compile(r'\{\{f\.}}'), '{{f}}')
Regex['template -mf +{{m|f}}'] = (re.compile(r'\{\{mf}}'), '{{m|f}}')
Regex['template -fn +{{f|n}}'] = (re.compile(r'\{\{fn}}'), '{{f|n}}')
Regex['template -fp +{{f|p}}'] = (re.compile(r'\{\{fp}}'), '{{f|p}}')
Regex['template -mp +{{m|p}}'] = (re.compile(r'\{\{mp}}'), '{{m|p}}')
Regex['template -fm +{{m|f}}'] = (re.compile(r'\{\{fm}}'), '{{m|f}}')
Regex['template -nf +{{f|n}}'] = (re.compile(r'\{\{nf}}'), '{{f|n}}')
Regex['template -nm +{{m|n}}'] = (re.compile(r'\{\{nm}}'), '{{m|n}}')
# given name, preferred syntax
Regex['xx: to lang=xx in given name template'] = (
re.compile(r'(\{\{given name[^\}]*?\|)\|?([-a-z]{2,10}):\}\}'), r'\1lang=\2}}')
Regex['from language to from=language in given name template'] = (
re.compile(r'(\{\{given name[^\}]*?\|)from ([-a-zA-Z ]+)\|?([\}\|])'), r'\1from=\2\3')
# table format lines, row divs to one "-"
Regex['table |--* to |-'] = (re.compile(r'^\|--+', re.M), r'|-')
# stuff left from preload templates
# careful this first one starts with 3 {'s, check previous character? not for now
Regex['remove template subst detritus'] = (re.compile('\{\{\{[0-9a-z]+\|(.*?)\}\}\}'), r'\1')
Regex['remove template subst detritus #if etc'] = (re.compile('\{\{#\w+:\|\|?\}\}'), r'')
# temp for esbot leftovers:
Regex['remove esbot:catline'] = (re.compile('\{\{esbot:catline.*\{\{ending\}{5,5}'), r'')
# script code replacements, first a dict, then generate the two regex forms for each:
for sc in Scripts:
Regex['script template -'+sc+' +'+Scripts[sc]] = (re.compile(r'\{\{'+sc+r'\|'), '{{'+Scripts[sc]+'|')
Regex['script parameter -sc='+sc+' +sc='+Scripts[sc]] = (
re.compile(r'\|sc='+sc+r'([\}\|])'), '|sc='+Scripts[sc]+r'\1')
# whoa(!)
# see templates
Regex['template -see +also'] = (re.compile(r'\{\{see\|'), r'{{also|')
Regex['template -See +also'] = (re.compile(r'\{\{See\|'), r'{{also|')
Regex['template -see also +also'] = (re.compile(r'\{\{see also\|'), r'{{also|')
# fix Japanese sees, allow a line for kanjitab after header (do not use re.S)
Regex['Japanese see/also in section to ja-see-also'] = \
(re.compile(r'^(==Japanese==\n*.*\n*){\{(see|also)\|', re.M), \
Regex['add language in front of {{t}}'] = (re.compile(r'^\*? *\{\{t(\+|-|)\|([a-z-]+)\|', re.M), \
r'* {{\2}}: {{t\1|\2|')
# (a few more general Regex below)
StarTemp = set([ 'Han ref', 'ja-readings', 'ethnologue', 'websters-online', 'pedialite',
'Hanja ref', 'Linguist List', 'IPA', 'SAMPA', 'enPR', 'ISO 639', 'R:1913' ])
restartemp = re.compile(r'\{\{(.+?)[\|\}]')
# trans lines gender templates regex, ordered list:
Trex = [ ]
# first replace ' cases with templates, look for leading space:
Trex.append((re.compile(r" ''([mfcn])''"), r' {{\1}}'))
Trex.append((re.compile(r" ''(pl|plural)''"), ' {{p}}'))
Trex.append((re.compile(r" ''(sg|sing|singular)''"), ' {{s}}'))
Trex.append((re.compile(r" ''m( and| or|,|/|) ?f''"), ' {{m|f}}'))
# now look for combinations:
Trex.append((re.compile(r"\{\{([mfcn])}},? \{\{([fcn])}},? \{\{([cnps])}}"), r'{{\1|\2|\3}}'))
Trex.append((re.compile(r"\{\{([mfcn])}},? \{\{([fcnps])}}"), r'{{\1|\2}}'))
# hmmm...
Trex.append((re.compile(r"\{\{t([\+\-]?)\|([^\|]*?)\|([^\|]*?)\|mf}}"), r'{{t\1|\2|\3|m|f}}'))
# match trans sections
retransect = re.compile(r"^\{\{trans-top.*?^\{\{trans-bottom\}\}\n", re.M|re.S)
# Pronunciate
# like Regex, but applied line by line only in pronunciation sections
# use ^ and $ as needed with re.M for prescreen
Prex['template enPR/IPA/SAMPA'] = \
(re.compile(r'^\*? ?([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
r'* {{enPR|\1}}, {{IPA|/\2/}}, {{SAMPA|/\3/}}')
Prex['template enPR/IPA/SAMPA (RP, UK, US)'] = \
(re.compile(r"^\*? ?\(''(RP|UK|US)''\):? *"
r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
r'* {{a|\1}} {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')
Prex['template enPR/IPA/SAMPA with {a}'] = \
(re.compile(r"^\*? ?(\{\{a\|[^\}]+\}\}):? *"
r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
r'* \1 {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')
Prex['+rhymes template'] = (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:English:-(?P<s>.+?)\|-(?P=s)\]\]"),
# w/O "Rhymes:":
Prex['+rhymes template w/Rhymes: in link'] = \
(re.compile("^([\*:]+) *\[\[[Rr]hymes:English:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
r'\1 {{rhymes|\2}}')
Prex['+rhymes template (Finnish)'] = (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|-(?P=s)\]\]"),
Prex['+rhymes template w/Rhymes: in link (Finnish)'] = \
(re.compile("^([\*:]+) *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
r'\1 {{rhymes|\2|lang=fi}}')
Prex['+rhymes template w/Rhymes: in link (French)'] = \
(re.compile("^([\*:]+) *\[\[[Rr]hymes:French:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
r'\1 {{rhymes|\2|lang=fr}}')
Prex['+rhymes template (Icelandic)'] = \
(re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Icelandic:-(?P<s>.+?)\|-(?P=s)\]\]"),
Prex['template -Rhymes +rhymes'] = (re.compile(r'\{\{Rhymes([\|\}])'), r'{{rhymes\1')
# multiple rhymes (assume language matches! ;-)
Prex['add additional rhyme to template'] = \
(re.compile(r'(\{\{rhymes\|[^\}]+)\}\} *(,|or|) *\[\[[Rr]hymes:[A-Za-z -]+:-(?P<s>.+?)\| ?-(?P=s)\]\]'),
Prex["rm /'s from enPR template"] = (re.compile(r'\{\{enPR\|/([^ /\[\]\{\}]+?)/\}\}'), r'{{enPR|\1}}')
# Accent / region labels at the start of a pronunciation line -> {{a|LABEL}}.
# Each rule accepts an optional "*" bullet and optional space, then the label
# wrapped in any run of ( [ { ' and closed by any run of ] ) } : ' ; the line
# start is rewritten as "* {{a|LABEL}}".
# RP, UK, and US in a wide variety of cases
Prex['(RP) to {{a|RP}}'] = (re.compile(r"^\*? ?[\(\[\{']+RP[\]\)\}:']+", re.M), r'* {{a|RP}}')
Prex['(UK) to {{a|UK}}'] = (re.compile(r"^\*? ?[\(\[\{']+UK[\]\)\}:']+", re.M), r'* {{a|UK}}')
Prex['(US) to {{a|US}}'] = (re.compile(r"^\*? ?[\(\[\{']+US[\]\)\}:']+", re.M), r'* {{a|US}}')
# the same three labels written with {{italbrac|...}}, optionally linked and followed by ":"
Prex['(italbrac RP) to {{a|RP}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*RP\]*\}\}:?", re.M), r'* {{a|RP}}')
Prex['(italbrac UK) to {{a|UK}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*UK\]*\}\}:?", re.M), r'* {{a|UK}}')
Prex['(italbrac US) to {{a|US}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*US\]*\}\}:?", re.M), r'* {{a|US}}')
# "IPA: (WEAE)" has the label after the IPA tag; move it in front
Prex['IPA: [[WEAE]] to {{a|WEAE}} IPA:'] = \
(re.compile(r"^\*? ?IPA: [\(\[\{']+WEAE[\]\)\}:']+", re.M), r'* {{a|WEAE}} IPA:')
# piped link [[w:G...|GenAm]].  The label is spelled "GenAm", as the rule key says
# and as the General American rule further down emits; the replacement used to
# produce "GenAM", which is a different template argument.
Prex['(GenAm) to {{a|GenAm}}'] = (re.compile(r"^\*? ?\[\[w:G[^\|]+\|GenAm\]\]", re.M), r'* {{a|GenAm}}')
Prex['(Canada) to {{a|Canada}}'] = (re.compile(r"^\*? ?[\(\[\{']+Canada[\]\)\}:']+", re.M), r'* {{a|Canada}}')
Prex['(Australia) to {{a|Australia}}'] = \
(re.compile(r"^\*? ?[\(\[\{']+Australia[\]\)\}:']+", re.M), r'* {{a|Australia}}')
Prex['(Aus) to {{a|Aus}}'] = (re.compile(r"^\*? ?[\(\[\{']+Aus[\]\)\}:']+", re.M), r'* {{a|Aus}}')
# two exact-text cases: the accent name is linked but displayed as US / UK,
# so the generic rules above do not see them; re.escape makes the text literal
Prex['(GenAm|US) to {{a|GenAm}}'] = \
(re.compile('^' + re.escape("* (''[[General American|US]]'')"), re.M),
r'* {{a|GenAm}}')
Prex['(RecPr|UK) to {{a|RP}}'] = \
(re.compile('^' + re.escape("* (''[[Received Pronunciation|UK]]'')"), re.M),
r'* {{a|RP}}')
# Bare "IPA: /.../", "SAMPA: /.../", "X-SAMPA: /.../" and "AHD: ..." lines become
# {{IPA}}, {{SAMPA}}, {{X-SAMPA}} and {{enPR}}.  Every rule tolerates a missing
# bullet, keeps an optional leading {{a|...}} accent tag (group 1), and accepts
# the label plain or wikilinked (optionally through w:).
_astart = r"^\*? ?(\{\{a\|.+?\}\} *|)"
Prex['template IPA'] = (
    re.compile(_astart + r"\[*(w:IPA\||)IPA\]*:? *([/\[][^\{\|\}/\]]+?[/\]])$", re.M),
    r'* \1{{IPA|\3}}')
# same, with the transcription wrapped in {{IPAchar|...}}, which is dropped
Prex['template IPA -IPAchar'] = (
    re.compile(_astart + r"\[*(w:IPA\||)IPA\]*:? *\{\{IPAchar\|([/\[][^\{\|\}/\]]+?[/\]])\}\}$", re.M),
    r'* \1{{IPA|\3}}')
# SAMPA / X-SAMPA: an optional <tt>...</tt> wrapper (groups 4 and 6) is not copied
Prex['template SAMPA'] = (
    re.compile(_astart + r"\[*(w:SAMPA\||)SAMPA\]*:? *([/\[])(<tt>|)([^\|\}/]+?)(</tt>|)([/\]])$", re.M),
    r'* \1{{SAMPA|\3\5\7}}')
# AHD has no delimiters; a single token without spaces is taken as the value
Prex['template enPR (was AHD)'] = (
    re.compile(_astart + r"\[*(w:AHD\||)AHD\]*:? *([^ \{\|\}/]+?)$", re.M),
    r'* \1{{enPR|\3}}')
Prex['template X-SAMPA'] = (
    re.compile(_astart + r"\[*(w:X-SAMPA\||)X-SAMPA\]*:? *([/\[])(<tt>|)([^\{\|\}/]+?)(</tt>|)([/\]])$", re.M),
    r'* \1{{X-SAMPA|\3\5\7}}')
# "/a/, /b/", "/a/ or /b/" and "/a/ ''or'' /b/" inside one template -> two parameters
for _tmpl in ('IPA', 'enPR', 'SAMPA'):
    Prex['or/comma to multiple parameters in %s template' % _tmpl] = (
        re.compile(r"\{\{" + _tmpl + r"\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"),
        r'{{' + _tmpl + r'|\1|\3}}')
# Italicised, parenthesised accent lists at the start of a bullet, e.g.
# "* (''[[w:Foo|Foo]], Bar'')", become "* {{a|Foo|Bar}}" (covers the A-cai/Min Nan
# style among others).  One rule per list length, 1 to 4 names; each name may be
# plain or a (piped) link, and only the displayed text is kept.
_accent = r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
for _n in range(1, 5):
    _pattern = r"^\* \(''" + r", *".join([_accent] * _n) + r"''\):?"
    # each name contributes two groups; the even-numbered one is the display text
    _params = '|'.join([r'\%d' % (2 * _k) for _k in range(1, _n + 1)])
    Prex['+accent template %d' % _n] = (re.compile(_pattern, re.M), r'* {{a|' + _params + '}}')
# hyphenation ...
# First rule: "Hyphenation: xxx" at the end of the text handed to it (optionally
# bold/italic, optional colon) is wrapped as {{hyphenation|xxx}}.
Prex['+hyphenation template'] = (re.compile(r"'*Hyphenation:?'*:? *([^ \{\}]+)$"), r'{{hyphenation|\1}}')
# The remaining rules match a separator character inside an existing
# {{hyphenation|...}}: U+00B7 (middle dot), U+2027 (hyphenation point), and a
# third "(HTML)" form.
# NOTE(review): in this copy all three stop after the pattern -- the replacement
# string and closing parenthesis are missing; per the keys they should turn the
# separator into "|".  Restore from the working copy.
Prex['middot to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u00B7' + '(.+?\}\})'),
Prex['hyphpt to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u2027' + '(.+?\}\})'),
# NOTE(review): the "(HTML)" rule presumably matched the &middot; entity; here the
# pattern contains a literal middle dot character -- confirm against the working copy.
Prex['middot (HTML) to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)·(.+?\}\})'),
# "blank" IPA/SAMPA/AHD, include new-line, so put these in general regex
# A bullet holding only a linked scheme name and an empty "//" transcription:
# the IPA one becomes a {{rfp}} request, the SAMPA and AHD ones are deleted outright.
_blank = r'^\* \[\[%s\]\]:? *//\n'
Regex['replaced IPA // with {{rfp}}'] = (re.compile(_blank % 'IPA', re.M), '{{rfp}}\n')
for _scheme in ('SAMPA', 'AHD'):
    Regex['removed %s //' % _scheme] = (re.compile(_blank % _scheme, re.M), '')
# {{IPA|...}} with no "=" anywhere inside (i.e. no named parameter yet); group 1
# is everything up to the closing braces so that lang= can be appended
reIPAlang = re.compile(r'(\{\{IPA\|[^}=]+)\}\}')
# Runs of consecutive canonical bullet lines ("* {{enPR|..}}", "* {{IPA|..}}",
# "* {{SAMPA|..}}") that can be combined onto one line; each template's
# arguments are captured in order of appearance.
_pron_line = r"\* \{\{%s\|(.*?)\}\}"
_enpr, _ipa, _sampa = [_pron_line % _t for _t in ('enPR', 'IPA', 'SAMPA')]
repronsing3 = re.compile("^" + r"\n".join((_enpr, _ipa, _sampa)), re.M)   # enPR, IPA, SAMPA
repronsing3a = re.compile("^" + r"\n".join((_ipa, _sampa, _enpr)), re.M)  # IPA, SAMPA, enPR
repronsing2 = re.compile("^" + r"\n".join((_ipa, _sampa)), re.M)          # IPA, SAMPA only
# "{{X of|word}}" templates: wikilink the target word so that the page contains
# a link and is counted.  Two rules are generated per template name.
Forms = [
    'es-verb form', 'superlative', 'comparative', 'alternative spelling', 'alternative form',
    'past', 'archaic spelling', 'fi-participle', 'present participle',
    'feminine', 'diminutive', 'obsolete spelling', 'infinitive',
    'plural', 'fi-form', 'pt-verb form',
]
Frex = { }
for form in Forms:
    summary = 'make page count: add link in {{%s of}}' % form
    opener = r'\{\{(' + form + r') of'
    # word first: {{X of|word}} or {{X of|word|name=value}}
    Frex[summary] = (
        re.compile(opener + r'\|([\w-]+)(}}|\|[^=}\[]+=[^}\[]+}})'),
        r'{{\1 of|[[\2]]\3')
    # named parameter first: {{X of|name=value|word}}
    Frex[summary + ' 2'] = (
        re.compile(opener + r'([^=}\[]+=[^}\[]+)\|([\w-]+)}}'),
        r'{{\1 of\2|[[\3]]}}')
# make sure we are logged in
site = wikipedia.getSite("en", "wiktionary")
site.forceLogin(sysop = True)
site.forceLogin(sysop = False)
# get our config pages, throw exceptions: we have to stop if we can't read these
print "read languages"
page = wikipedia.Page(site, "User:AutoFormat/Languages")
langtab = getwikitext(page)
print "read headers"
page = wikipedia.Page(site, "User:AutoFormat/Headers")
headtab = getwikitext(page)
print "read Top40"
page = wikipedia.Page(site, "Wiktionary:Translations/Wikification")
top40tab = getwikitext(page)
print "read contexts"
page = wikipedia.Page(site, "User:AutoFormat/Contexts")
ctxtab = getwikitext(page)
print "read etys"
page = wikipedia.Page(site, "User:AutoFormat/Ety temps")
etytab = getwikitext(page)
# Language tables built from the wikitext in langtab:
#   Lcodes  : language code -> language name
#   Ltocode : language name -> a language code
# Only lines matching "| x||y" (relangtab) are used; i counts the codes found.
# NOTE(review): the expressions that read the match groups are missing from this
# copy (the "for code in", "Lcodes[...] =" and "Ltocode[...] =" lines are
# truncated), so which column holds codes and which holds the name cannot be
# confirmed from here.  Restore from the working copy.
Lcodes = { }
Ltocode = { }
relangtab = re.compile(r'\| (.*?)\|\|(.*)')
i = 0
for line in langtab.splitlines():
mo = relangtab.match(line)
if mo:
# presumably one column is a comma-separated list of codes, each mapped to the name
for code in','):
Lcodes[code.strip()] =
i += 1
# the reverse map keeps only the first code of the comma-separated list
Ltocode[] =',')[0].strip()
print "found %d language codes" % i
# Regional variants of a few codes occur in category names; map each one to its
# parent language so they are recognised as well
Lcodes.update({
    'zh-cn': 'Mandarin',
    'zh-tw': 'Mandarin',
    'nan-cn': 'Min Nan',
    'nan-tw': 'Min Nan',
    'yue-cn': 'Cantonese',
    'yue-hk': 'Cantonese',
})
# Header tables, filled from the headers config page:
#   Level : header -> minimum header level
#   L43   : headers whose level column reads "4/3"
#   POS   : headers flagged POS
#   NS    : headers flagged NS (no singular/plural variants are generated for these)
#   Hfix  : lower-cased spelling or variant -> canonical header
Level, L43, POS, NS, Hfix = { }, { }, { }, { }, { }
# headers that close an entry; at level 3 they end the last numbered etymology section
EOS = [
    'See also', 'References', 'External links',
    'Anagrams', 'Dictionary notes', 'Trivia',
]
# non-POS headers that are pushed to level 4 inside numbered etymology sections
TOS = [
    'Pronunciation', 'Alternative spellings',
    'Alternative forms', 'Production',
]
# Han character headers: no {{defn}} line is added for these and they are not pushed down
# NOTE(review): 'Hanza' may be a typo for 'Hanja' -- confirm before changing
HAN = ['Han character', 'Kanji', 'Hanzi', 'Hanza']
# template-style headers treated like POS headers (tuple, used with str.startswith)
HT = ('{{abbreviation', '{{initialism', '{{acronym', '{{numeral')
# one row of the headers table: "| header || level || x || y || variants"
reheadtab = re.compile(r'\| (.*?)\|\|\s*([1-5/]*)\s*\|\|(.*?)\|\|(.*?)\|\|(.*)')
# Fill Level / L43 / NS / POS / Hfix from the rows of headtab that match reheadtab;
# i counts the headers found.
# NOTE(review): every expression that reads a match group is missing from this
# copy ("header =", "if == '4/3':", "int(", "if == 'NS':", "if == 'POS':",
# "for variant in"), so the column-to-group mapping cannot be confirmed from
# here.  Restore from the working copy.
i = 0
for line in headtab.splitlines():
mo = reheadtab.match(line)
if mo:
header =
# level column "4/3": recorded in L43 and given minimum level 4
if == '4/3':
L43[header] = True
Level[header] = 4
print "header %s is 4/3" % header
else: Level[header] = int(
# ns is also kept as a local flag: it suppresses the singular/plural variants below
if == 'NS': ns = NS[header] = True
else: ns = False
if == 'POS': POS[header] = True
# each listed variant spelling maps (lower-cased) to the canonical header
for variant in','):
variant = variant.lower().strip()
if not variant: continue
Hfix[variant] = header
if not ns:
# NOTE(review): variant[-1] is the last character only (always 's' here), not the
# variant without its final s; variant[:-1] was presumably intended -- confirm
if variant.endswith('s'): Hfix[variant[-1]] = header
else: Hfix[variant + 's'] = header
# the canonical header itself, lower-cased, also maps to the header
Hfix[header.lower()] = header
if not ns:
# NOTE(review): same as above -- header.lower()[-1] is just the final character
if header.endswith('s'): Hfix[header.lower()[-1]] = header
else: Hfix[header.lower() + 's'] = header
i += 1
print "found %d headers" % i
# Numbered etymology headers, Etymology 1 through Etymology 24, are known
# headers with minimum level 3
for i in range(1, 25):
    numbered = 'Etymology %d' % i
    Hfix[numbered.lower()] = numbered
    Level[numbered] = 3
# Top40 and Classics are dicts used as sets of language names, read from the
# bullet lines ("* ...") of top40tab.  A line starting with "----" clears inT40;
# i counts Top40 entries and j counts Classics entries.
# NOTE(review): this copy is missing the expression on the "lang =" line and,
# presumably, an "else:" between the Top40 branch and the "in both" check -- as
# listed, the nesting of the lines after "if inT40:" cannot be determined.
# Restore from the working copy.
Top40 = { }
Classics = { }
retop40tab = re.compile(r'\*\s*(.*)')
i = j = 0
inT40 = True
for line in top40tab.splitlines():
if line.startswith('----'): inT40 = False
mo = retop40tab.match(line)
if mo:
# presumably the bullet text with surrounding spaces and [[ ]] stripped
lang =' []')
else: continue
if inT40:
Top40[lang] = True
i += 1
# warn when a name is already in Top40 at the point it is added to Classics
if lang in Top40:
print "language %s in both Top40 and Classics?" % safe(lang)
Classics[lang] = True
j += 1
print "found %d Top 40 languages" % i
print "found %d Classic languages" % j
# add all other known languages not in Top40:
i = 0
for code in Lcodes:
lang = Lcodes[code]
if lang not in Top40 and lang not in Classics:
if lang == 'English': continue
Classics[lang] = True
i += 1
# print "added Classic: %s" % safe(lang)
print "added %d languages to Classics" % i
# Contexts maps a context label to its replacement text, read from the rows of
# ctxtab that look like "| ''label'' || value"; i counts the entries stored.
# NOTE(review): the right-hand sides of "m1 =" and "m2 =" are missing from this
# copy (presumably the two match groups, possibly stripped) -- restore from the
# working copy.
Contexts = { }
rectxtab = re.compile(r"\|\s*''(.*?)''\s*\|\|(.*)")
i = 0
for line in ctxtab.splitlines():
mo = rectxtab.match(line)
if mo:
m1 =
m2 =
# rows with an empty label or empty value are ignored
if not m1 or not m2: continue
# only use first, table at top over-rides auto, templates over-ride redirects
if m1 not in Contexts: Contexts[m1] = m2
i += 1
print "found %d context templates" % i
# turn on/off for now
# (when False, the per-line context code only prints what it would have replaced)
contextp = True
# Etyl conversions
# For each row of etytab of the form "| {{temp|Xx.}} || code ||" a rule is added
# to the whole-text Regex table rewriting {{Xx.}} or {{Xx.|yy}} as
# {{etyl|code}} / {{etyl|code|yy}}; i counts the rules added.
# NOTE(review): the right-hand sides of "m1 =" and "m2 =" are missing from this
# copy (presumably the template name and the language code from the match) --
# restore from the working copy.
reetytab = re.compile(r'\| ?\{\{temp\|([A-Z][A-Za-z]*)\.\}\} ?\|\| ?([a-z]{2,3}) ?\|\|')
i = 0
for line in etytab.splitlines():
mo = reetytab.match(line)
if mo:
m1 =
m2 =
if not m1 or not m2: continue
# add regex:
# (m1 is inserted unescaped; reetytab only admits ASCII letters for it)
Regex['convert %s. to etyl|%s' % (m1, m2)] = \
(re.compile(r'\{\{' + m1 + r'\.(\|[a-z]{2,3}|)\}\}'), r'{{etyl|' + m2 + r'\1}}')
print "add regex to convert %s. to etyl|%s" % (m1, m2)
i += 1
print "found %d ety template conversions" % i
# run counters: entries is incremented once per page read in the main loop
entries = 0
fixed = 0
# (specific stats)
# Set up set of all headers that are valid (at L3 or higher)
# NOTE(review): the body of this loop is missing from this copy -- restore from
# the working copy.
for header in Level:
# Sigh. True means prohibited from changing 4/3 levels
# (initial value only; the main loop re-rolls it for every page)
Connel = True
for page in rcpages(site):
naptime += 3
days = (time.time() - 1199145600) / 86400 # days since 1 Jan 08
if random() < days/370: Connel = False # some of the time, as they need to be checked
else: Connel = True
title = page.title()
print "page %s" % safe(title)
if ':' in title:
print "not in main namespace"
if title.lower() == 'main page':
print "skip Main page ..."
entries += 1
# text = page.get()
text = getwikitext(page)
origtext = text
except wikipedia.NoPage:
print "Can't get %s from en.wikt" % safe(title)
text = ''
except wikipedia.IsRedirectPage:
print "Redirect page %s" % safe(title)
text = ''
except wikipedia.LockedPage:
print "Locked/protected page %s" % safe(title)
text = ''
acts = set()
mo =
if mo:
if' |'):
acts.add('rm tag:' +' |'))
acts.add('rm tag')
text = retag.sub('', text)
# rfc level trickery
newtext = rerfclevel.sub('', text)
if newtext != text:
print 'took out rfc-level'
acts.add('rm rfc-level tag')
text = newtext
# same for xphrase
newtext = rerfcxphrase.sub('', text)
if newtext != text:
print 'took out rfc-xphrase'
acts.add('rm rfc-xphrase tag')
text = newtext
# same for header
newtext = rerfcheader.sub('', text)
if newtext != text:
print 'took out rfc-header'
acts.add('rm rfc-header tag')
text = newtext
# same for subst
newtext = rerfcsubst.sub('', text)
if newtext != text:
print 'took out rfc-subst'
acts.add('rm rfc-subst tag')
text = newtext
# same for pron-n
newtext = rerfcpronn.sub('', text)
if newtext != text:
print 'took out rfc-pron-n'
acts.add('rm rfc-pron-n tag')
text = newtext
if '{{rfc' in text: rfc = True
#elif '{{rfc|' in text: rfc = True
#elif '{{rfc-' in text: rfc = True
else: rfc = False
rfcact = ''
# overall regex, using table
for rx in Regex:
newtext = Regex[rx][0].sub(Regex[rx][1], text)
if newtext != text:
text = newtext
# report multiple blank lines (force save), will be taken out by parsing
if '\n\n\n\n' in text:
# 3 or more, not just 2
acts.add("remove multiple blank lines")
# categories found in the entry or implied by context and perhaps inflection templates
catseen = set()
# now parse. take the entry apart into languages (ha!)
curr = '*prolog'
last = ''
Lsect = { '*prolog':[ ], '*iwiki':[ ] }
Lcats = { }
waslinked = [ ]
divs = 0
header = ''
for line in text.splitlines():
# canonical headers first. some later code is redundant, but so what? it does "rest"
if line and line.startswith('='):
mo = rehead1.match(line)
if not mo: mo = rehead2.match(line)
if not mo: mo = rehead3.match(line)
if not mo: mo = rehead4.match(line)
# must match 4 or else what?! (all eq = is the answer to this question!)
if not mo:
mo = realleq.match(line)
if mo: acts.add("remove line of only ='s")
else: acts.add('remove bogus = line')
oline = line
level = len(
if not
acts.add('removed nil header') # !!!
line = ''
else: line = '='*level + + '='*level +
if line != oline: acts.add('format headers')
# L2 headers
mo = reL2head.match(line)
if mo:
header =
hf = reunlink.sub(r'\1', header)
if hf != header:
if '|' in hf: hf = hf.split('|')[1]
if hf not in Top40: waslinked.append(hf)
elif hf not in Level: acts.add('unlink language header ' + hf)
header = hf
# validate language [needs to be fixed for case before first lang section!]
if header.capitalize() in Level:
if not rfc:
text = '{{rfc-level|' + header + ' as level 2 header}}\n' + text
rfcact = 'add rfc-level tag for L1/2 header ' + header
rfc = True
print "(no edit, bad L2 header and rfc)"
rfcact = 'bad L1/2 header ' + header
# try fixing, move to min level for this header:
level = Level[header.capitalize()]
acts.add('L1/2 header ' + header + ' to L' + str(level))
# header + anything else, will get moved later
Lsect[curr].append('='*level + header + '='*level +
continue # with current language section
# subst code template
if header.startswith('{{'):
if header[2:-2] in Lcodes:
hf = Lcodes[header[2:-2]]
acts.add('L2 header -' + header + ' +' + hf)
header = hf
# check sort order
if header and last and lkey(header) < lkey(last):
acts.add(last + '/' + header + ' sorted into order')
last = header
if header not in Lsect:
Lsect[header] = [ ]
Lcats[header] = [ ]
acts.add('merged ' + header + ' sections')
curr = header
acts.add('stuff after L2 header moved')
# look for iwiki
mo = reiwiki.match(line)
if mo and == title:
# wiki format + one space
line = restack.sub(r'\1 ', line)
# trailing spaces
if len(line) > 2 and line.startswith('=') and line.endswith(' '): acts.add('rm spaces after header')
line = line.rstrip()
# take out dividers
if line.startswith('----'):
if line == '----': divs += 1
# other lines
# any language sections?
if len(Lsect) == 2:
# no, tag if not tagged
if ( 'nolanguage/box' not in text and '{{wikify' not in text and
'{{delete' not in text and '{{only in' not in text ):
text = '{{subst:nolanguage}}\n' + text
rfcact = 'tagged nolanguage'
rfc = True
print "(no edit, tagged nolanguage, wikify or delete)"
continue # next entry
# each section
for lang in Lsect:
if lang.startswith('*'): continue
if lang in Ltocode: lcode = Ltocode[lang]
else: lcode = ''
# find Etymologies first
etys = [ ]
etycount = 0
fh = True
for i, line in enumerate(Lsect[lang]):
# look for ety headers, and Pronunciation first at L4
mo = reheader.match(line)
if mo:
level = len(
header =
# rest =
# special case pronunciation, occurs with some frequency
if fh and level != 3 and fuzzy(header.lower(), 'pronunciation', 11) >= 11 and len(header) < 15:
acts.add('Pronunciation changed to level 3')
Lsect[lang][i] = '===' + header + '==='
# and leave fh set:
# just do fuzzy!
if fuzzy(header.lower(), 'etymology', 7) >= 7 and len(header) < 20:
if level != 3:
if fh:
# first header, okay to fix!
acts.add('Etymology changed to level 3')
# and leave fh set:
etycount += 1
elif not rfc:
Lsect[lang][i] = line + '{{rfc-level|Etymology not at level 3|lang=%s}}'%lcode
acts.add('+{{rfc-level|Etymology not at level 3}}')
rfc = True
print "(ety not at L3 and already rfc)"
etycount += 1
fh = False
# then fix/rewrite the ety headers, use sub to handle rest, report any changes (spacing an issue):
if etycount:
for i in range(etycount):
line = Lsect[lang][etys[i]]
# print 'ety check replace ' + line
if etycount > 1: newline = reheader.sub(r'===Etymology %d===\3' % (i+1), line)
else: newline = reheader.sub(r'===Etymology===\3', line)
if newline.strip('= ') != line.strip('= '):
acts.add('header -' + line.strip('= ') + ' +' + newline.strip('= '))
Lsect[lang][etys[i]] = newline
# sigh, think that's it? Sweet, if true...
# general format
newlines = [ ]
inPos = inTrans = inPro = inext = defnext = False
npos = 0
ety = nety = 0
levelact = ''
rfctag = ''
header = ''
for line in Lsect[lang]:
# minor spacing on stackable wiktext ...
# already done line = restack.sub(r'\1 ', line)
# move cats, may be something else on the line too, or multicats ...
# first we need a cat-present predicate
catp = False
for cat in recat.findall(line):
ocat = cat
catp = True
catname = cat[11:-2].split('|')[0]
catname = re.sub('_', ' ', catname).strip()
cf = cat.find('|')
if cf > 0: cat = '[[Category:' + catname + cat[cf:]
else: cat = '[[Category:' + catname + ']]'
# we have a canonical cat! is it a novel cat?
if cat in catseen:
acts.add('rm dup cat [[:' + cat[2:])
# rm bad cats from substs left around, see how this works
if '{{{' in cat:
acts.add('rm bad cat [[:' + cat[2:])
if cat != ocat: acts.add('canonical cats')
# see if it belongs in a different sect
catmove = False
if ':' in catname:
catcode = catname.split(':')[0]
if catcode in Lcodes:
catlang = Lcodes[catcode]
if catlang != lang and catlang in Lcats:
acts.add('category ' + catname + ' moved to ' + catlang + ' section')
catmove = True
elif not catname.lstrip(' 01').startswith(lang) and not catname.endswith('derivations'):
for other in Lcats:
if other == lang: continue
if catname.lstrip(' 01').startswith(other+' '):
acts.add('category ' + catname + ' moved to ' + other + ' section')
catmove = True
# not moved
if not catmove: Lcats[lang].append(cat)
if catp:
line = recat.sub('', line).strip()
if not line: continue
# headers
mo = reheader.match(line)
if mo:
# hit header with no infl/defn line in previous section?
if inext:
acts.add('added inflection line for %s/%s' % (lang, header))
newlines.append(infline(title, lcode, header))
inext = False
defnext = True
if defnext and header not in HAN:
newlines.append('# {{defn|%s}}' % lang)
acts.add('no definition line for %s/%s added {defn}' % (lang, header))
level = len(
header =
rest =
# unlink header
hf = reunlink.sub(r'\1', header)
if hf != header:
if hf.find('|') > 0: hf = hf.split('|')[1]
acts.add('header -' + header + ' +' + hf)
header = hf
# fix header
if header.lower() in Hfix:
hf = Hfix[header.lower()]
if hf != header:
acts.add('header -' + header + ' +' + hf)
header = hf
# try a fuzzy!
if header.lower() not in Hfix and not header.startswith('{{'):
high = 0
replac = ''
hf = header.strip('[]{}').lower()
for val in sorted(Hfix):
# first character must match
if hf[0] != val[0]: continue
rawsc = fuzzy(hf, val, len(val) - 4)
print safe('fuzzy "%s" "%s" score %d' % (hf, val, rawsc))
if rawsc > high and rawsc > max(max(len(hf), len(val)) - 3, 5):
high = rawsc
replac = val
print safe('fuzzy for %s: %s score %d' % (hf, replac, high))
if high:
hf = Hfix[replac]
acts.add('header -' + header + ' +' + hf)
header = hf
# tag Transitive and Intransitive verb, and Reflexive
if header.lower() in ('transitive verb', 'intransitive verb', 'reflexive verb') and not rfc:
rfctag = '{{rfc-trverb|' + header + '}}'
rfc = True
# print "trans/intrans header: %s" % safe(header)
# tag X phrase
if header.endswith(' phrase') and not rfc:
rfctag = '{{rfc-xphrase|' + header + '}}'
rfc = True
# print "X phrase header: %s" % safe(header)
# tag Pronunciation N headers, preventing the level errors later
if repronn.match(header) and not rfc:
# not sure if we need the header in the template, but follows the pattern (with a |)
rfctag = '{{rfc-pron-n|' + header + '}}'
rfc = True
# rfc unrecognized, ignore templates for now, use NS later
if header.lower() not in Hfix and not rfc and not header.startswith('{{'):
rfctag = '{{rfc-header|' + header + '}}'
rfc = True
# print "unknown header: %s" % safe(header)
# min level, set and comp for nested ety
if level == 3 and header.startswith("Etymology") and etycount > 1:
ety = 1
nety += 1
npos = 0
push = False
if ety:
# if we are in the last ety sect, and see end of section things at L3:
if level < 4 and nety == etycount and header in EOS: inPos = ety = 0
# and ... independent of connel flag, because we always push ;-)
if level < 4 and nety == etycount and header in L43: inPos = ety = 0
# push POS (or level 3?) sections down in ety, push flag because of Connel fix
# may be a good idea anyway ... yes, but if we rfc, stop
if ety and not rfc:
if (header in POS and header not in HAN or header in TOS) and level == 3:
level = 4
acts.add('header in ety sect ' + header + ' to L' + str(level))
if header == 'Pronunciation':
rfctag = '{{rfc-level|check placement of Pronunciation}}'
push = True
elif header in POS and header not in HAN or header in TOS:
# at correct level! (or too deep already)
push = False
elif push and header in Level and (level == 4 or level < Level[header] + ety):
level += 1
acts.add('header in ety sect ' + header + ' to L' + str(level))
elif level < 4: push = False
# code to shift header levels (general case in POS), disabled per Connel, 18.4.7
if inPos and header in L43:
if npos < 2 and level < 4 + ety:
if not Connel:
level = 4 + ety
acts.add('header ' + header + ' to L' + str(level))
else: levelact = ' (AutoFormat would have corrected level of ' + header +')'
elif inPos and header in Level:
if level < Level[header] + ety:
if not Connel:
level = Level[header] + ety
acts.add('header ' + header + ' to L' + str(level))
else: levelact = ' (AutoFormat would have corrected level of ' + header +')'
# now tag remaining problems if any, various cases
# should all contain "+" for the re-visit trick ...
if not rfc:
if level == 4 + ety and not inPos and header in POS and header not in NS:
rfctag = '{{rfc-level|' + header + ' at L4+ not in L3 Ety section' + levelact + '}}'
elif level == 4 + ety and not inPos and header in Level and header not in NS:
rfctag = '{{rfc-level|' + header + ' at L4+ not in L3 POS section' + levelact + '}}'
elif level == 3 + ety and header.startswith('Translation'):
rfctag = '{{rfc-level|' + header + ' at L3+' + levelact + '}}'
elif level == 5 + ety and not inTrans and header.startswith('Translations to'):
rfctag = '{{rfc-level|' + header + ' at L5+, not in Translations' + levelact + '}}'
# blank line
# header + anything else that wasn't blank
newlines.append('='*level + header + '='*level)
if rest.strip():
if not rest.startswith('{{rfc-'): acts.add('moved stuff after ' + header + ' header')
# Usage notes can be anywhere (see ELE)
if 'rfc-level|Usage notes' in rfctag: rfctag = ''
# suppress the "AF would have" now, just don't tag:
if "AutoFormat would have" in rfctag: rfctag = ''
if rfctag:
if lcode: rfctag = rfctag[:-2] + '|lang=%s}}'%lcode
acts.add('+' + rfctag)
if 'check placement' not in rfctag: rfc = True
rfctag = ''
# set flags:
inext = defnext = False
if level < 4 + ety and (header in POS or header.startswith(HT)):
inext = inPos = True
npos += 1
elif level < 4 + ety: inPos = False
inTrans = (header == 'Translations')
tt = False
inPro = (header == 'Pronunciation')
# look for inflection line
if inext:
if line.startswith('{{') and not line.startswith('{{wikipedia') or line.startswith("'''") or \
fuzzy(line, title, len(title) - 1) > len(title) - 1:
if line == title:
acts.add('replace unformatted headword')
inext = False
defnext = True
if line and line.startswith('#'):
acts.add('added inflection line for %s/%s' % (lang, header))
newlines.append(infline(title, lcode, header))
defnext = True
inext = False
# and also do next case for defnext
# elide blanks above inflection line
if not line: continue
# look for definition lines
if defnext and line.startswith('#'):
defnext = False
# # used where it shouldn't be
if line.startswith('#') and header not in POS:
if header in TOS or header in EOS or (header in Level and Level[header] == 4):
line = '*' + line[1:]
acts.add("-# +* in %s section" % header)
# serious stuff ...
if line.startswith('# '):
# look for context tag
if lang in Ltocode:
ctxn = 1
mo = recontext.match(line)
if not mo:
ctxn = 2
mo = recontext2.match(line)
if not mo:
ctxn = 3
mo = recontext3.match(line)
if mo:
print "match context tag %s" % safe(
tname = cpar(, Contexts)
if mo and tname:
if lang != 'English': tname += '|lang=' + Ltocode[lang]
if contextp and ctxn == 1:
acts.add("-(''" + + "'') +{{" + tname + "}}")
line = recontext.sub(r'# {{' + tname + r'}} \2', line)
elif contextp and ctxn == 2:
acts.add("-''(" + + ")'' +{{" + tname + "}}")
line = recontext2.sub(r'# {{' + tname + r'}} \2', line)
elif contextp and ctxn == 3:
acts.add("-{{italbrac|" + + "}} +{{" + tname + "}}")
line = recontext3.sub(r'# {{' + tname + r'}} \2', line)
else: print "would have replaced %s with %s" % (safe(, safe(tname))
# elide cats that correspond
for catname in tname.split('|'):
if catname == 'context' or catname.startswith('lang='): continue
catname = catname[0].upper() + catname[1:]
# code is prefix ...
if lang != 'English': catname = Ltocode[lang] + ':' + catname
if contextp:
catseen.add('[[Category:' + catname + ']]')
# catseen.add('[[Category:' + catname + 's]]')
print "added catseen %s" % safe(catname)
# wikilinking?
# (remember to correct for spacing)
elif not line.startswith('#') and not inTrans and "''" in line:
# look for italbrac cases not on defn lines
newl = reibcomma.sub(ibsub, line)
newl = reibcomma2.sub(ibsub, newl)
if newl != line:
# acts.add('-' + line + ' +' + newl)
# acts.add('template i')
# in pronunciation, use a, anywhere else, we want i-c if at start of * line
if inPro:
newl = re.sub(r'\{\{(i|i-c)\|', '{{a|', newl)
newl = re.sub(r'\{\{i\|', '{{i-c|', newl)
acts.add(sdif(line, newl))
line = newl
# think that will work?
# translations lines
# stopgap check: (should be improved, tsort knows how to handle this)
if '{{ttbc|' in line: inTrans = False
if inTrans:
# special indent rule, we know there is a previous line
if line.startswith(': ') and newlines[-1:][0].startswith('*'):
acts.add('-: +*: in trans')
line = '*' + line
# similar rule for :*, we leave ** alone (is correct for grouped language)
# may have intended **, but this is better than leaving it :*
if line.startswith(':* ') and newlines[-1:][0].startswith('*'):
acts.add('-:* +*: in trans')
line = '*:' + line[2:]
was = False
mo = retrans1.match(line)
if not mo: mo = retrans2.match(line)
if mo: was = True
if not mo: mo = retrans3.match(line)
if not mo:
mo = retrans4.match(line)
if mo: # missing ':'
tlang =
if tlang in Top40 or tlang in Classics:
acts.add("added : after %s in translations" % tlang)
else: mo = None
if mo:
tlang =
if was and tlang.find('|') > 0: tlang = tlang.split('|')[1]
trest =
if tlang.startswith('{{') and tlang[2:-2] in Lcodes:
acts.add('subst %s in trans' % tlang)
tlang = Lcodes[tlang[2:-2]]
was = False
if was and (tlang in Top40 or title == tlang):
acts.add('trans unlink ' + tlang)
elif not was and tlang in Classics and title != tlang:
tlang = '[[' + tlang + ']]'
acts.add('trans link ' + tlang)
elif was:
# leave as is (was)
tlang = '[[' + tlang + ']]'
# conform gender specification templates
# tr = regender.sub(r'{{\1}}', trest)
tr = trest
for rx in Trex:
tr = rx[0].sub(rx[1], tr)
if tr != trest:
#acts.add('gender -' + trest + ' +' + tr)
acts.add('gender ' + sdif(trest, tr))
trest = tr
if trest: line = '* ' + tlang + ': ' + trest
else: line = '* ' + tlang + ':'
# convert templates
# has to be a non-blank previous line, we are in trans section
if line == '{{rfc-trans}}': inTrans = False
if line == '{{checktrans}}': inTrans = False
if line == '{{checktrans-top}}': inTrans = False
if line == '{{ttbc-top}}': inTrans = False
mo = retopgloss.match(line)
if mo:
gloss =[1:]
prev = newlines[-1:][0]
while not prev:
newlines = newlines[:-1]
prev = newlines[-1:][0]
if prev.startswith(';'): gloss = prev[1:]
elif prev.startswith("'''") and prev.endswith("'''"): gloss = prev[3:-3]
else: gloss = ''
if gloss: newlines = newlines[:-1]
if gloss:
gloss = reglossfix.sub(r'\1', gloss).strip()
prev = line
line = '{{trans-top|' + gloss + '}}'
# <- else: line = '{{trans-top}}'
acts.add('-' + prev + ' +' + line)
tt = True
if tt and line == '{{mid}}':
line = '{{trans-mid}}'
if tt and line == '{{bottom}}':
# add blank line
line = ''
tt = False
# end of trans
# templates that should have * outside them
mo = restartemp.match(line)
if mo and in StarTemp:
line = '* ' + line
acts.add('* before ' +
# pronunciation specific
if inPro:
refire = True
while refire:
refire = False
for rx in Prex:
if "enPR" in rx and lcode != "en": continue
line, k = Prex[rx][0].subn(Prex[rx][1], line)
if k:
refire = True # fire ruleset again
if 'IPA' in line and lcode and lcode != 'en' and '|lang=' not in line:
line, k = reIPAlang.subn(r'\1|lang=' + lcode + '}}', line)
if k: acts.add('added lang=' + lcode + ' to IPA')
if line == '{{rfp}}' and lcode and lcode != 'en':
line = '{{rfp|lang=' + lcode + '}}'
acts.add('added lang=' + lcode + ' to rfp')
# move {{also}} to prolog, we are in a language section
if line.startswith("{{also|"):
acts.add("moved {{also}} to prolog")
# all else
# at end with no infl / defn line in previous section?
if inext:
acts.add('added inflection line for %s/%s' % (lang, header))
newlines.append(infline(title, lcode, header))
inext = False
defnext = True
if defnext and (header not in HAN or npos == 1):
newlines.append('# {{defn|%s}}' % lang)
acts.add('no definition line for %s/%s added {defn}' % (lang, header))
# done with sect
Lsect[lang] = newlines
# reassemble ...
newtext = ''
prior = False
# sort prolog, and add to newtext
if len(Lsect) > 2:
pcopy = sorted(Lsect['*prolog'], key=prokey) # shallow copy, sorted
if pcopy != Lsect['*prolog']: acts.add('sorted prolog')
else: pcopy = Lsect['*prolog'] # no language sections, leave "prolog" alone
for line in pcopy:
# no blank lines
if line: newtext += line + '\n'
if line.startswith('=') and not rfc:
newtext += '{{rfc-level|header line in prolog, before first L2 header}}\n'
acts.add('tagged header before first L2 header')
del Lsect['*prolog']
blank = True # not really, this is to suppress blank before 1st L2 header
for lang in sorted(Lsect, key=lkey):
if lang == '*iwiki': continue
if prior:
if not blank: newtext += '\n'
newtext += '----\n\n'
divs -= 1
prior = True
if lang not in waslinked: newtext += '==' + lang + '==\n'
else: newtext += '==[[' + lang + ']]==\n'
blank = False
for line in Lsect[lang]:
# no dup blank lines
if line or not blank: newtext += line + '\n'
if line: blank = False
else: blank = True
if Lcats[lang]:
if not blank: newtext += '\n'
# (note lkey is a different function, but does strip brackets, so works ...)
for cat in sorted(Lcats[lang], key=lkey): newtext += cat + '\n'
blank = False
del Lsect[lang]
# residual tag(s):
if ('{{{' in newtext and '}}}' in newtext) or '{{#' in newtext:
acts.add('+{{rfc-subst}} syntax tag')
newtext += '{{rfc-subst}}\n\n' # force newline even if at end
blank = True
# add the iwikis
if not blank: newtext += '\n'
for line in Lsect['*iwiki']:
# no blank lines
if line: newtext += line + '\n'
if divs != 0: acts.add("fixed ----'s")
# rfc-level, etc trickery
for rfname in ('level', 'xphrase', 'header', 'subst', 'pron-n'):
if 'rm rfc-' + rfname + ' tag' in acts:
for ac in sorted(acts):
if ac.startswith('+{{rfc-' + rfname):
acts.remove('rm rfc-' + rfname + ' tag')
print 'elided -' + rfname + ' +' + rfname
# sort translations if any, if not tagged already:
if "{{trans-top" in newtext and "{{rfc-tsort" not in newtext:
new2 = retransect.sub(transort, newtext)
if new2 != newtext:
if "{{trans-see" in new2 and "{{trans-see" not in newtext: acts.add("+trans-see template")
if "{{rfc-tsort" not in new2: acts.add("sorted/rebalanced translations")
else: acts.add("tagged translations table problem")
newtext = new2
# do some combining of pron lines, now that we've done the rulesets:
newtext, k = repronsing3.subn(r"* {{enPR|\1}}, {{IPA|\2}}, {{SAMPA|\3}}", newtext)
if k: acts.add("combined enPR, IPA, SAMPA on one line")
# variant order
newtext, k = repronsing3a.subn(r"* {{enPR|\3}}, {{IPA|\1}}, {{SAMPA|\2}}", newtext)
if k: acts.add("combined enPR, IPA, SAMPA on one line")
newtext, k = repronsing2.subn(r"* {{IPA|\1}}, {{SAMPA|\2}}", newtext)
if k: acts.add("combined IPA and SAMPA on one line")
# if page isn't "countable", see if we can add a link in a form-of template
if '[[' not in newtext:
for rx in Frex:
newtext, k = Frex[rx][0].subn(Frex[rx][1], newtext)
if k:
break # only need one
if '[[' not in newtext: print "page still not counted in stats"
# do minor spacing 1% of the time that there is nothing else to do
if not acts and random() < 0.01 and newtext.rstrip(' \n') != text.rstrip(' \n'):
acts.add('minor spacing')
# if we added a major rfc, just do that, dump the rest of the work!!
if rfcact:
acts = set()
newtext = text
act = ', '.join(sorted(acts))
# some change, write it (even just rm tag)
if act:
fixed += 1
naptime /= 2
print "format %s: %s" % (safe(title), safe(act))
saved = False
retries = 5
while not saved and retries:
# try to fix the entry
currtext = getedit(page)
if currtext.strip('\n ') != origtext.strip('\n '):
print "page changed while doing format, not saved"
saved = True
except wikipedia.PageNotSaved:
print "failed to save page"
# other action?
except socket.timeout:
print "socket timeout, maybe not saving page"
except socket.error:
print "socket error, maybe not saving page"
except Exception, e:
print "some other error saving page, no retry"
print str(e)
# put throttle will do: if not saved: time.sleep(30)
retries -= 1
# end loop
print "entries fixed %d" % fixed
# done
if __name__ == "__main__":