User:Tbot/code/tbot
#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Tbot/code/tbot

"""
This bot looks for t/t-/t+ template calls and does updates

Command line arguments (all optional):

    -skip:n     skip to update n
    -dmatch     report debug no-match conditions
    -nocreate   do not create new Tbot entries
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import socket
from script import script, scriptp
from random import random
from mwapi import getwikitext, getedit
import threading
import Queue
import shelve
from time import time, sleep

plock = threading.Lock()

def srep(s):
    return repr(u''+s)[2:-1]

# try having a bit of fun with threads, do XML reads in a thread that feeds main proc
# combined with the cache of sections and iwikis, allows fast start

# shelf that allows Unicode keys, just wrap around rather than sub-class, only need a few methods
# make thread safe as well

class myshelf():
    def __init__(s, fn):
        s.shelf = shelve.open(fn)
        s.lock = threading.Lock()
    def __contains__(s, k):
        with s.lock:
            r = (srep(k) in s.shelf)
        return r
    def __setitem__(s, k, v):
        # optimize no change case:
        with s.lock:
            sk = srep(k)
            if sk in s.shelf and s.shelf[sk] == v: return
            s.shelf[sk] = v
    def __getitem__(s, k):
        with s.lock:
            r = s.shelf[srep(k)]
        return r
    def sync(s):
        with s.lock:
            s.shelf.sync()
    def close(s):
        with s.lock:
            s.shelf.close()
        # del works when s.shelf deref'd

# work cache, record time last looked at local or FL entry
# each record is key: lc:word, pickled with srep(), value is integer time()

cache = myshelf("tbot-cache")

# persistent cache of sections and iwikis

Lsects = myshelf("tbot-lsects")
Iwikis = myshelf("tbot-iwikis")

from logger import Logger
logpage = Logger(wikipedia.getSite('en', 'wiktionary'), "User:Tbot/log")

from createflw import createFLentry

Lcodes = { }
Langtoc = { }

# set of titles seen and still on queue
from weakref import WeakValueDictionary
wtit = WeakValueDictionary()

tlocal = threading.local()
tlocal.repact = True

# thread routine to read XML and load section/iwiki cache, feed to main

pageq = Queue.Queue()  # with 5K loaded, process size ~70MB, soft limited below

retemp = re.compile(ur'\{\{t[\-\+\u00f8]?\|.*?}}')
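# Illustrative sketch (hypothetical words and entries, not taken from the wiki) of the
# template calls retemp matches and tsub, defined further down, rewrites:
#
#   {{t|sv|ord|n}}  ->  {{t+|sv|ord|n}}   [[ord]] carries an sv: interwiki, so the sv.wikt entry exists
#   {{t|sv|ord|n}}  ->  {{t-|sv|ord|n}}   no sv: interwiki on [[ord]], entry presumed missing on sv.wikt
#   {{t|xx|word}}   ->  {{tø|xx|word}}    no xx.wikt exists at all
#
# Parameters are also canonicalized into the order |alt= |tr= |sc= |xs= .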
def xread():

    tlocal.repact = False

    while True:

        re2head = re.compile(r'^==([^=]+?)==$', re.M)
        reiwiki = re.compile(r'^\[\[([-a-z]{2,10}):(.*?)\]\]$', re.M)

        # build table of section counts and iwikis:

        # get XML dump
        dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")

        ts = 0
        ti = 0
        entries = 0
        for entry in dump.parse():
            text = entry.text
            title = entry.title
            if title.find(':') >= 0: continue
            entries += 1

            if entries % 1000 == 0:
                with plock:
                    print "prescan %d entries, %d iwikis, %d sections, %d on queue" % \
                          (entries, ti, ts, pageq.qsize())
                Iwikis.sync()
                Lsects.sync()
                # pace a bit more, amusingly en.wikt is 1.44 million entries at this writing
                # nominally 4 hours
                if pageq.qsize() > 100: sleep(10)

            # for my system, temporarily (?) 10.1.10
            if entries % 10 == 0: sleep(1)   # take a long time, not go CPU bound for long

            # filter for those we want to check

            tag = False
            newtext = retemp.sub(tsub, text)
            if newtext != text:
                # temporarily (?), do only some with sc= elisions (may miss some other changes ;-)
                if newtext.count('sc=') < text.count('sc='):
                    if newtext.count('t+') != text.count('t+'): tag = True
                    if newtext.count('t-') != text.count('t-'): tag = True
                    if newtext.count('xs=') > text.count('xs='): tag = True
                    # some others caught by the seven percent rule etc following
                else:
                    tag = True

            # look at ones we are modifying anyway, plus try to capture more sometimes
            # if not in cache or expired, pick up sometime, scatter over ~14 days, 1/2 of cache time
            if '==English==' in text and '===Translations===' in text:
                if newtext.count('sc=') < text.count('sc='):
                    # try to avoid more edits for sc= elision only
                    if random() < 0.01: tag = True
                else:
                    if random() < 0.07: tag = True

            if tag and title not in wtit:
                ckey = 'en:' + title
                if ckey not in cache or cache[ckey] < time() - (35 * 24 * 3600):
                    wtit[title] = entry
                    pageq.put(entry)

            # soft limit queue size, e.g. 20K if 20 seconds to eat each entry
            # larger effect is to cause process/thread schedule event
            sleep(pageq.qsize()/1000.0)

            # add iwikis:
            iw = [ ]
            for mo in reiwiki.finditer(text):
                if mo.group(2) == title:
                    iw.append(mo.group(1))
                    ti += 1
            Iwikis[title] = iw

            # now, Lsects is an optimization to preclude calls to createflw, filter so it won't be
            # uselessly large, it is okay if we miss some (for now): [ try keeping all ]
            """
            if "form of" in text: continue   # SemperBlottoBot, others
            if " of|" in text: continue      # Keenebot2, others (Jyril: "{{fi-form of|")
            """

            ll = [ ]
            for lang in re2head.findall(text):
                lang = lang.strip(' []')
                if lang == "English": continue
                if lang not in Langtoc: continue
                lc = Langtoc[lang]
                if lc not in Exists: continue
                ll.append(lc)
                ts += 1
            if ll: Lsects[title] = ll

            pass   # end of prelim XML loop

        print "XML scan complete"
        Iwikis.sync()
        Lsects.sync()

        # put a pass marker on the queue, and then we continue
        pageq.put(None)

        # take a day off, and do it again
        # queue may very well last that long, and we may get new XML
        # sleep(86400)
        # don't sleep very long at once, makes interrupt difficult!
        # for i in range(24*60, 0, -1):

        # not one day yet, just some hours!
        hours = 14   # see what works well? shouldn't need to be much < 24?
        for i in range(hours*60, 0, -1):
            if not i%10:
                with plock: print "(sleeping, next scan in %d minutes)" % i
            sleep(60)

Exists = set()

Tlang = set(['ar', 'da', 'de', 'el', 'es', 'fi', 'fr', 'he', 'it', 'ja', 'ko',
             'nl', 'no', 'pt', 'ru', 'sv',
             'bg', 'bs', 'ca', 'cmn', 'cs', 'et', 'hr', 'hu', 'is', 'ku', 'la',
             'pl', 'ro', 'sr', 'sk', 'sl', 'te', 'th', 'tr', 'hy', 'mk'])

# |ar=Arab|fa=fa-Arab|ur=ur-Arab
# |xcl|hy=Armn|be|bg|mk|ru|uk=Cyrl|cu=Cyrs|sa|hi=Deva|got=Goth|el=Grek|grc=polytonic
# |he|yi|arc=Hebr|ja=Jpan|ko=Kore|ta=Taml|te=Telu|th=Thai

Xyzzy = dict(ar='Arab', fa='fa-Arab', ur='ur-Arab', xcl='Armn', hy='Armn',
             be='Cyrl', bg='Cyrl', mk='Cyrl', ru='Cyrl', uk='Cyrl', cu='Cyrs',
             sa='Deva', hi='Deva', got='Goth', el='Grek', grc='polytonic',
             he='Hebr', yi='Hebr', arc='Hebr', ja='Jpan', ko='Kore',
             ta='Taml', te='Telu', th='Thai')
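# Rough illustration of how Xyzzy is used by tsub below (hypothetical template calls,
# not from any real entry): a |sc= that merely restates the default script for the
# language code is elided, anything else is kept, e.g.
#   {{t|bg|дума|f|sc=Cyrl}}  ->  sc=Cyrl dropped, Cyrl being the Xyzzy default for bg
#   {{t|sr|реч|f|sc=Cyrl}}   ->  sc=Cyrl kept, sr having no Xyzzy default
# and codes not listed in Tlang get an |xs=<language name> hint appended.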
# translatable POS from English, other things may be very different POS ...

POS = set(['Noun', 'Verb', 'Adverb', 'Adjective', 'Pronoun', 'Proper noun',
           'Preposition', 'Conjunction', 'Interjection', 'Article' ])

# others we want to recognize and avoid, at least for now
Stops = set(['Prefix', 'Suffix', 'Affix', 'Infix', 'Counter', 'Initialism', 'Abbreviation',
             'Letter', 'Symbol', 'Acronym', 'Proverb', 'Contraction', 'Idiom', 'Phrase',
             'Syllable', 'Reflexive verb', 'Transitive verb', 'Intransitive verb' ])

TTBC = ('{{checktrans', '{{ttbc', '{{rfc-trans')

# substitute templates:

relcodeword = re.compile(r'\{\{t[\+\-]?\|([-a-z]*?)\|(.*?)[\|\}]')
rexsect = re.compile(r'\|xs=[^\|\}]*([\|\}])')
rescect = re.compile(r'\|sc=[^\|\}]*([\|\}])')
reflag = re.compile(r'\{\{t[^\|]?\|')
recode = re.compile(r'(\{\{t[^\|]?\|)([-a-z]*?)\|')
refindsc = re.compile(r'\|sc=([^\|\}]*)[\|\}]')

# canonicalize option order: (match if not at end, so we can move to end)
reorderalt = re.compile(r'\|alt=([^\|]*)\|(.*)}}')
reordertr = re.compile(r'\|tr=([^\|]*)\|(.*)}}')
reordersc = re.compile(r'\|sc=([^\|]*)\|(.*)}}')
reorderxs = re.compile(r'\|xs=([^\|]*)\|(.*)}}')

repipe = re.compile(r'\[\[[^\]]*\|')   # enough to catch a piped link [[...|

def tsub(tmo):

    tin = tmo.group(0)

    # look for some bad cases, to be tagged:
    # either a piped link or a subtemplate call will scramble parsing
    if repipe.search(tin) or '{{' in tin[2:]:
        tout = '{{rfc-tbot}}' + tin
        return tout

    tout = tin
    while tout.endswith('|}}'): tout = tout[:-3] + '}}'

    # canonical order, move each to end in turn
    if 'alt=' in tout: tout = reorderalt.sub(r'|\2|alt=\1}}', tout)
    if 'tr=' in tout: tout = reordertr.sub(r'|\2|tr=\1}}', tout)
    if 'sc=' in tout: tout = reordersc.sub(r'|\2|sc=\1}}', tout)
    if 'xs=' in tout: tout = reorderxs.sub(r'|\2|xs=\1}}', tout)

    mo = relcodeword.match(tout)
    if mo:
        code = mo.group(1)
        word = mo.group(2)
    else:
        code = ''
        word = ''

    # fix some codes, ISO /3 to /1, and some other things, like nb to no
    if code in Lcodes:
        nc = Langtoc[Lcodes[code]]
        if nc != code:
            tout = recode.sub(r'\1' + nc + '|', tout)
            if tlocal.repact:
                with plock: print "changed code %s to %s" % (code, nc)
            code = nc

    if 'sc=' not in tout and code and word and ord(word[0:1]) >= 0x0370:
        sc = script(word, code, report = tlocal.repact)
        if sc and (code not in Xyzzy or sc != Xyzzy[code]):
            tout = tout[:-2] + '|sc=' + sc + '}}'
            # move xs to end again if needed
            if 'xs=' in tout: tout = reorderxs.sub(r'|\2|xs=\1}}', tout)

    if 'sc=' in tout:
        mo = refindsc.search(tout)
        if mo: sc = mo.group(1)
        else: sc = ''
        # script parameter not needed?
        # (may be different from default, suppress if = to Xyzzy code)
        if code in Xyzzy and sc == Xyzzy[code]:
            tout = rescect.sub(r'\1', tout)
            if tlocal.repact:
                with plock: print "elided script %s for code %s" % (sc, code)

    if code and code in Lcodes:
        if code not in Tlang:
            xs = '|xs=' + Lcodes[code]
        else:
            xs = ''
        if tout.find('|xs=') >= 0:
            tout = rexsect.sub(xs + r'\1', tout)
        else:
            tout = tout[:-2] + xs + '}}'

    if code:
        if code in Exists:
            if tout.startswith(u'{{t\u00f8|'): tout = reflag.sub('{{t|', tout)
            if word in Iwikis:
                if code in Iwikis[word]: tout = reflag.sub('{{t+|', tout)
                if code not in Iwikis[word]: tout = reflag.sub('{{t-|', tout)
        else:
            tout = reflag.sub(u'{{t\u00f8|', tout)

    if tout != tin and tlocal.repact:
        print "%s, code %s, word %s: %s" % (srep(tin), srep(code), srep(word), srep(tout))

    return u'' + tout

# transliterate words, use to match transliterations, use our own dict which we can preload as needed

import transliteration
trac = { }
tlitob = transliteration.transliterator()   # work around gratuitous class defn

def tlit(s):
    n = u''
    for c in s:
        if c not in trac:
            # and work around nasties in the transliteration.trans routine
            try:
                trac[c] = u'' + tlitob.transliterate(c, default = c)
            except UnicodeDecodeError:
                trac[c] = c
        n += trac[c]
    n = n.lower()
    return n

# now have some serious recursion fun!
# fuzzy returns string match score
# r is min required, calls may have neg r, may return value < r

def fuzzy(a, b, r):
    if not a or len(a) < r: return 0
    if not b or len(b) < r: return 0
    if a == b: return len(a)
    if a[:1] == b[:1]: return 1 + fuzzy(a[1:], b[1:], r-1)
    if a[-1:] == b[-1:]: return 1 + fuzzy(a[:-1], b[:-1], r-1)

    # try with each char forward
    p = a.find(b[0:1])
    if p >= 0: sca = 1 + fuzzy(a[p+1:], b[1:], r-1)
    else: sca = 0
    p = b.find(a[0:1])
    if p >= 0: scb = 1 + fuzzy(b[p+1:], a[1:], r-1)
    else: scb = 0

    # no match either/or way, skip this char, one or both
    if not sca and not scb: sk = fuzzy(a[1:], b[1:], r)
    elif not sca: sk = fuzzy(a, b[1:], r)
    elif not scb: sk = fuzzy(a[1:], b, r)
    else: sk = 0

    return max(sk, sca, scb)

def main():

    socket.setdefaulttimeout(30)

    nap = 5   # +1,/2 adaptive naptime
    cis = 0

    skip = 0
    dmatch = False
    nocreate = False
    for arg in sys.argv[1:]:
        if arg.startswith('-skip:'):
            skip = int(arg[6:])
            print "will skip to update %d" % skip
        elif arg == '-dmatch':
            dmatch = True
            print "will report debug no match conditions"
        elif arg == '-nocreate':
            nocreate = True
            print "will not create new Tbot entries"
        else:
            print "unknown command line argument %s" % arg

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    meta = wikipedia.getSite(code = "meta", fam = "meta")

    # get language codes
    page = wikipedia.Page(site, "User:AutoFormat/Languages")
    langtab = getwikitext(page)

    relangtab = re.compile(r'\|(.*?)\|\|(.*)')
    i = 0
    for line in langtab.splitlines():
        mo = relangtab.match(line)
        if mo:
            for code in mo.group(1).split(','):
                Lcodes[code.strip()] = mo.group(2).strip()
                i += 1

    # invert, with some forces, make sure nb goes to no, fix the Chinese languages
    # things like ido to io are handled by the length
    Lcodes['ido'] = 'Ido'
    Lcodes['nb'] = 'Norwegian'
    Lcodes['nob'] = 'Norwegian'
    Langtoc['Norwegian'] = 'no'
    Langtoc['Mandarin'] = 'cmn'
    Langtoc['Min Nan'] = 'nan'
    Langtoc['Cantonese'] = 'yue'
    for code in Lcodes:
        if Lcodes[code] not in Langtoc:
            Langtoc[Lcodes[code]] = code
        elif Lcodes[code] in ['Mandarin', 'Min Nan', 'Cantonese']: continue
        elif len(code) < len(Langtoc[Lcodes[code]]):
            Langtoc[Lcodes[code]] = code

    print "found %d language codes" % i
    if i < 460: return
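    # By way of example (hypothetical values, the actual table comes from
    # User:AutoFormat/Languages): Lcodes maps codes to names, so Lcodes['fr'] and
    # Lcodes['fra'] might both be 'French', while Langtoc is the inversion preferring
    # the shortest code, giving Langtoc['French'] == 'fr'.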
    # get active wikt list
    # page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    # gratuitous move, 20.4.9:
    page = wikipedia.Page(meta, "Wiktionary/Table")
    existtab = getwikitext(page)

    # reextab = re.compile(r'^\[\[:([a-z-]+):')
    reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
    i = 0
    for line in existtab.splitlines():
        i += 1
        mo = reextab.match(line)
        if mo:
            # toki pona and klingon we don't want, bo and sc are borked, to is closed (21.5.8)
            if mo.group(1) in [ 'tokipona', 'tlh', 'bo', 'sc', 'to', 'sh' ]: continue
            Exists.add(mo.group(1))

    print "%d lines" % i
    print "found %d active wikts" % len(Exists)
    if len(Exists) < 160: return

    # add codes for Mandarin and Min Nan, cantonese doesn't exist yet
    Exists.add('cmn')
    Exists.add('nan')

    # headers, only need 3,4,5
    rehead = re.compile(r'^={3,5}(.+?)={3,5}$', re.M)
    regloss = re.compile(r'\{\{trans-top\|(.+?)}}')

    # trans line, start simple, without and with wikilinking, 1 is lang, 2 is rest
    retrans = re.compile(r'\*\s*([-\w ]+):(.*)')
    retransw = re.compile(r'\*\s*\[\[([-\w ]+)\]\]:(.*)')

    # wikitext format stackable at the start of a line
    restack = re.compile('^([#:\*]+)\s*')

    Tlist = [
        # first sets are optimizations, catch common cases, and uncollapsed gender/number for simple
        # match patterns for translations lines, simplest first, must match all
        r'^\[\[(?P<flw>[- \w]+)\]\]$',
        r'^\[\[(?P<flw>[- \w]+)\]\]\s*\{\{(?P<g>[mfcn])}}(?:\s*\{\{(?P<g2>[fcnps])}}|)$',
        # with transliteration
        r'^\[\[(?P<flw>[- \w]+)\]\]\s*\((?P<tra>[\w]+)\)$',
        r'^\[\[(?P<flw>[- \w]+)\]\]\s*\((?P<tra>[\w]+)\)\s*\{\{(?P<g>[mfcn])}}(?:\s*\{\{(?P<g2>[fcnps])}})$',
        # with sections
        r'^\[\[(?P<flw>[- \w]+)#(?P<sect>[- \w]+)\|(?P<alt>[- \w]+)\]\]$',
        r'^\[\[(?P<flw>[- \w]+)#(?P<sect>[- \w]+)\|(?P<alt>[- \w]+)\]\]\s*\{\{(?P<g>[mfcn])}}$',
        # with scripts
        r'^\{\{(?P<scr>[-\w]+)\|\[\[(?P<flw>[- \w]+)\]\]}}$',
        r'^\{\{(?P<scr>[-\w]+)\|\[\[(?P<flw>[- \w]+)\]\]}}\s*\((?P<tra>[\w]+)\)(:?\s*\{\{(?P<g>[mfcn])}})$',

        # following are cases not caught by general regular expression

        # cases of link to FL wikt
        r'^\[\[:(?P<iwk>[-a-z]+):(?P<flw>[^\|\]#\{}:]+)\|(?P<alt>[^\|\]#\{}:]]+)\]\]'
            '(?:\s*\{\{(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)}}|)$',

        # three cases for a single unlinked word (don't allow phrases, trans might be one of the words)
        r'^(?P<flw>[\w]+)$',
        r'^(?P<flw>[\w]+)\s*\{\{(?P<g>[mfcn])}}(?:\s*\{\{(?P<g2>[fcnps])}}|)$',
        r'^(?P<flw>[\w]+)$'
            '(?:\s*\{\{(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)}}|)$',

        # other oddities, bold word (self page ref), ''m pl'' and friends.
        r"^'''(?P<flw>[\w]+)'''$",
        r"^\[\[(?P<flw>[- \w]+)\]\]\s+''(?P<g>[mfcn])\s+(?P<g2>[p])l''$"
    ]
    Trex = [ ]
    for rel in Tlist: Trex.append(re.compile(rel, re.U))

    # then we will try general case (long regex ;-)
    Grex = re.compile(r'^'
        r'(?:\{\{(?P<scr>[-\w]+)\||)'        # script template, or nothing
        r'\[\[(?P<flw>[^\|\]#\{}:]*)'        # start of link, always, any chars not link/template syntax
        r'(?:#(?P<sect>[- \w]+)|)'           # possible section ref
        r'(?:\|(?P<alt>[^\|\]#\{}:]+)|)'     # possible alt, any chars not syntax
        r'\]\]'                              # end of link, always
        r'(?:}}|)'                           # end of script if present (not checked)
        r"(?:\s*\((?P<tra>[- '\w]+)\)|)"     # possible transliteration
        r'(?:\s*\{\{(?P<g>[mfcnps])'         # possible gender or number
        r'(?:\|(?P<g2>[fcnps])'              # within that, possible second gender/number
        r'(?:\|(?P<g3>[cnps])|)'             # within that, possible third gender/number or nothing
        r'|)}}|)'                            # or not second gender, or none at all
        r'$', re.U)                          # end of string, must match all, Unicode

    # match an existing t template so we can create entry:
    retmatch = re.compile(r'\{\{t[+-]?\|(?P<lc>[^|]*)\|(?P<flw>[^|}]*)'
        '(?:\|(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)|)'
        '(?:\|alt=(?P<alt>[^|}]*)|)(?:\|tr=(?P<tra>[^|}]*)|)(?:\|sc=(?P<scr>[^|}]*)|)'
        '(?:\|xs=[^|}]*|)}}')

    # merge in [one] following gender template, doesn't now put it in the canonical place [a problem]
    regmerge = re.compile(r'\{\{(t.?\|.*?)\}\}\s*\{\{([mfcnps])\}\}')

    # try he-translation, without "defective" options, with genders, must match all
    rehetrans = re.compile(r'\{\{he-translation\|(?P<flw>[^|}]*)\|(?P<tra>[^|}]*)'
        '(?:\|wv=(?P<alt>[^|}]*)|)}}'
        '(?:\s*\{\{(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)}}|)$')

    # pre-screen matches for other entries
    presect = re.compile(r'^\* ?\w*: ?\[\[.*#', re.M)   # section ref in trans?
    preflnk = re.compile(r'\[\[:[-a-z]{2,3}:')          # FL link in trans? might be elsewhere

    # now look for templates and t additions

    entries = 0
    probs = 0
    fixed = 0

    # start reader
    xt = threading.Thread(target=xread)
    xt.daemon = True   # kill silently on exit (:-)
    xt.name = 'read XML'
    xt.start()
    tlocal.repact = True   # main thread reports tsub details

    while True:

        entry = pageq.get()
        if not entry:
            # end of pass
            logpage.add("%d entries, %d checked, %d edits" % (entries, probs, fixed))
            logpage.write()
            entries = 0
            probs = 0
            fixed = 0
            # interruptable wait:
            while pageq.qsize() == 0: sleep(20)
            continue   # start next pass

        text = entry.text
        title = entry.title
        if title.find(':') >= 0: continue
        entries += 1

        if entries % 100 == 0:
            with plock:
                print "%d entries, %d probs, %d edits, %d on queue" % \
                      (entries, probs, fixed, pageq.qsize())

        act = ''

        # check and update:
        if True:   # [structure]

            with plock:
                print '%d: %s (%d)' % (probs, srep(title), pageq.qsize())

            # ... pick up current version from en.wikt
            try:
                page = wikipedia.Page(site, title)
                # text = page.get()
                text = getwikitext(page)
                origtext = text
            except wikipedia.NoPage:
                print "Can't get %s from en.wikt" % srep(page.aslink())
                text = ''
            except wikipedia.IsRedirectPage:
                print "Page %s is now a redirect" % srep(page.aslink())
                text = ''
            except KeyError:
                print "key error from the box"
                text = ''

            if not text: continue
            probs += 1
            nap += 1
            if nap > 70: nap = 70   # some outer bound

        else: continue

        # update cache time
        ckey = 'en:' + title
        cache[ckey] = time()   # assume we will complete update now ...
        cis += 1
        if cis % 20 == 0 or pageq.qsize() == 0: cache.sync()

        # now parse the current entry, see if we have a substitution
        newtext = retemp.sub(tsub, text)
        if newtext != text:
            act += 'updated t/t-/t+ templates, '
            text = newtext

        # simple hack since we will tag over and over ;-)
        if '{{rfc-tbot}}' in text:
            act += 'tagged problem, '   # (or was previously tagged ;-)
            text = text.replace('{{rfc-tbot}}{{rfc-tbot}}', '{{rfc-tbot}}')

        # look for templates we can add or entries to create

        added = False
        intrans = False
        pos = ''
        gloss = ''
        lines = [ ]
        for line in text.splitlines(): lines.append(line)

        for i in range(0, len(lines)):

            mo = rehead.match(lines[i])
            if mo:
                header = mo.group(1).strip()
                if header == "Translations":
                    intrans = True
                    gloss = ''
                else: intrans = False
                if header in POS: pos = header
                if header in Stops: pos = ''
                continue

            if lines[i].startswith(TTBC): intrans = False
            if not intrans: continue

            mo = regloss.match(lines[i])
            if mo:
                gloss = mo.group(1).strip()
                continue

            # wiki format + one space, so lines are consistent if we change any
            lines[i] = restack.sub(r'\1 ', lines[i])

            mo = retrans.match(lines[i])
            if mo:
                waslinked = False
            else:
                mo = retransw.match(lines[i])
                if mo: waslinked = True
            if not mo: continue

            # if other cases, continue for now (not looking at the rest of the line)
            if '{{ko-inline' in lines[i]: continue
            # if '{{he-translation' in lines[i]: continue

            lang = mo.group(1).strip()
            trest = mo.group(2)
            # print srep("t lang %s, rest %s" % (lang, trest))

            # known language, wikt exists?
            if lang not in Langtoc: continue
            lc = Langtoc[lang]
            if lc not in Exists: continue

            nomatch = False
            changed = False
            tnew = ''
            for ipart in trest.split(','):
                ipart = ipart.strip()
                if not ipart:
                    # consecutive commas, begins or ends with comma?
                    nomatch = True
                    continue

                # see if existing t template but no entry, so we can create it
                # this is the best case for create if someone (not tbot) added the template
                # also merge in following gender templates here
                if '{{t' in ipart:
                    ip2 = regmerge.sub(r'{{\1|\2}}', ipart)
                    while ip2 != ipart:
                        ipart = ip2
                        ip2 = regmerge.sub(r'{{\1|\2}}', ipart)
                        changed = True
                    mo = retmatch.search(ipart)
                    if mo:
                        mod = mo.groupdict()
                        if (mod['flw'] not in Lsects or lc not in Lsects[mod['flw']]) \
                                and pos and gloss and not nocreate:
                            createFLentry(mod['flw'], lang, lc, pos, title, gloss, mod)
                    tnew += ipart + ', '
                    continue

                # same for Hebrew template, but must match all of the part
                if '{{he-translation' in ipart:
                    mo = rehetrans.match(ipart)
                    if mo:
                        mod = mo.groupdict()
                        mod['scr'] = 'Hebr'
                        if (mod['flw'] not in Lsects or lc not in Lsects[mod['flw']]) \
                                and pos and gloss and not nocreate:
                            createFLentry(mod['flw'], lang, lc, pos, title, gloss, mod)
                    tnew += ipart + ', '
                    continue

                mo = None
                # specific cases
                for rex in Trex:
                    mo = rex.match(ipart)
                    if mo: break
                # general case
                if not mo:
                    mo = Grex.match(ipart)
                    if dmatch and mo: print "match general case: %s" % srep(ipart)

                if mo:
                    mod = mo.groupdict()

                    # format t template:
                    t = '{{t'
                    flw = mod['flw']   # always there, but can be nil if a section link

                    # if self-link with section
                    if not flw and 'sect' in mod and mod['sect']: flw = title
                    if not flw:
                        # can't happen? sect must exist and be non-nil to match
                        nomatch = True
                        continue

                    # try adding script, before the create FL step
                    if 'scr' not in mod or not mod['scr']:
                        scr = script(flw, lc)
                        if scr: mod['scr'] = scr

                    # check out transliteration
                    if 'tra' in mod and mod['tra']:
                        tra = mod['tra']
                        # close enough to be fairly certain it is the transliteration?
                        if fuzzy(tlit(flw), tlit(tra), len(tra)-2) < max(len(tra)-2, 4):
                            if dmatch:
                                print srep("no match trans: %s to %s" % (tlit(flw), tlit(tra)))
                            nomatch = True
                            continue   # no more with this one

                    # create if missing, or check iwikis, or no match
                    if flw != title and (flw not in Lsects or lc not in Lsects[flw]) and \
                            pos and gloss and not nocreate:
                        if createFLentry(flw, lang, lc, pos, title, gloss, mod):
                            t += '+'
                        else:
                            # don't convert yet if no confirm on match
                            if flw not in Iwikis: nomatch = True
                            elif lc in Iwikis[flw]: t += '+'
                            else: t += '-'
                    elif flw in Iwikis:
                        if lc in Iwikis[flw]: t += '+'
                        else: t += '-'
                    else: nomatch = True

                    t += '|' + lc + '|' + flw

                    # if explicit iwiki, check code:
                    if 'iwk' in mod and mod['iwk']:
                        if mod['iwk'] != lc:
                            print "explicit iwiki link does not match language %s" % srep(lang)
                            nomatch = True
                    if 'sect' in mod and mod['sect'] and mod['sect'] != lang:
                        print "section doesn't match language %s" % srep(lang)
                        logpage.add("[[%s]] section %s doesn't match language %s" % \
                                    (title, mod['sect'], lang))
                        nomatch = True

                    # in canonical order ...

                    # gender
                    if 'g' in mod and mod['g']: t += '|' + mod['g']
                    if 'g2' in mod and mod['g2']: t += '|' + mod['g2']
                    if 'g3' in mod and mod['g3']: t += '|' + mod['g3']

                    # alt link
                    if 'alt' in mod and mod['alt'] and mod['alt'] != flw:
                        alt = mod['alt']
                        t += '|alt=' + alt

                    # transliteration
                    if 'tra' in mod and mod['tra']: t += '|tr=' + tra

                    # script
                    if 'scr' in mod and mod['scr']:
                        scr = mod['scr']
                        if scriptp(scr):
                            if lc not in Xyzzy or scr != Xyzzy[lc]: t += '|sc=' + scr
                        else:
                            print srep("no match script: %s" % scr)
                            nomatch = True

                    # xs last
                    if lc not in Tlang: t += '|xs=' + lang

                    t += '}}'
                    tnew += t + ', '
                    changed = True

                else:
                    if dmatch: print "no match pattern: %s" % srep(ipart)
                    nomatch = True

            if changed and not nomatch:
                tnew = tnew.strip(', ')
                if not waslinked:
                    lines[i] = '* ' + lang + ': ' + tnew
                else:
                    lines[i] = '* [[' + lang + ']]: ' + tnew
                if 'added t for ' not in act: act += 'added t for '
                if ' ' + lc + ',' not in act: act += lc + ', '
                added = True
                with plock: print srep("%s to %s" % (trest, tnew))

            pass   # end of lines

        # reassemble, else use existing text
        if added: text = u'\n'.join(lines)

        # some change, write it
        if act:
            act = act.strip(', ')
            fixed += 1
            nap /= 2

            # use our own wait, not put throttle, so prescan can continue etc
            # also will be better when mwapi is doing puts
            sleep(70 - nap)

            with plock: print "Updating %s: %s" % (srep(title), srep(act))

            # try to fix the entry
            try:
                wikipedia.setAction(act)
                currtext = getedit(page)
                if currtext.strip('\n ') != origtext.strip('\n '):
                    with plock: print "page changed while updating templates, not saved"
                    continue
                with plock: page.put(text)
                # it is understood that taking plock here can stall the reader
            except wikipedia.EditConflict:
                print "Edit conflict?"
                continue
            except wikipedia.PageNotSaved:
                print "page not saved"
                continue
            except socket.timeout:
                print "socket timeout"
                continue
            except socket.error:
                print "socket error"
                continue
            except Exception, e:
                print "some exception writing page", repr(e)
                continue

            # limit number of fixes for testing
            # if fixed > 20: break

        if nap > 5:
            with plock: print "(sleeping %d seconds)" % nap
            sleep(nap)

    # [ notreached ]
    print "%d entries, %d possible updates, %d fixed" % (entries, probs, fixed)
    cache.close()

    # done

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print "(keyboard interrupt)"
    finally:
        logpage.write()
        wikipedia.stopme()
        cache.close()
        Iwikis.close()
        Lsects.close()