User:Tbot/code/tbot
#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Tbot/code/tbot

"""
This bot looks for t/t-/t+ template calls and does updates

Command line arguments (all optional):

    -skip:n     skip to update n
    -dmatch     report debug no-match conditions
    -nocreate   do not create new Tbot entries
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import socket
from script import script, scriptp
from random import random
from mwapi import getwikitext, getedit
import threading
import Queue
import shelve
from time import time, sleep

plock = threading.Lock()

def srep(s):
    return repr(u''+s)[2:-1]

# try having a bit of fun with threads, do XML reads in a thread that feeds main proc
# combined with the cache of sections and iwikis, allows fast start

# shelf that allows Unicode keys, just wrap around rather than sub-class, only need a few methods
# make thread safe as well

class myshelf():
    def __init__(s, fn):
        s.shelf = shelve.open(fn)
        s.lock = threading.Lock()
    def __contains__(s, k):
        with s.lock:
            r = (srep(k) in s.shelf)
        return r
    def __setitem__(s, k, v):
        # optimize no change case:
        with s.lock:
            sk = srep(k)
            if sk in s.shelf and s.shelf[sk] == v: return
            s.shelf[sk] = v
    def __getitem__(s, k):
        with s.lock:
            r = s.shelf[srep(k)]
        return r
    def sync(s):
        with s.lock:
            s.shelf.sync()
    def close(s):
        with s.lock:
            s.shelf.close()
        # del works when s.shelf deref'd

# work cache, record time last looked at local or FL entry
# each record is key: lc:word, pickled with srep(), value is integer time()

cache = myshelf("tbot-cache")

# persistent cache of sections and iwikis

Lsects = myshelf("tbot-lsects")
Iwikis = myshelf("tbot-iwikis")

from logger import Logger
logpage = Logger(wikipedia.getSite('en', 'wiktionary'), "User:Tbot/log")

from createflw import createFLentry

Lcodes = { }
Langtoc = { }

# set of titles seen and still on queue
from weakref import WeakValueDictionary
wtit = WeakValueDictionary()

tlocal = threading.local()
tlocal.repact = True

# thread routine to read XML and load section/iwiki cache, feed to main

pageq = Queue.Queue()  # with 5K loaded, process size ~70MB, soft limited below

retemp = re.compile(ur'\{\{t[\-\+\u00f8]?\|.*?}}')
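# Illustrative sketch (hypothetical words and entries, not taken from the wiki) of the
# template calls retemp matches and tsub, defined further down, rewrites:
#
#   {{t|sv|ord|n}}  ->  {{t+|sv|ord|n}}   [[ord]] carries an sv: interwiki, so the sv.wikt entry exists
#   {{t|sv|ord|n}}  ->  {{t-|sv|ord|n}}   no sv: interwiki on [[ord]], entry presumed missing on sv.wikt
#   {{t|xx|word}}   ->  {{tø|xx|word}}    no xx.wikt exists at all
#
# Parameters are also canonicalized into the order |alt= |tr= |sc= |xs= .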
def xread():

    tlocal.repact = False

    while True:

        re2head = re.compile(r'^==([^=]+?)==$', re.M)
        reiwiki = re.compile(r'^\[\[([-a-z]{2,10}):(.*?)\]\]$', re.M)

        # build table of section counts and iwikis:

        # get XML dump
        dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")

        ts = 0
        ti = 0
        entries = 0
        for entry in dump.parse():
            text = entry.text
            title = entry.title
            if title.find(':') >= 0: continue
            entries += 1

            if entries % 1000 == 0:
                with plock:
                    print "prescan %d entries, %d iwikis, %d sections, %d on queue" % \
                          (entries, ti, ts, pageq.qsize())
                Iwikis.sync()
                Lsects.sync()
                # pace a bit more, amusingly en.wikt is 1.44 million entries at this writing
                # nominally 4 hours
                if pageq.qsize() > 100: sleep(10)

            # for my system, temporarily (?) 10.1.10
            if entries % 10 == 0: sleep(1)   # take a long time, not go CPU bound for long

            # filter for those we want to check

            tag = False
            newtext = retemp.sub(tsub, text)
            if newtext != text:
                # temporarily (?), do only some with sc= elisions (may miss some other changes ;-)
                if newtext.count('sc=') < text.count('sc='):
                    if newtext.count('t+') != text.count('t+'): tag = True
                    if newtext.count('t-') != text.count('t-'): tag = True
                    if newtext.count('xs=') > text.count('xs='): tag = True
                    # some others caught by the seven percent rule etc following
                else:
                    tag = True

            # look at ones we are modifying anyway, plus try to capture more sometimes
            # if not in cache or expired, pick up sometime, scatter over ~14 days, 1/2 of cache time
            if '==English==' in text and '===Translations===' in text:
                if newtext.count('sc=') < text.count('sc='):
                    # try to avoid more edits for sc= elision only
                    if random() < 0.01: tag = True
                else:
                    if random() < 0.07: tag = True

            if tag and title not in wtit:
                ckey = 'en:' + title
                if ckey not in cache or cache[ckey] < time() - (35 * 24 * 3600):
                    wtit[title] = entry
                    pageq.put(entry)

            # soft limit queue size, e.g. 20K if 20 seconds to eat each entry
            # larger effect is to cause process/thread schedule event
            sleep(pageq.qsize()/1000.0)

            # add iwikis:
            iw = [ ]
            for mo in reiwiki.finditer(text):
                if mo.group(2) == title:
                    iw.append(mo.group(1))
                    ti += 1
            Iwikis[title] = iw

            # now, Lsects is an optimization to preclude calls to createflw, filter so it won't be
            # uselessly large, it is okay if we miss some (for now): [ try keeping all ]
            """
            if "form of" in text: continue   # SemperBlottoBot, others
            if " of|" in text: continue      # Keenebot2, others (Jyril: "{{fi-form of|")
            """

            ll = [ ]
            for lang in re2head.findall(text):
                lang = lang.strip(' []')
                if lang == "English": continue
                if lang not in Langtoc: continue
                lc = Langtoc[lang]
                if lc not in Exists: continue
                ll.append(lc)
                ts += 1
            if ll: Lsects[title] = ll

            pass   # end of prelim XML loop

        print "XML scan complete"
        Iwikis.sync()
        Lsects.sync()

        # put a pass marker on the queue, and then we continue
        pageq.put(None)

        # take a day off, and do it again
        # queue may very well last that long, and we may get new XML
        # sleep(86400)
        # don't sleep very long at once, makes interrupt difficult!
        # for i in range(24*60, 0, -1):

        # not one day yet, just some hours!
        hours = 14   # see what works well? shouldn't need to be much < 24?
        for i in range(hours*60, 0, -1):
            if not i%10:
                with plock: print "(sleeping, next scan in %d minutes)" % i
            sleep(60)

Exists = set()

Tlang = set(['ar', 'da', 'de', 'el', 'es', 'fi', 'fr', 'he', 'it', 'ja', 'ko',
             'nl', 'no', 'pt', 'ru', 'sv',
             'bg', 'bs', 'ca', 'cmn', 'cs', 'et', 'hr', 'hu', 'is', 'ku', 'la',
             'pl', 'ro', 'sr', 'sk', 'sl', 'te', 'th', 'tr', 'hy', 'mk'])

# |ar=Arab|fa=fa-Arab|ur=ur-Arab
# |xcl|hy=Armn|be|bg|mk|ru|uk=Cyrl|cu=Cyrs|sa|hi=Deva|got=Goth|el=Grek|grc=polytonic
# |he|yi|arc=Hebr|ja=Jpan|ko=Kore|ta=Taml|te=Telu|th=Thai

Xyzzy = dict(ar='Arab', fa='fa-Arab', ur='ur-Arab', xcl='Armn', hy='Armn',
             be='Cyrl', bg='Cyrl', mk='Cyrl', ru='Cyrl', uk='Cyrl', cu='Cyrs',
             sa='Deva', hi='Deva', got='Goth', el='Grek', grc='polytonic',
             he='Hebr', yi='Hebr', arc='Hebr', ja='Jpan', ko='Kore',
             ta='Taml', te='Telu', th='Thai')
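# Rough illustration of how Xyzzy is used by tsub below (hypothetical template calls,
# not from any real entry): a |sc= that merely restates the default script for the
# language code is elided, anything else is kept, e.g.
#   {{t|bg|дума|f|sc=Cyrl}}  ->  sc=Cyrl dropped, Cyrl being the Xyzzy default for bg
#   {{t|sr|реч|f|sc=Cyrl}}   ->  sc=Cyrl kept, sr having no Xyzzy default
# and codes not listed in Tlang get an |xs=<language name> hint appended.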
# translatable POS from English, other things may be very different POS ...

POS = set(['Noun', 'Verb', 'Adverb', 'Adjective', 'Pronoun', 'Proper noun',
           'Preposition', 'Conjunction', 'Interjection', 'Article' ])

# others we want to recognize and avoid, at least for now
Stops = set(['Prefix', 'Suffix', 'Affix', 'Infix', 'Counter', 'Initialism', 'Abbreviation',
             'Letter', 'Symbol', 'Acronym', 'Proverb', 'Contraction', 'Idiom', 'Phrase',
             'Syllable', 'Reflexive verb', 'Transitive verb', 'Intransitive verb' ])

TTBC = ('{{checktrans', '{{ttbc', '{{rfc-trans')

# substitute templates:

relcodeword = re.compile(r'\{\{t[\+\-]?\|([-a-z]*?)\|(.*?)[\|\}]')
rexsect = re.compile(r'\|xs=[^\|\}]*([\|\}])')
rescect = re.compile(r'\|sc=[^\|\}]*([\|\}])')
reflag = re.compile(r'\{\{t[^\|]?\|')
recode = re.compile(r'(\{\{t[^\|]?\|)([-a-z]*?)\|')
refindsc = re.compile(r'\|sc=([^\|\}]*)[\|\}]')

# canonicalize option order: (match if not at end, so we can move to end)
reorderalt = re.compile(r'\|alt=([^\|]*)\|(.*)}}')
reordertr = re.compile(r'\|tr=([^\|]*)\|(.*)}}')
reordersc = re.compile(r'\|sc=([^\|]*)\|(.*)}}')
reorderxs = re.compile(r'\|xs=([^\|]*)\|(.*)}}')

repipe = re.compile(r'\[\[[^\]]*\|')   # enough to catch a piped link [[...|

def tsub(tmo):

    tin = tmo.group(0)

    # look for some bad cases, to be tagged:
    # either a piped link or a subtemplate call will scramble parsing
    if repipe.search(tin) or '{{' in tin[2:]:
        tout = '{{rfc-tbot}}' + tin
        return tout

    tout = tin
    while tout.endswith('|}}'): tout = tout[:-3] + '}}'

    # canonical order, move each to end in turn
    if 'alt=' in tout: tout = reorderalt.sub(r'|\2|alt=\1}}', tout)
    if 'tr=' in tout: tout = reordertr.sub(r'|\2|tr=\1}}', tout)
    if 'sc=' in tout: tout = reordersc.sub(r'|\2|sc=\1}}', tout)
    if 'xs=' in tout: tout = reorderxs.sub(r'|\2|xs=\1}}', tout)

    mo = relcodeword.match(tout)
    if mo:
        code = mo.group(1)
        word = mo.group(2)
    else:
        code = ''
        word = ''

    # fix some codes, ISO /3 to /1, and some other things, like nb to no
    if code in Lcodes:
        nc = Langtoc[Lcodes[code]]
        if nc != code:
            tout = recode.sub(r'\1' + nc + '|', tout)
            if tlocal.repact:
                with plock: print "changed code %s to %s" % (code, nc)
            code = nc

    if 'sc=' not in tout and code and word and ord(word[0:1]) >= 0x0370:
        sc = script(word, code, report = tlocal.repact)
        if sc and (code not in Xyzzy or sc != Xyzzy[code]):
            tout = tout[:-2] + '|sc=' + sc + '}}'
            # move xs to end again if needed
            if 'xs=' in tout: tout = reorderxs.sub(r'|\2|xs=\1}}', tout)

    if 'sc=' in tout:
        mo = refindsc.search(tout)
        if mo: sc = mo.group(1)
        else: sc = ''
        # script parameter not needed?
        # (may be different from default, suppress if = to Xyzzy code)
        if code in Xyzzy and sc == Xyzzy[code]:
            tout = rescect.sub(r'\1', tout)
            if tlocal.repact:
                with plock: print "elided script %s for code %s" % (sc, code)

    if code and code in Lcodes:
        if code not in Tlang:
            xs = '|xs=' + Lcodes[code]
        else:
            xs = ''
        if tout.find('|xs=') >= 0:
            tout = rexsect.sub(xs + r'\1', tout)
        else:
            tout = tout[:-2] + xs + '}}'

    if code:
        if code in Exists:
            if tout.startswith(u'{{t\u00f8|'): tout = reflag.sub('{{t|', tout)
            if word in Iwikis:
                if code in Iwikis[word]: tout = reflag.sub('{{t+|', tout)
                if code not in Iwikis[word]: tout = reflag.sub('{{t-|', tout)
        else:
            tout = reflag.sub(u'{{t\u00f8|', tout)

    if tout != tin and tlocal.repact:
        print "%s, code %s, word %s: %s" % (srep(tin), srep(code), srep(word), srep(tout))

    return u'' + tout

# transliterate words, use to match transliterations, use our own dict which we can preload as needed

import transliteration
trac = { }
tlitob = transliteration.transliterator()   # work around gratuitous class defn

def tlit(s):
    n = u''
    for c in s:
        if c not in trac:
            # and work around nasties in the transliteration.trans routine
            try:
                trac[c] = u'' + tlitob.transliterate(c, default = c)
            except UnicodeDecodeError:
                trac[c] = c
        n += trac[c]
    n = n.lower()
    return n

# now have some serious recursion fun!
# fuzzy returns string match score
# r is min required, calls may have neg r, may return value < r

def fuzzy(a, b, r):
    if not a or len(a) < r: return 0
    if not b or len(b) < r: return 0
    if a == b: return len(a)
    if a[:1] == b[:1]: return 1 + fuzzy(a[1:], b[1:], r-1)
    if a[-1:] == b[-1:]: return 1 + fuzzy(a[:-1], b[:-1], r-1)

    # try with each char forward
    p = a.find(b[0:1])
    if p >= 0: sca = 1 + fuzzy(a[p+1:], b[1:], r-1)
    else: sca = 0
    p = b.find(a[0:1])
    if p >= 0: scb = 1 + fuzzy(b[p+1:], a[1:], r-1)
    else: scb = 0

    # no match either/or way, skip this char, one or both
    if not sca and not scb: sk = fuzzy(a[1:], b[1:], r)
    elif not sca: sk = fuzzy(a, b[1:], r)
    elif not scb: sk = fuzzy(a[1:], b, r)
    else: sk = 0

    return max(sk, sca, scb)

def main():

    socket.setdefaulttimeout(30)

    nap = 5   # +1,/2 adaptive naptime
    cis = 0

    skip = 0
    dmatch = False
    nocreate = False
    for arg in sys.argv[1:]:
        if arg.startswith('-skip:'):
            skip = int(arg[6:])
            print "will skip to update %d" % skip
        elif arg == '-dmatch':
            dmatch = True
            print "will report debug no match conditions"
        elif arg == '-nocreate':
            nocreate = True
            print "will not create new Tbot entries"
        else:
            print "unknown command line argument %s" % arg

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    meta = wikipedia.getSite(code = "meta", fam = "meta")

    # get language codes
    page = wikipedia.Page(site, "User:AutoFormat/Languages")
    langtab = getwikitext(page)

    relangtab = re.compile(r'\|(.*?)\|\|(.*)')
    i = 0
    for line in langtab.splitlines():
        mo = relangtab.match(line)
        if mo:
            for code in mo.group(1).split(','):
                Lcodes[code.strip()] = mo.group(2).strip()
                i += 1

    # invert, with some forces, make sure nb goes to no, fix the Chinese languages
    # things like ido to io are handled by the length
    Lcodes['ido'] = 'Ido'
    Lcodes['nb'] = 'Norwegian'
    Lcodes['nob'] = 'Norwegian'
    Langtoc['Norwegian'] = 'no'
    Langtoc['Mandarin'] = 'cmn'
    Langtoc['Min Nan'] = 'nan'
    Langtoc['Cantonese'] = 'yue'
    for code in Lcodes:
        if Lcodes[code] not in Langtoc:
            Langtoc[Lcodes[code]] = code
        elif Lcodes[code] in ['Mandarin', 'Min Nan', 'Cantonese']: continue
        elif len(code) < len(Langtoc[Lcodes[code]]):
            Langtoc[Lcodes[code]] = code

    print "found %d language codes" % i
    if i < 460: return
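    # By way of example (hypothetical values, the actual table comes from
    # User:AutoFormat/Languages): Lcodes maps codes to names, so Lcodes['fr'] and
    # Lcodes['fra'] might both be 'French', while Langtoc is the inversion preferring
    # the shortest code, giving Langtoc['French'] == 'fr'.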
    # get active wikt list
    # page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    # gratuitous move, 20.4.9:
    page = wikipedia.Page(meta, "Wiktionary/Table")
    existtab = getwikitext(page)

    # reextab = re.compile(r'^\[\[:([a-z-]+):')
    reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
    i = 0
    for line in existtab.splitlines():
        i += 1
        mo = reextab.match(line)
        if mo:
            # toki pona and klingon we don't want, bo and sc are borked, to is closed (21.5.8)
            if mo.group(1) in [ 'tokipona', 'tlh', 'bo', 'sc', 'to', 'sh' ]: continue
            Exists.add(mo.group(1))

    print "%d lines" % i
    print "found %d active wikts" % len(Exists)
    if len(Exists) < 160: return

    # add codes for Mandarin and Min Nan, cantonese doesn't exist yet
    Exists.add('cmn')
    Exists.add('nan')

    # headers, only need 3,4,5
    rehead = re.compile(r'^={3,5}(.+?)={3,5}$', re.M)
    regloss = re.compile(r'\{\{trans-top\|(.+?)}}')

    # trans line, start simple, without and with wikilinking, 1 is lang, 2 is rest
    retrans = re.compile(r'\*\s*([-\w ]+):(.*)')
    retransw = re.compile(r'\*\s*\[\[([-\w ]+)\]\]:(.*)')

    # wikitext format stackable at the start of a line
    restack = re.compile('^([#:\*]+)\s*')

    Tlist = [
        # first sets are optimizations, catch common cases, and uncollapsed gender/number for simple
        # match patterns for translations lines, simplest first, must match all
        r'^\[\[(?P<flw>[- \w]+)\]\]$',
        r'^\[\[(?P<flw>[- \w]+)\]\]\s*\{\{(?P<g>[mfcn])}}(?:\s*\{\{(?P<g2>[fcnps])}}|)$',
        # with transliteration
        r'^\[\[(?P<flw>[- \w]+)\]\]\s*\((?P<tra>[\w]+)\)$',
        r'^\[\[(?P<flw>[- \w]+)\]\]\s*\((?P<tra>[\w]+)\)\s*\{\{(?P<g>[mfcn])}}(?:\s*\{\{(?P<g2>[fcnps])}})$',
        # with sections
        r'^\[\[(?P<flw>[- \w]+)#(?P<sect>[- \w]+)\|(?P<alt>[- \w]+)\]\]$',
        r'^\[\[(?P<flw>[- \w]+)#(?P<sect>[- \w]+)\|(?P<alt>[- \w]+)\]\]\s*\{\{(?P<g>[mfcn])}}$',
        # with scripts
        r'^\{\{(?P<scr>[-\w]+)\|\[\[(?P<flw>[- \w]+)\]\]}}$',
        r'^\{\{(?P<scr>[-\w]+)\|\[\[(?P<flw>[- \w]+)\]\]}}\s*\((?P<tra>[\w]+)\)(:?\s*\{\{(?P<g>[mfcn])}})$',

        # following are cases not caught by general regular expression

        # cases of link to FL wikt
        r'^\[\[:(?P<iwk>[-a-z]+):(?P<flw>[^\|\]#\{}:]+)\|(?P<alt>[^\|\]#\{}:]]+)\]\]'
            '(?:\s*\{\{(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)}}|)$',

        # three cases for a single unlinked word (don't allow phrases, trans might be one of the words)
        r'^(?P<flw>[\w]+)$',
        r'^(?P<flw>[\w]+)\s*\{\{(?P<g>[mfcn])}}(?:\s*\{\{(?P<g2>[fcnps])}}|)$',
        r'^(?P<flw>[\w]+)$'
            '(?:\s*\{\{(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)}}|)$',

        # other oddities, bold word (self page ref), ''m pl'' and friends.
        r"^'''(?P<flw>[\w]+)'''$",
        r"^\[\[(?P<flw>[- \w]+)\]\]\s+''(?P<g>[mfcn])\s+(?P<g2>[p])l''$"
    ]
    Trex = [ ]
    for rel in Tlist: Trex.append(re.compile(rel, re.U))

    # then we will try general case (long regex ;-)
    Grex = re.compile(r'^'
        r'(?:\{\{(?P<scr>[-\w]+)\||)'        # script template, or nothing
        r'\[\[(?P<flw>[^\|\]#\{}:]*)'        # start of link, always, any chars not link/template syntax
        r'(?:#(?P<sect>[- \w]+)|)'           # possible section ref
        r'(?:\|(?P<alt>[^\|\]#\{}:]+)|)'     # possible alt, any chars not syntax
        r'\]\]'                              # end of link, always
        r'(?:}}|)'                           # end of script if present (not checked)
        r"(?:\s*\((?P<tra>[- '\w]+)\)|)"     # possible transliteration
        r'(?:\s*\{\{(?P<g>[mfcnps])'         # possible gender or number
        r'(?:\|(?P<g2>[fcnps])'              # within that, possible second gender/number
        r'(?:\|(?P<g3>[cnps])|)'             # within that, possible third gender/number or nothing
        r'|)}}|)'                            # or not second gender, or none at all
        r'$', re.U)                          # end of string, must match all, Unicode

    # match an existing t template so we can create entry:
    retmatch = re.compile(r'\{\{t[+-]?\|(?P<lc>[^|]*)\|(?P<flw>[^|}]*)'
        '(?:\|(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)|)'
        '(?:\|alt=(?P<alt>[^|}]*)|)(?:\|tr=(?P<tra>[^|}]*)|)(?:\|sc=(?P<scr>[^|}]*)|)'
        '(?:\|xs=[^|}]*|)}}')

    # merge in [one] following gender template, doesn't now put it in the canonical place [a problem]
    regmerge = re.compile(r'\{\{(t.?\|.*?)\}\}\s*\{\{([mfcnps])\}\}')

    # try he-translation, without "defective" options, with genders, must match all
    rehetrans = re.compile(r'\{\{he-translation\|(?P<flw>[^|}]*)\|(?P<tra>[^|}]*)'
        '(?:\|wv=(?P<alt>[^|}]*)|)}}'
        '(?:\s*\{\{(?P<g>[mfcnps])(?:\|(?P<g2>[fcnps])(?:\|(?P<g3>[cnps])|)|)}}|)$')

    # pre-screen matches for other entries
    presect = re.compile(r'^\* ?\w*: ?\[\[.*#', re.M)   # section ref in trans?
    preflnk = re.compile(r'\[\[:[-a-z]{2,3}:')          # FL link in trans? might be elsewhere

    # now look for templates and t additions

    entries = 0
    probs = 0
    fixed = 0

    # start reader
    xt = threading.Thread(target=xread)
    xt.daemon = True   # kill silently on exit (:-)
    xt.name = 'read XML'
    xt.start()
    tlocal.repact = True   # main thread reports tsub details

    while True:

        entry = pageq.get()
        if not entry:
            # end of pass
            logpage.add("%d entries, %d checked, %d edits" % (entries, probs, fixed))
            logpage.write()
            entries = 0
            probs = 0
            fixed = 0
            # interruptable wait:
            while pageq.qsize() == 0: sleep(20)
            continue   # start next pass

        text = entry.text
        title = entry.title
        if title.find(':') >= 0: continue
        entries += 1

        if entries % 100 == 0:
            with plock:
                print "%d entries, %d probs, %d edits, %d on queue" % \
                      (entries, probs, fixed, pageq.qsize())

        act = ''

        # check and update:
        if True:   # [structure]

            with plock:
                print '%d: %s (%d)' % (probs, srep(title), pageq.qsize())

            # ... pick up current version from en.wikt
            try:
                page = wikipedia.Page(site, title)
                # text = page.get()
                text = getwikitext(page)
                origtext = text
            except wikipedia.NoPage:
                print "Can't get %s from en.wikt" % srep(page.aslink())
                text = ''
            except wikipedia.IsRedirectPage:
                print "Page %s is now a redirect" % srep(page.aslink())
                text = ''
            except KeyError:
                print "key error from the box"
                text = ''

            if not text: continue
            probs += 1
            nap += 1
            if nap > 70: nap = 70   # some outer bound

        else: continue

        # update cache time
        ckey = 'en:' + title
        cache[ckey] = time()   # assume we will complete update now ...
        cis += 1
        if cis % 20 == 0 or pageq.qsize() == 0: cache.sync()

        # now parse the current entry, see if we have a substitution
        newtext = retemp.sub(tsub, text)
        if newtext != text:
            act += 'updated t/t-/t+ templates, '
            text = newtext

        # simple hack since we will tag over and over ;-)
        if '{{rfc-tbot}}' in text:
            act += 'tagged problem, '   # (or was previously tagged ;-)
            text = text.replace('{{rfc-tbot}}{{rfc-tbot}}', '{{rfc-tbot}}')

        # look for templates we can add or entries to create

        added = False
        intrans = False
        pos = ''
        gloss = ''
        lines = [ ]
        for line in text.splitlines(): lines.append(line)

        for i in range(0, len(lines)):

            mo = rehead.match(lines[i])
            if mo:
                header = mo.group(1).strip()
                if header == "Translations":
                    intrans = True
                    gloss = ''
                else: intrans = False
                if header in POS: pos = header
                if header in Stops: pos = ''
                continue

            if lines[i].startswith(TTBC): intrans = False
            if not intrans: continue

            mo = regloss.match(lines[i])
            if mo:
                gloss = mo.group(1).strip()
                continue

            # wiki format + one space, so lines are consistent if we change any
            lines[i] = restack.sub(r'\1 ', lines[i])

            mo = retrans.match(lines[i])
            if mo:
                waslinked = False
            else:
                mo = retransw.match(lines[i])
                if mo: waslinked = True
            if not mo: continue

            # if other cases, continue for now (not looking at the rest of the line)
            if '{{ko-inline' in lines[i]: continue
            # if '{{he-translation' in lines[i]: continue

            lang = mo.group(1).strip()
            trest = mo.group(2)
            # print srep("t lang %s, rest %s" % (lang, trest))

            # known language, wikt exists?
            if lang not in Langtoc: continue
            lc = Langtoc[lang]
            if lc not in Exists: continue

            nomatch = False
            changed = False
            tnew = ''
            for ipart in trest.split(','):
                ipart = ipart.strip()
                if not ipart:
                    # consecutive commas, begins or ends with comma?
                    nomatch = True
                    continue

                # see if existing t template but no entry, so we can create it
                # this is the best case for create if someone (not tbot) added the template
                # also merge in following gender templates here
                if '{{t' in ipart:
                    ip2 = regmerge.sub(r'{{\1|\2}}', ipart)
                    while ip2 != ipart:
                        ipart = ip2
                        ip2 = regmerge.sub(r'{{\1|\2}}', ipart)
                        changed = True
                    mo = retmatch.search(ipart)
                    if mo:
                        mod = mo.groupdict()
                        if (mod['flw'] not in Lsects or lc not in Lsects[mod['flw']]) \
                                and pos and gloss and not nocreate:
                            createFLentry(mod['flw'], lang, lc, pos, title, gloss, mod)
                    tnew += ipart + ', '
                    continue

                # same for Hebrew template, but must match all of the part
                if '{{he-translation' in ipart:
                    mo = rehetrans.match(ipart)
                    if mo:
                        mod = mo.groupdict()
                        mod['scr'] = 'Hebr'
                        if (mod['flw'] not in Lsects or lc not in Lsects[mod['flw']]) \
                                and pos and gloss and not nocreate:
                            createFLentry(mod['flw'], lang, lc, pos, title, gloss, mod)
                    tnew += ipart + ', '
                    continue

                mo = None
                # specific cases
                for rex in Trex:
                    mo = rex.match(ipart)
                    if mo: break
                # general case
                if not mo:
                    mo = Grex.match(ipart)
                    if dmatch and mo: print "match general case: %s" % srep(ipart)

                if mo:
                    mod = mo.groupdict()

                    # format t template:
                    t = '{{t'
                    flw = mod['flw']   # always there, but can be nil if a section link

                    # if self-link with section
                    if not flw and 'sect' in mod and mod['sect']: flw = title
                    if not flw:
                        # can't happen? sect must exist and be non-nil to match
                        nomatch = True
                        continue

                    # try adding script, before the create FL step
                    if 'scr' not in mod or not mod['scr']:
                        scr = script(flw, lc)
                        if scr: mod['scr'] = scr

                    # check out transliteration
                    if 'tra' in mod and mod['tra']:
                        tra = mod['tra']
                        # close enough to be fairly certain it is the transliteration?
                        if fuzzy(tlit(flw), tlit(tra), len(tra)-2) < max(len(tra)-2, 4):
                            if dmatch:
                                print srep("no match trans: %s to %s" % (tlit(flw), tlit(tra)))
                            nomatch = True
                            continue   # no more with this one

                    # create if missing, or check iwikis, or no match
                    if flw != title and (flw not in Lsects or lc not in Lsects[flw]) and \
                            pos and gloss and not nocreate:
                        if createFLentry(flw, lang, lc, pos, title, gloss, mod):
                            t += '+'
                        else:
                            # don't convert yet if no confirm on match
                            if flw not in Iwikis: nomatch = True
                            elif lc in Iwikis[flw]: t += '+'
                            else: t += '-'
                    elif flw in Iwikis:
                        if lc in Iwikis[flw]: t += '+'
                        else: t += '-'
                    else: nomatch = True

                    t += '|' + lc + '|' + flw

                    # if explicit iwiki, check code:
                    if 'iwk' in mod and mod['iwk']:
                        if mod['iwk'] != lc:
                            print "explicit iwiki link does not match language %s" % srep(lang)
                            nomatch = True
                    if 'sect' in mod and mod['sect'] and mod['sect'] != lang:
                        print "section doesn't match language %s" % srep(lang)
                        logpage.add("[[%s]] section %s doesn't match language %s" % \
                                    (title, mod['sect'], lang))
                        nomatch = True

                    # in canonical order ...

                    # gender
                    if 'g' in mod and mod['g']: t += '|' + mod['g']
                    if 'g2' in mod and mod['g2']: t += '|' + mod['g2']
                    if 'g3' in mod and mod['g3']: t += '|' + mod['g3']

                    # alt link
                    if 'alt' in mod and mod['alt'] and mod['alt'] != flw:
                        alt = mod['alt']
                        t += '|alt=' + alt

                    # transliteration
                    if 'tra' in mod and mod['tra']: t += '|tr=' + tra

                    # script
                    if 'scr' in mod and mod['scr']:
                        scr = mod['scr']
                        if scriptp(scr):
                            if lc not in Xyzzy or scr != Xyzzy[lc]: t += '|sc=' + scr
                        else:
                            print srep("no match script: %s" % scr)
                            nomatch = True

                    # xs last
                    if lc not in Tlang: t += '|xs=' + lang

                    t += '}}'
                    tnew += t + ', '
                    changed = True

                else:
                    if dmatch: print "no match pattern: %s" % srep(ipart)
                    nomatch = True

            if changed and not nomatch:
                tnew = tnew.strip(', ')
                if not waslinked:
                    lines[i] = '* ' + lang + ': ' + tnew
                else:
                    lines[i] = '* [[' + lang + ']]: ' + tnew
                if 'added t for ' not in act: act += 'added t for '
                if ' ' + lc + ',' not in act: act += lc + ', '
                added = True
                with plock: print srep("%s to %s" % (trest, tnew))

            pass   # end of lines

        # reassemble, else use existing text
        if added: text = u'\n'.join(lines)

        # some change, write it
        if act:
            act = act.strip(', ')
            fixed += 1
            nap /= 2

            # use our own wait, not put throttle, so prescan can continue etc
            # also will be better when mwapi is doing puts
            sleep(70 - nap)

            with plock: print "Updating %s: %s" % (srep(title), srep(act))

            # try to fix the entry
            try:
                wikipedia.setAction(act)
                currtext = getedit(page)
                if currtext.strip('\n ') != origtext.strip('\n '):
                    with plock: print "page changed while updating templates, not saved"
                    continue
                with plock: page.put(text)
                # it is understood that taking plock here can stall the reader
            except wikipedia.EditConflict:
                print "Edit conflict?"
                continue
            except wikipedia.PageNotSaved:
                print "page not saved"
                continue
            except socket.timeout:
                print "socket timeout"
                continue
            except socket.error:
                print "socket error"
                continue
            except Exception, e:
                print "some exception writing page", repr(e)
                continue

            # limit number of fixes for testing
            # if fixed > 20: break

        if nap > 5:
            with plock: print "(sleeping %d seconds)" % nap
            sleep(nap)

    # [ notreached ]
    print "%d entries, %d possible updates, %d fixed" % (entries, probs, fixed)
    cache.close()

    # done

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print "(keyboard interrupt)"
    finally:
        logpage.write()
        wikipedia.stopme()
        cache.close()
        Iwikis.close()
        Lsects.close()