User:Interwicket/code/iwiktrc
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot updates iwiki (interwiki) links between Wiktionaries.
22.1.9: try reading RC from various wikts and adding to en.wikt (just for fun)
24.1.9: try hunting down iwikis for new en.wikt entries
26.1.9: try adding reciprocals; can then use this in full run?
"""
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import randrange
from mwapi import getwikitext, getedit
from reciprocal import addrci, replink, plock
# borrow global:
from config import usernames
def safe(s):
return pickle.dumps(s)[1:-5]
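# illustrative note (added, not in the original): pickle protocol 0 renders a
# unicode string roughly as "V<raw-unicode-escaped text>\np0\n.", so the [1:-5]
# slice strips the leading "V" and the trailing "\np0\n." framing, leaving an
# escape-encoded form that can be printed on consoles without unicode support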
# Iwiki cache:
# not used quite yet:
"""
import shelve
Iwikis = None
def iwopen(home):
global Iwikis
Iwikis = shelve.open(home + "-iwiki-cache")
cis = 0
def iwadd(title, iws, upd = True):
global cis
if safe(title) in Iwikis and not upd: return
if not iws or not len(iws): return
# with plock: print "iwikis cache %s: %s" % (safe(title), safe(u' '.join(iws)))
Iwikis[safe(title)] = iws
cis += 1
if cis % 100 == 0: Iwikis.sync() # sync every hundred additions
return
"""
Lcode = { }
Exists = set()
Active = set()
site = { }
naps = { }
def now(): return int(time.clock())
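# note (added): time.clock() is wall-clock time on Windows but CPU time on
# Unix; the scheduling below assumes the wall-clock behaviour, so on Unix
# time.time() would presumably be the right call here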
# yield title and language code of FL (foreign-language) wikt for recent new entries in the other wikts
def recent(home = 'en'):
# set up list of wikt codes to look at
qtime = { }
maxnap = 350 * 60 # almost 6 hours
for lc in Exists:
# if lc == home: continue
site[lc] = wikipedia.getSite(lc, "wiktionary")
qtime[lc] = now()
naps[lc] = 60 * randrange(20, 71) # scatter 20 to 70 minutes
if lc == home: naps[lc] = 300 # five min for home wikt
# entries seen already (just let this grow?)
seen = set()
ny = 0
rcex = re.compile(r'title="(.+?)"')
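# the recentchanges API reply is XML with one element per change, shaped roughly
# like <rc type="new" ns="0" title="foo" ... /> (illustrative; attribute order
# may differ); rcex just scrapes out the title="..." attributes rather than
# parsing the XML properly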
while True:
# sleep until next one
nextq = now() + 1000000
nextlc = ''
for lc in qtime:
if qtime[lc] < nextq:
nextq = qtime[lc]
nextlc = lc
st = nextq - now()
if st > 90:
with plock: print "(%d, sleeping %d minutes, %s next)" % (now(), (st+29)/60, nextlc)
if st > 0:
time.sleep(st)
if st < -120:
with plock: print "(%d minutes behind)" % (-(st-29)/60)
lc = nextlc
# read recentchanges, new entries, namespace 0, from site:
if True: # [indent]
with plock: print "(%d, reading from %s.wikt)" % (now(), lc)
# set parameters
# one hour ago back to one day ago
rcend = '&rcend=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 86400))
rcstart = '&rcstart=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 3600))
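# note (added): the API lists changes newest-first, from rcstart back to rcend,
# so rcstart is the newer bound (an hour ago) and rcend the older (a day ago)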
if lc == home:
rcshow = "&rcshow=patrolled|!bot" # avoid junk, large numbers of bot forms
sysop = True # need the patrol right on the login used
else:
rcshow = ''
sysop = False
rclimit = "&rclimit=%d" % min(1 + ny/10, 200)
# with plock: print "(options " + rcend + rcshow + rclimit + ")"
try:
rct = site[lc].getUrl("/w/api.php?action=query&list=recentchanges&format=xml&rcprop=title" +
"&rctype=new&rcnamespace=0" + rcend + rcstart + rcshow + rclimit, sysop = sysop)
except wikipedia.NoPage:
with plock: print "can't get recentchanges from %s.wikt" % lc
# rct = ''
# time.sleep(30)
qtime[lc] = now() + 700 # do other things for a bit
continue
except KeyError:
# local bogosity
with plock: print "keyerror"
time.sleep(20)
continue
if '<recentchanges />' in rct:
# no changes in recent history
pass
elif '</recentchanges>' not in rct:
with plock: print "some bad return from recentchanges, end tag not found"
with plock: print safe(rct)
# rct = ''
time.sleep(30)
qtime[lc] = now() + 300 # do other things for a bit
continue
found = False
for title in rcex.findall(rct):
if ':' in title: continue # other stray stuff in NS:0
if lc + ':' + title not in seen:
seen.add(lc + ':' + title)
yield title, lc
ny += 1
found = True
if found:
naps[lc] /= 2
# naps[lc] = max(naps[lc], 30) # thirty seconds
Active.add(lc)
else:
mn = naps[lc]/300 # one-fifth, in minutes
naps[lc] += 60 * randrange(5, 11 + mn) # five to ten minutes, or longer when we find nothing
naps[lc] = min(naps[lc], maxnap)
if naps[lc] > maxnap/2: Active.discard(lc)
qtime[lc] = now() + naps[lc]
if naps[lc] > 90:
with plock: print "(naptime for %s is %d minutes)" % (lc, (naps[lc]+29)/60)
else:
with plock: print "(naptime for %s is %d seconds)" % (lc, naps[lc])
# wiki-hunt ... see if a word is in other wikts, return list ...
# challenge here is not to take a huge amount of time, but get as many as possible
# also used by iwiktll, be careful with changes!
# note that the lcs argument can be one code or a list
re2head = re.compile(r'^==([^=]*)==$', re.M)
def hunt(word, text, lc, lcs = '', home = 'en'):
with plock: print " ... hunting iwikis"
totry = set()
done = set()
fps = set()
links = { }
redirs = { }
reiw = re.compile(r'\[\[([a-z-]{2,11}):' + re.escape(word) + r'\]\]')
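# reiw matches existing iwiki links for exactly this word, e.g. [[fr:casa]];
# the {2,11} character class allows hyphenated codes like zh-min-nan
# ("casa" here is just an illustrative title, not from the source)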
# (could just try every active wikt: for lc in Active: totry.add(lc))
# instead, magic occurs:
if lc == home:
# try hunting 10 most active wikts (11 because home will usually be in this list)
totry = set( sorted(Active, key=lambda c: naps[c])[:11] )
else:
# if we found an FL title, start with that
totry.add(lc)
# other codes known to caller
for lc in lcs: totry.add(lc)
# language header(s) in entry are good candidates (of course!)
# [code specific to English wikt ...]
for lang in re2head.findall(text):
if lang in Lcode: totry.add(Lcode[lang])
# simple scan for existing iwikis
for lc in reiw.findall(text):
lc = str(lc)
if lc in site:
totry.add(lc)
# not home:
totry.discard(home)
done.add(home)
exceptions = False
while totry:
lc = totry.pop()
try:
fpage = wikipedia.Page(site[lc], word)
text = getwikitext(fpage)
with plock: print " found in", lc
except wikipedia.NoPage:
with plock: print " not in", lc
done.add(lc)
continue
except wikipedia.IsRedirectPage:
redirs[lc] = fpage
with plock: print " found in", lc, "(redirect)"
except Exception, e:
exceptions = True
with plock: print "exception testing existence of word", str(e)
done.add(lc)
continue
done.add(lc)
links[lc] = fpage
# add to the set getting reciprocal-link/complete-set updates; we don't (can't :-) update redirects
if lc not in redirs: fps.add(fpage)
# look for iwikis in the page, add to to-be-tried if not already done
for lc in reiw.findall(text):
lc = str(lc) # not in unicode
if lc not in site: continue # (!) else who knows what junk ...
if lc not in done and lc not in totry:
with plock: print " found further iwiki", lc
totry.add(lc)
# all done, now add reciprocals
# don't remove anything if there were exceptions because hunt may be incomplete
# if no exceptions, hunt is complete for these entries (there may be others not seen,
# but then they aren't linked, as we've looked at all links ...), so remove any
# links not found:
for fpage in fps:
addrci(fpage, site[home], links=links, redirs=redirs, remove=not exceptions)
# return list of all links and redirects, and flag if complete
return links, redirs, not exceptions
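# hypothetical usage (the word and codes are illustrative, not from the source):
#   links, redirs, complete = hunt(u'casa', text, 'es', lcs = ['fr'])
# links maps code -> Page for each wikt where the word exists (redirects appear
# in both maps); complete is False if any lookup raised, in which case callers
# should not remove existing iwikis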
def main():
socket.setdefaulttimeout(40)
home = 'en'
xml = True
# testing rc:
xml = False
""" just keep argv code for now
for arg in sys.argv[1:]:
if arg.startswith('-start:'):
start = arg[7:]
with plock: print "starting at %s" % start
elif arg.startswith('-stop:'):
stop = arg[6:]
with plock: print "stopping at %s" % stop
elif arg.startswith('-new'):
newonly = True
with plock: print "new entries only"
elif arg.startswith('-sort'):
sort = True
with plock: print "do edits for sort"
elif arg.startswith('-xml'):
xml = True
with plock: print "read XML file"
elif arg.startswith('-update'):
update = True
with plock: print "update cache from XML (XML is current!)"
else:
    with plock: print "unknown command line argument %s" % arg
"""
mysite = wikipedia.getSite(home, 'wiktionary')
# make sure we are logged in
mysite.forceLogin()
meta = wikipedia.getSite(code = "meta", fam = "meta")
# get active wikt list
# minus crap. Tokipona? what are they thinking? Klingon? ;-)
Lstops = ['tokipona', 'tlh']
page = wikipedia.Page(meta, "List of Wiktionaries/Table")
existtab = page.get()
""" entry looks like:
| [[w:Vietnamese language|Vietnamese]]
| [[w:Vietnamese language|Tiếng Việt]]
| [http://vi.wiktionary.org/wiki/ vi]
"""
# reextab = re.compile(r'^\[\[:([a-z-]+):')
# reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
r'^\| .*\n'
r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
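# for the sample row above, group(1) captures the English language name
# ("Vietnamese") from the first cell and group(2) the subdomain code ("vi")
# from the third; the second (native-name) cell is skipped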
for mo in reextab.finditer(existtab):
if mo.group(2) in Lstops: continue
Exists.add(mo.group(2))
Lcode[mo.group(1)] = mo.group(2)
# see if we have a login in user config, else pretend we do
# has to be done before any call, or login status gets confused!
if mo.group(2) not in usernames['wiktionary']:
usernames['wiktionary'][mo.group(2)] = "Interwicket"
with plock: print "found %d active wikts" % len(Exists)
if len(Exists) < 150: return
# naps ... ;-)
naptime = 0
maxnap = 70
# Iwikis cache [not updated for now]
# iwopen(home)
# build table of existing entries from xml
# note we assume since we are doing RC new entries that the iwiki will be new,
# what we want here is just an index to entries, so we don't have to do lots of en.wikt lookups
enwikt = set()
if xml:
# get XML dump
dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")
ti = 0
entries = 0
reds = 0
iws = { } # in memory cache
for entry in dump.parse():
text = entry.text
title = entry.title
if ':' in title: continue
# if title < start or (stop and title > stop): continue
if text.startswith('#'): continue
entries += 1
if entries % 20000 == 0:
with plock: print "prescan %d entries" % entries
enwikt.add(title)
# test:
# if entries > 100000: break
continue
with plock: print "total %d entries" % entries
# now look for iwikis needed
entries = 0
probs = 0
fixed = 0
news = 0
cbase = now() - 86400
rate = 0.0
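# note (added): starting cbase a day in the past damps the early estimates;
# e.g. 100 entries seen in the first hour gives 100*3600/90000 = 4.0/hour
# rather than 100/hour, converging to the true rate as uptime grows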
for title, lc in recent():
if ':' in title: continue # redundant, but eh?
# canon title
page = wikipedia.Page(mysite, title)
title = page.title()
# temp:
# if lc == 'en' and title.startswith('Da'): continue
if title.lower() == 'main page': continue
news += 1
rate = news*3600.0/(now()-cbase)
if news % 100 == 0:
with plock: print "(observed creation rate %.4f/hour)" % rate
with plock: print "%s:%s" % (safe(lc), safe(title))
# if looking at home wikt is enabled above, just add things (;-)
"""
if lc == home:
with plock: print " ... added to en.wikt"
enwikt.add(title)
continue
"""
if lc == home: tag = True
# if we are using xml, check the index; else just always look at the entry
if lc != home and xml and title not in enwikt:
with plock: print " ... %s not in en.wikt" % safe(title)
continue
# [look at cache, but unlikely, as this is new]
tag = True
# now see if it is something that should be tagged/replaced:
if tag:
probs += 1
naptime += 1
# ... pick up current version from en.wikt
# with plock: print '%s is possible update, getting current entry' % safe(title)
try:
# text = page.get()
text = getwikitext(page)
oldtext = text
except wikipedia.NoPage:
with plock: print " ... %s not in en.wikt" % safe(page.title())
text = ''
except wikipedia.IsRedirectPage:
with plock: print " ... redirect page"
text = ''
except KeyError:
# annoying local error, from crappy framework code
with plock: print "KeyError"
time.sleep(200)
continue
if not text: continue
if lc != home and '[[' + lc + ':' + title + ']]' in text:
with plock: print " ... iwiki %s already in %s" % (safe(lc), safe(title))
continue
# go hunt down some iwikis, add reciprocals when needed
links, redirs, complete = hunt(title, text, lc)
# then update this one (also queued to other thread):
replink(page = page, links = links, redirs = redirs, remove = complete)
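# note (assumption; replink is defined in the reciprocal module): it applies
# the collected links/redirects to the en.wikt entry, and remove=complete
# ensures existing iwikis are only deleted when the hunt finished cleanly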
# limit number of fixes for testing
# if fixed > 7: break
# pace [not used in the same way, reconsider]
if naptime > maxnap: naptime = maxnap
"""
if naptime > 4:
with plock: print "sleeping %d seconds" % naptime
time.sleep(naptime)
"""
continue
# [notreached]
# with plock: print "%d entries, %d possible, %d updated" % (entries, probs, fixed)
# done
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()