User:Interwicket/code/iwiktrc
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot updates iwiki (interwiki) links between Wiktionaries.
22.1.9: try reading RC from various wikts and adding to en.wikt (just for fun)
24.1.9: try hunting down iwikis for new en.wikt entries
26.1.9: try adding reciprocals; can then use this in full run?
"""
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import randrange
from mwapi import getwikitext, getedit
from reciprocal import addrci, replink, plock
# borrow global:
from config import usernames
def safe(s):
return pickle.dumps(s)[1:-5]
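# illustrative note (added, not in the original): pickle protocol 0 renders a
# unicode string roughly as "V<raw-unicode-escaped text>\np0\n.", so the [1:-5]
# slice strips the leading "V" and the trailing "\np0\n." framing, leaving an
# escape-encoded form that can be printed on consoles without unicode support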
# Iwiki cache:
# not used quite yet:
"""
import shelve
Iwikis = None
def iwopen(home):
global Iwikis
Iwikis = shelve.open(home + "-iwiki-cache")
cis = 0
def iwadd(title, iws, upd = True):
global cis
if safe(title) in Iwikis and not upd: return
if not iws or not len(iws): return
# with plock: print "iwikis cache %s: %s" % (safe(title), safe(u' '.join(iws)))
Iwikis[safe(title)] = iws
cis += 1
if cis % 100 == 0: Iwikis.sync() # sync every hundred additions
return
"""
Lcode = { }
Exists = set()
Active = set()
site = { }
naps = { }
def now(): return int(time.clock())
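# note (added): time.clock() is wall-clock time on Windows but CPU time on
# Unix; the scheduling below assumes the wall-clock behaviour, so on Unix
# time.time() would presumably be the right call here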
# yield title and language code of FL (foreign-language) wikt for recent new entries in the other wikts
def recent(home = 'en'):
# set up list of wikt codes to look at
qtime = { }
maxnap = 350 * 60 # almost 6 hours
for lc in Exists:
# if lc == home: continue
site[lc] = wikipedia.getSite(lc, "wiktionary")
qtime[lc] = now()
naps[lc] = 60 * randrange(20, 71) # scatter 20 to 70 minutes
if lc == home: naps[lc] = 300 # five min for home wikt
# entries seen already (just let this grow?)
seen = set()
ny = 0
rcex = re.compile(r'title="(.+?)"')
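# the recentchanges API reply is XML with one element per change, shaped roughly
# like <rc type="new" ns="0" title="foo" ... /> (illustrative; attribute order
# may differ); rcex just scrapes out the title="..." attributes rather than
# parsing the XML properly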
while True:
# sleep until next one
nextq = now() + 1000000
nextlc = ''
for lc in qtime:
if qtime[lc] < nextq:
nextq = qtime[lc]
nextlc = lc
st = nextq - now()
if st > 90:
with plock: print "(%d, sleeping %d minutes, %s next)" % (now(), (st+29)/60, nextlc)
if st > 0:
time.sleep(st)
if st < -120:
with plock: print "(%d minutes behind)" % (-(st-29)/60)
lc = nextlc
# read recentchanges, new entries, namespace 0, from site:
if True: # [indent]
with plock: print "(%d, reading from %s.wikt)" % (now(), lc)
# set parameters
# one hour ago back to one day ago
rcend = '&rcend=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 86400))
rcstart = '&rcstart=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 3600))
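# note (added): the API lists changes newest-first, from rcstart back to rcend,
# so rcstart is the newer bound (an hour ago) and rcend the older (a day ago)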
if lc == home:
rcshow = "&rcshow=patrolled|!bot" # avoid junk, large numbers of bot forms
sysop = True # need the patrol right on the login used
else:
rcshow = ''
sysop = False
rclimit = "&rclimit=%d" % min(1 + ny/10, 200)
# with plock: print "(options " + rcend + rcshow + rclimit + ")"
try:
rct = site[lc].getUrl("/w/api.php?action=query&list=recentchanges&format=xml&rcprop=title" +
"&rctype=new&rcnamespace=0" + rcend + rcstart + rcshow + rclimit, sysop = sysop)
except wikipedia.NoPage:
with plock: print "can't get recentchanges from %s.wikt" % lc
# rct = ''
# time.sleep(30)
qtime[lc] = now() + 700 # do other things for a bit
continue
except KeyError:
# local bogosity
with plock: print "keyerror"
time.sleep(20)
continue
if '<recentchanges />' in rct:
# no changes in recent history
pass
elif '</recentchanges>' not in rct:
with plock: print "some bad return from recentchanges, end tag not found"
with plock: print safe(rct)
# rct = ''
time.sleep(30)
qtime[lc] = now() + 300 # do other things for a bit
continue
found = False
for title in rcex.findall(rct):
if ':' in title: continue # other stray stuff in NS:0
if lc + ':' + title not in seen:
seen.add(lc + ':' + title)
yield title, lc
ny += 1
found = True
if found:
naps[lc] /= 2
# naps[lc] = max(naps[lc], 30) # thirty seconds
Active.add(lc)
else:
mn = naps[lc]/300 # one-fifth, in minutes
naps[lc] += 60 * randrange(5, 11 + mn) # five to ten minutes, or longer when we find nothing
naps[lc] = min(naps[lc], maxnap)
if naps[lc] > maxnap/2: Active.discard(lc)
qtime[lc] = now() + naps[lc]
if naps[lc] > 90:
with plock: print "(naptime for %s is %d minutes)" % (lc, (naps[lc]+29)/60)
else:
with plock: print "(naptime for %s is %d seconds)" % (lc, naps[lc])
# wiki-hunt ... see if a word is in other wikts, return list ...
# challenge here is not to take a huge amount of time, but get as many as possible
# also used by iwiktll, be careful with changes!
# note that the lcs argument can be one code or a list
re2head = re.compile(r'^==([^=]*)==$', re.M)
def hunt(word, text, lc, lcs = '', home = 'en'):
with plock: print " ... hunting iwikis"
totry = set()
done = set()
fps = set()
links = { }
redirs = { }
reiw = re.compile(r'\[\[([a-z-]{2,11}):' + re.escape(word) + r'\]\]')
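# reiw matches existing iwiki links for exactly this word, e.g. [[fr:casa]];
# the {2,11} character class allows hyphenated codes like zh-min-nan
# ("casa" here is just an illustrative title, not from the source)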
# (could just try every active wikt: for lc in Active: totry.add(lc))
# instead, magic occurs:
if lc == home:
# try hunting 10 most active wikts (11 because home will usually be in this list)
totry = set( sorted(Active, key=lambda c: naps[c])[:11] )
else:
# if we found an FL title, start with that
totry.add(lc)
# other codes known to caller
for lc in lcs: totry.add(lc)
# language header(s) in entry are good candidates (of course!)
# [code specific to English wikt ...]
for lang in re2head.findall(text):
if lang in Lcode: totry.add(Lcode[lang])
# simple scan for existing iwikis
for lc in reiw.findall(text):
lc = str(lc)
if lc in site:
totry.add(lc)
# not home:
totry.discard(home)
done.add(home)
exceptions = False
while totry:
lc = totry.pop()
try:
fpage = wikipedia.Page(site[lc], word)
text = getwikitext(fpage)
with plock: print " found in", lc
except wikipedia.NoPage:
with plock: print " not in", lc
done.add(lc)
continue
except wikipedia.IsRedirectPage:
redirs[lc] = fpage
with plock: print " found in", lc, "(redirect)"
except Exception, e:
exceptions = True
with plock: print "exception testing existence of word", str(e)
done.add(lc)
continue
done.add(lc)
links[lc] = fpage
# add to the set getting reciprocal-link/complete-set updates; we don't (can't :-) update redirects
if lc not in redirs: fps.add(fpage)
# look for iwikis in the page, add to to-be-tried if not already done
for lc in reiw.findall(text):
lc = str(lc) # not in unicode
if lc not in site: continue # (!) else who knows what junk ...
if lc not in done and lc not in totry:
with plock: print " found further iwiki", lc
totry.add(lc)
# all done, now add reciprocals
# don't remove anything if there were exceptions because hunt may be incomplete
# if no exceptions, hunt is complete for these entries (there may be others not seen,
# but then they aren't linked, as we've looked at all links ...), so remove any
# links not found:
for fpage in fps:
addrci(fpage, site[home], links=links, redirs=redirs, remove=not exceptions)
# return list of all links and redirects, and flag if complete
return links, redirs, not exceptions
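# hypothetical usage (the word and codes are illustrative, not from the source):
#   links, redirs, complete = hunt(u'casa', text, 'es', lcs = ['fr'])
# links maps code -> Page for each wikt where the word exists (redirects appear
# in both maps); complete is False if any lookup raised, in which case callers
# should not remove existing iwikis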
def main():
socket.setdefaulttimeout(40)
home = 'en'
xml = True
# testing rc:
xml = False
""" just keep argv code for now
for arg in sys.argv[1:]:
if arg.startswith('-start:'):
start = arg[7:]
with plock: print "starting at %s" % start
elif arg.startswith('-stop:'):
stop = arg[6:]
with plock: print "stopping at %s" % stop
elif arg.startswith('-new'):
newonly = True
with plock: print "new entries only"
elif arg.startswith('-sort'):
sort = True
with plock: print "do edits for sort"
elif arg.startswith('-xml'):
xml = True
with plock: print "read XML file"
elif arg.startswith('-update'):
update = True
with plock: print "update cache from XML (XML is current!)"
else:
    with plock: print "unknown command line argument %s" % arg
"""
mysite = wikipedia.getSite(home, 'wiktionary')
# make sure we are logged in
mysite.forceLogin()
meta = wikipedia.getSite(code = "meta", fam = "meta")
# get active wikt list
# minus crap. Tokipona? what are they thinking? Klingon? ;-)
Lstops = ['tokipona', 'tlh']
page = wikipedia.Page(meta, "List of Wiktionaries/Table")
existtab = page.get()
""" entry looks like:
| [[w:Vietnamese language|Vietnamese]]
| [[w:Vietnamese language|Tiếng Việt]]
| [http://vi.wiktionary.org/wiki/ vi]
"""
# reextab = re.compile(r'^\[\[:([a-z-]+):')
# reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
r'^\| .*\n'
r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
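# for the sample row above, group(1) captures the English language name
# ("Vietnamese") from the first cell and group(2) the subdomain code ("vi")
# from the third; the second (native-name) cell is skipped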
for mo in reextab.finditer(existtab):
if mo.group(2) in Lstops: continue
Exists.add(mo.group(2))
Lcode[mo.group(1)] = mo.group(2)
# see if we have a login in user config, else pretend we do
# has to be done before any call, or login status gets confused!
if mo.group(2) not in usernames['wiktionary']:
usernames['wiktionary'][mo.group(2)] = "Interwicket"
with plock: print "found %d active wikts" % len(Exists)
if len(Exists) < 150: return
# naps ... ;-)
naptime = 0
maxnap = 70
# Iwikis cache [not updated for now]
# iwopen(home)
# build table of existing entries from xml
# note we assume since we are doing RC new entries that the iwiki will be new,
# what we want here is just an index to entries, so we don't have to do lots of en.wikt lookups
enwikt = set()
if xml:
# get XML dump
dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")
ti = 0
entries = 0
reds = 0
iws = { } # in memory cache
for entry in dump.parse():
text = entry.text
title = entry.title
if ':' in title: continue
# if title < start or (stop and title > stop): continue
if text.startswith('#'): continue
entries += 1
if entries % 20000 == 0:
with plock: print "prescan %d entries" % entries
enwikt.add(title)
# test:
# if entries > 100000: break
continue
with plock: print "total %d entries" % entries
# now look for iwikis needed
entries = 0
probs = 0
fixed = 0
news = 0
cbase = now() - 86400
rate = 0.0
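# note (added): starting cbase a day in the past damps the early estimates;
# e.g. 100 entries seen in the first hour gives 100*3600/90000 = 4.0/hour
# rather than 100/hour, converging to the true rate as uptime grows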
for title, lc in recent():
if ':' in title: continue # redundant, but eh?
# canon title
page = wikipedia.Page(mysite, title)
title = page.title()
# temp:
# if lc == 'en' and title.startswith('Da'): continue
if title.lower() == 'main page': continue
news += 1
rate = news*3600.0/(now()-cbase)
if news % 100 == 0:
with plock: print "(observed creation rate %.4f/hour)" % rate
with plock: print "%s:%s" % (safe(lc), safe(title))
# if looking at home wikt is enabled above, just add things (;-)
"""
if lc == home:
with plock: print " ... added to en.wikt"
enwikt.add(title)
continue
"""
if lc == home: tag = True
# if we are using xml, check the index; else just always look at the entry
if lc != home and xml and title not in enwikt:
with plock: print " ... %s not in en.wikt" % safe(title)
continue
# [look at cache, but unlikely, as this is new]
tag = True
# now see if it is something that should be tagged/replaced:
if tag:
probs += 1
naptime += 1
# ... pick up current version from en.wikt
# with plock: print '%s is possible update, getting current entry' % safe(title)
try:
# text = page.get()
text = getwikitext(page)
oldtext = text
except wikipedia.NoPage:
with plock: print " ... %s not in en.wikt" % safe(page.title())
text = ''
except wikipedia.IsRedirectPage:
with plock: print " ... redirect page"
text = ''
except KeyError:
# annoying local error, from crappy framework code
with plock: print "KeyError"
time.sleep(200)
continue
if not text: continue
if lc != home and '[[' + lc + ':' + title + ']]' in text:
with plock: print " ... iwiki %s already in %s" % (safe(lc), safe(title))
continue
# go hunt down some iwikis, add reciprocals when needed
links, redirs, complete = hunt(title, text, lc)
# then update this one (also queued to other thread):
replink(page = page, links = links, redirs = redirs, remove = complete)
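# note (assumption; replink is defined in the reciprocal module): it applies
# the collected links/redirects to the en.wikt entry, and remove=complete
# ensures existing iwikis are only deleted when the hunt finished cleanly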
# limit number of fixes for testing
# if fixed > 7: break
# pace [not used in the same way, reconsider]
if naptime > maxnap: naptime = maxnap
"""
if naptime > 4:
with plock: print "sleeping %d seconds" % naptime
time.sleep(naptime)
"""
continue
# [notreached]
# with plock: print "%d entries, %d possible, %d updated" % (entries, probs, fixed)
# done
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()