User:Interwicket/code/mwapi
Appearance
< User:Interwicket | code
#!/usr/bin/python # -*- coding: utf-8 -*- # wikipath en wiktionary User:Interwicket/code/mwapi """ functions to use MW API to replace wikipedia.py browser-client functions getwikitext(page) -- get the text of the page, like get getedit(page) -- get the text of the page and an edit token page can then be saved with wikipedia.Page.put putedit(page, text, comment) -- save the page, will never create or recreate, edit only! only if getedit() was used, not framework get readapi(site, request) -- read from or post to api, with compression and maxlag built in optional parameter plock to use as lock around anything printed this version uses persistent HTTP connections """ import wikipedia import re import time from threading import currentThread, Lock plockd = Lock() # default plock import urllib, httplib from StringIO import StringIO from gzip import GzipFile # connection pool # implemented as a queue, so we can share between threads # no particular limit, effectively limited by number of threads in program import Queue pool = Queue.Queue() # since we aren't using the framework 'throttle', do something better # this is a "tick-tock" timer, shared on all threads # clicked down each success, up on each network failure of any type ticktocklock = Lock() ticktock = 1.0 def getticktock(): global ticktock return ticktock relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds') def readapi(site, request, plock = Lock(), sysop = False, mode = "GET", par = None): global ticktocklock, ticktock url = "http://" + site.hostname() + "/w/api.php?" + request done = False nap = 5 maxl = 5 maxlag = "&maxlag=%d" % maxl with ticktocklock: ticktock *= 0.95 # is -0.025 if 5 seconds, -1.0 at 20 seconds ticktock = max(ticktock, 0.1) ticktock = min(ticktock, 20.0) if ticktock >= 10.0: with plock: print "(mwapi readapi: tick tock is %.1f)" % ticktock time.sleep(ticktock) ticktock -= 1.0 # undo first increment in loop while not done: ticktock += 1.0 # done w/o lock, race condition is rare, not a serious problem, ignored! # get a connection from pool try: conn = pool.get_nowait() except Queue.Empty: conn = None try: if not conn: # with plock: print "(opening connection to %s)" % site.hostname() conn = httplib.HTTPConnection(site.hostname()) # either get or post headers = { 'Cookie': site.cookies(sysop = sysop) or '', 'Accept-Encoding': 'gzip', 'User-Agent': 'Interwicket/1.0' } if mode == "POST": headers['Content-Type'] = 'application/x-www-form-urlencoded' conn.request(mode, url + maxlag, par, headers) resp = conn.getresponse() text = resp.read() if 'gzip' in resp.getheader('Content-Encoding', ''): text = GzipFile(fileobj=StringIO(text)).read() text = unicode(text, 'UTF-8' , errors = 'ignore') done = True except Exception, e: # work around net problem 5.6.10, ignore first 3 # this is to deal with the atrocious behavior of Iconnect Kenya # which is capable of forcing requests through their proxy and then # killing 80+% with blank status ('BadStatusLine'), and connections closed repre = repr(e) if nap < 15 and ('10060' in repre or '10054' in repre or 'BadStatusLine' in repre \ or 'timeout' in repre or 'gaierror' in repre): conn.close() conn = None time.sleep(nap) nap = min(nap + nap/2, 300) ticktock -= 0.95 # undo most of increment for this failure continue # quietly with plock: print "(%s: exception reading API: %s)" % (currentThread().name, repr(e)) text = '' conn.close() conn = None time.sleep(nap) nap = min(nap + nap/2, 300) continue if '<api' not in text and 'NdxICC' in text: with plock: print "(mwapi readapi: bad reply from box)" # (not) silently ignore bad return from Nomadix box conn.close() conn = None time.sleep(5) done = False continue mo = relagged.search(text) if mo: replag = int(mo.group(1)) with plock: print "(%s: server lagged %s seconds)" % (currentThread().name, replag) # allow more lag the next time maxl += max(maxl/4, replag/20) maxlag = "&maxlag=%d" % maxl # make some progress even when server crocked ... if maxl > 600: maxlag = "" if maxlag and maxl > 60: with plock: print "(mwapi readapi: next with %s)" % maxlag # sleep replag if not more than 70 time.sleep(min(replag, 70)) done = False pool.put(conn) # should still be good conn = None continue # if we still have the connection without failure, return it to pool if conn: pool.put(conn) return text def ts(t): return time.strptime(t, '%Y-%m-%dT%H:%M:%SZ') retok = re.compile(r' edittoken="(.*?)"') restartime = re.compile(r' starttimestamp="(.*?)"') retimestamp = re.compile(r' timestamp="(.*?)"') rerevid = re.compile(r' revid="(.*?)"') def getwikitext(page, revid = None, plock = plockd): site = page._site if hasattr(page, '_contents'): if revid: if hasattr(page, '_revisionid') and revid == page._revisionid: return page._contents else: return page._contents # else we need to get page done = False nap = 5 while not done: # if revid, get a specific revision if revid: rs = "&rvstartid=%s&rvlimit=1" % revid else: rs = '' # throw various exceptions to caller rawt = readapi(site, "action=query&prop=revisions|info&rvprop=content|ids&format=xml" + rs + "&titles=" + page.urlname(), plock = plock) i1 = rawt.find("<rev ") if i1 > 0: i1a = rawt[i1:].find('>') if i1a > 0: i1 += i1a + 1 else: i1 = -1 # something bad ... i2 = rawt.find("</rev") if i1 < 0 or i2 < i1: # deleted/does not exist? bad title, no API return if 'missing=""' in rawt: raise wikipedia.NoPage if 'invalid=""' in rawt: raise wikipedia.NoPage if '<api />' in rawt: raise wikipedia.NoPage # else with plock: print "(mwapi: no text found, sleeping %d seconds)" % nap # print repr(rawt) time.sleep(nap) nap = min(nap + nap/2, 300) else: done = True text = rawt[i1:i2] text = wikipedia.unescape(text) mo = rerevid.search(rawt) if mo: # print "mwapi (debug): revision id from getwikitext", mo.group(1) revid = mo.group(1) else: revid = '' # for us page._revisionid = revid # did we get redirect? if 'redirect=""' in rawt[:i1]: raise wikipedia.IsRedirectPage # and do not set _contents # tell wikipedia put etc that we have the contents (else it does *another* get!) page._contents = text return text def getedit(page, sysop = False, plock = plockd): site = page._site done = False nap = 5 notk = 0 while not done: # throw various exceptions to caller rawt = readapi(site, "action=query&prop=info|revisions&intoken=edit&format=xml" + "&titles=" + page.urlname(), sysop = sysop, plock = plock) # wiki locked; or possibly user blocked? we don't have enough information # this is message from locked wiki if ">Action 'edit' is not allowed for the current user</info>" in rawt: raise wikipedia.UserBlocked mo = retok.search(rawt) if mo: # token is stored in the site (!) silly, I thought it was an *edit* token site.putToken(mo.group(1), sysop = sysop) done = True else: notk += 1 if notk > 20: raise wikipedia.ServerError # give up eventually! with plock: print repr(rawt) # probably temporary? print "mwapi: no token received trying to edit %s" % repr(page.aslink()) print "mwapi: sleeping %d seconds" % nap time.sleep(nap) nap = min(nap + nap/2, 300) mo = retimestamp.search(rawt) if mo: # print "mwapi (debug): timestamp", mo.group(1) page._editTime = time.strftime('%Y%m%d%H%M%S', ts(mo.group(1))) # and without reformatting, for our putedit: page._basetimestamp = mo.group(1) else: page._editTime = time.strftime('%Y%m%d%H%M%S', time.gmtime()) mo = restartime.search(rawt) if mo: # print "mwapi (debug): starttimestamp", mo.group(1) page._startTime = time.strftime('%Y%m%d%H%M%S', ts(mo.group(1))) else: page._startTime = time.strftime('%Y%m%d%H%M%S', time.gmtime()) mo = rerevid.search(rawt) if mo: revid = mo.group(1) else: revid = '' if hasattr(page, "_contents"): del page._contents # ! # print "mwapi (debug): start %s, edit %s, revid %s, token %s" % (page._startTime, page._editTime, # revid, site.getToken()) return getwikitext(page, revid = revid, plock = plock) def putedit(page, text, comment = '', sysop = False, plock = plockd): site = page._site done = False nap = 5 while not done: token = site.getToken(sysop = sysop) # throw various exceptions to caller # parameters in order, token last, to make sure text is complete (!) par = urllib.urlencode([ ('text', text.encode("UTF-8")), ('title', page.title().encode("UTF-8")), ('summary', comment.encode("UTF-8")), ('basetimestamp', page._basetimestamp), ('token', token) ]) rawt = readapi(site, "action=edit&format=xml&bot=1&minor=1&nocreate=1", mode = "POST", par = par, sysop = sysop, plock = plock) if 'result="Success"' in rawt: done = True break # various errors [?] if 'code="missingtitle"' in rawt: if hasattr(page, "_contents"): del page._contents raise wikipedia.NoPage if 'code="pagedeleted"' in rawt: if hasattr(page, "_contents"): del page._contents raise wikipedia.NoPage if 'code="protectedpage"' in rawt: raise wikipedia.LockedPage with plock: print "(mwapi putedit error: %s, page %s)" % (repr(rawt[:300]), repr(page.aslink())) time.sleep(nap) nap = min(nap + nap/2, 300) if nap == 300: break # can't go on forever [?] if __name__ == "__main__": print "mwapi tests" site = wikipedia.getSite('en', 'wiktionary') print "present page foo" page = wikipedia.Page(site, 'foo') t = getwikitext(page) print repr(t) print "missing page" page = wikipedia.Page(site, 'foo xxx2') try: t = getwikitext(page) print repr(t) except Exception, e: print "exception", repr(e) """ print "redirect page" page = wikipedia.Page(site, 'html') try: t = getwikitext(page) print repr(t) except Exception, e: print "exception", repr(e) print "recent changes" try: rct = readapi(site, "action=query&list=recentchanges&format=xml&rcprop=title|user" + "&rctype=new&rcnamespace=0&rclimit=10", sysop = True) print repr(rct) except Exception, e: print "exception", repr(e) """ site = wikipedia.getSite('sw', 'wiktionary') print "present page cat on sw.wikt" page = wikipedia.Page(site, 'cat') t = getwikitext(page) print repr(t) site = wikipedia.getSite('en', 'wiktionary') print "try updating page on en.wikt" page = wikipedia.Page(site, 'User:Robert Ullmann/t1') text = getedit(page) text += "\n\nand some more text" putedit(page, text, "add some more") print "anna two ..." text = getedit(page) text += "\n\nand 2 text" putedit(page, text, "add 2") print "edit missing page" page = wikipedia.Page(site, 'foo xxx2') try: t = getedit(page) print repr(t) except Exception, e: print "exception", repr(e) page._basetimestamp = '0' print "... saving" try: t = putedit(page, "foo") print repr(t) except Exception, e: print "exception", repr(e) print "done"