User:Robert Ullmann/spork
program notes
spork is a program that looks for words used in the English Wikipedia, and adds citation entries for them to the English Wiktionary (the "wikt").
Process:
spork maintains a persistent (on-disk) cache of words in the wikt that have English sections. It then looks at random 'pedia entries, and checks every lower case word longer than 3 letters. If the word is not in the wikt, it offers to create a Citations: namespace entry, which then shows up in Category:New words from Wikipedia. If the 'pedia has spelled the word incorrectly, a replacement can be supplied, and the program will then set up an edit on the article for approval.
The cache is updated with several steps:
- if the word is present in the disk cache, the word exists
- else spork reads some of an XML dump (if available), and then checks to see if the word has been found
- else it reads a few entries from a likely category, to see if it finds the word
- else it looks for the specific entry in the wikt, checking for an English section
While this may seem convoluted, it serves a number of purposes. The XML file is not needed, and it doesn't matter greatly if it is stale: even a dump that is a year old will help, and its age does not affect the validity of the results. So one doesn't need to worry about going to retrieve the XML frequently; getting one when setting up will be fine. It may be the daily, the WMF "pages-articles" or the WMF "pages-meta-current" dump; the "history" dump will not work.
If not found, spork will read a likely category, reading a few entries (20) and adding them to the cache; as these are English POS categories, it need not read each entry to check for the English section. This updates the cache more quickly than looking at each entry individually.
Finally, as it will resort to reading the entry, it will be completely up to date, including entries not categorized, before asking about creating a citation.
code
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot looks for new words in the Wikipedia, by wandering around.
No command line arguments.
"""
import wikipedia
import sys
import re
import pickle
import xmldate
from time import sleep
import socket
from random import choice
# Optional accelerator: use mwapi's readers when that module can be imported,
# otherwise fall back to calling get() on the page object itself.
try:
    from mwapi import getwikitext, getedit
except Exception:
    print("(module mwapi not found, using pybot calls)")

    def getwikitext(p):
        """Return the text of page p by calling p.get()."""
        return p.get()

    def getedit(p):
        """Return the editable text of page p by calling p.get()."""
        return p.get()
def safe(s):
    """Return a form of s used for printing and as a key in the word cache.

    The value is pickle.dumps(s) with its first character and its last five
    characters cut off.
    NOTE(review): this presumably relies on the text (protocol 0) pickle
    layout of Python 2, where those positions hold the type code and the
    trailing memo/stop bytes -- confirm before running on another version.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]
# English word cache, kept on disk so it survives between runs.
# Keys are safe(word); the value is True when the wikt entry has an English
# section and False when the entry was checked and has none.
# (A plain in-memory dict would work as well, but would start empty each run.)
import shelve
English = shelve.open("spork-English-cache")

# Category strategy:
# Rather than look at each word individually, a likely category is read first,
# which yields a number of words per request.
# The whole categories are not read at startup: that would take too long, and
# would be redundant once the cache has been built up.
# New words (or new English sections) are picked up automatically this way,
# and an empty cache fills with a large number of words quickly.
# Some words (pronouns, determiners, numbers) live in categories that are not
# read; those are found one at a time, as there are too few of them to be
# worth fetching blocks of a category.

# lower-case a-z titles as they appear in a category listing
recword = re.compile(r'title="([a-z]+)"')
# a word made only of lower-case a-z letters
reisenword = re.compile(r'[a-z]+$')

# parts of speech whose "English ...s" categories are read
POS = [
    'noun',
    'plural',
    'verb',
    'verb form',
    'third-person singular form',
    'past participle',
    'present participle',
    'simple past form',
    'adjective',
    'adjective superlative form',
    'adjective comparative form',
    'adverb',
]

# Suffix -> part of speech. The 3-4 letter suffixes are learned while running;
# the 1-2 letter ones are preset here (good numbers for English).
suffixes = {
    's': 'plural',
    'es': 'third-person singular form',
    'd': 'simple past form',
    'ed': 'past participle',
    'ly': 'adverb',
    'st': 'adjective superlative form',
    'er': 'adjective comparative form',
    'th': 'verb form',
    'ng': 'present participle',
}
def herdcats(word, site):
    """Read part of a likely English POS category and cache the words in it.

    A part of speech is chosen from the suffixes table when one of the last
    1-4 letters of word is known there, otherwise at random from POS.  Twenty
    members of the matching "English ...s" category, starting at word in sort
    order, are then requested through site.getUrl().  Every lower-case a-z
    title of four or more letters found in the reply is recorded as True in
    the English cache, and its 3 and 4 letter endings are remembered in
    suffixes for the chosen part of speech.

    If word itself is still not known as English afterwards, the 3 and 4
    letter suffix entries that point at the chosen part of speech are removed
    again, so the same unhelpful guess is not repeated.

    word -- candidate word; anything that is not purely a-z is ignored
    site -- object whose getUrl(path) returns the API reply as text

    Returns None.  Errors while fetching are reported and treated as an
    empty reply.
    """
    # a-z only; this also means we don't have to worry about urlencode
    if not reisenword.match(word):
        return

    # have we seen this suffix? collect the possible parts of speech,
    # longest suffix first
    possible = [suffixes[word[-n:]] for n in (4, 3, 2, 1)
                if word[-n:] in suffixes]
    if possible:
        pos = choice(possible)
    else:
        pos = choice(POS)

    # go get the cat
    print(" (reading cat English %ss)" % pos)
    try:
        cname = 'Category:English_' + pos.replace(' ', '_') + 's'
        cat = site.getUrl(
            "/w/api.php?action=query&list=categorymembers&cmtitle=%s&cmlimit=20&cmstartsortkey=%s&format=xml" % (cname, word))
    except Exception as e:
        print("trouble getting a part of a cat " + str(e))
        cat = ''

    for w in recword.findall(cat):
        if len(w) < 4:
            continue
        sw = safe(w)
        if w == word:
            print(" (found %s in cat)" % sw)
        else:
            if not reisenword.match(w):
                continue
            if sw not in English or not English[sw]:
                print(" (added %s to cache)" % sw)
        English[sw] = True
        # learn the 3 and 4 letter endings (don't "learn" 1-2)
        suffixes[w[-4:]] = pos
        suffixes[w[-3:]] = pos

    # If we didn't find it, remove the offending suffixes entries.  The word
    # may already be cached as False (known missing), so test the cached
    # value and not just the presence of the key.
    key = safe(word)
    if not (key in English and English[key]):
        for n in (4, 3):
            ending = word[-n:]
            if ending in suffixes and suffixes[ending] == pos:
                del suffixes[ending]

    English.sync()
    return
# read from XML file if available
import xmlreader
# words from XML dump: only titles made of lower-case a-z are kept
reaz = re.compile(r"[a-z]+$")

# State shared by initxml() and readxml():
xmlp = False    # set True by initxml() when the dump could be opened
xml = None      # iterator over dump entries; None when absent or exhausted
xent = 0        # number of dump entries read so far
Xmlc = dict()   # titles with an English section, not yet moved to the disk cache
def initxml():
    """Open the XML dump "en-wikt.xml", if there is one, and start reading it.

    On success the module-level iterator xml is set, the entry counter xent
    is reset to 0 and xmlp is set True.  On any failure the problem is
    reported, xml is set to None and xmlp to False, which turns readxml()
    into a no-op.
    """
    global xml, xent, xmlp
    try:
        xml = iter(xmlreader.XmlDump("en-wikt.xml").parse())
        xent = 0
        xmlp = True  # perhaps ...
        # Read one batch now, looking for a word that will not be present,
        # so that any message comes at the beginning; this is also where
        # the file is actually opened.
        readxml('-')
    except Exception as e:
        print("can't open en-wikt.xml " + str(e))
        xml = None
        xmlp = False
def readxml(word):
    """Read the next batch of the XML dump and move matching titles to the cache.

    Up to 1000 further dump entries are read.  Titles without a colon that
    consist of lower-case a-z only and whose text contains "==English=="
    are collected in the in-memory table Xmlc.  Then every title in Xmlc
    that shares its first five letters with word is taken out of Xmlc, and
    those not yet known as English are recorded as True in the English
    cache, stopping once 20 new words have been recorded.

    Does nothing when no dump was opened (xmlp is False).  Returns None.
    """
    global xml, xent

    # have no xml file from start, do nothing
    if not xmlp:
        return

    # first, read up to 1000 entries (every entry read counts)
    remaining = 1000
    while xml and remaining > 0:
        try:
            entry = xml.next()
        except StopIteration:
            print(" (end of XML file, %d read)" % xent)
            xml = None
            break
        xent += 1
        if xent % 5000 == 0:
            print(" (read %d from xml)" % xent)
        remaining -= 1
        title = entry.title
        if ':' in title or not reaz.match(title) or "==English==" not in entry.text:
            continue
        # keep in the internal table only (spreads disk db writes out over time)
        Xmlc[title] = True

    # Now see if anything can go to the permanent cache: up to 20 new words
    # starting with the same five letters as the word being looked for.  That
    # gives a reasonable chance of hitting the word and a reasonable rate of
    # draining the internal table.
    prefix = word[:5]
    processed = set()
    budget = 20
    for title in Xmlc:
        if title[:5] != prefix:
            continue
        processed.add(title)
        key = safe(title)
        if key in English and English[key]:
            continue
        budget -= 1  # one more new English word
        English[key] = True
        if title == word:
            print(" (found %s in xml)" % key)
        else:
            print(" (added %s from xml)" % key)
        if not budget:
            break

    # drop the processed titles from the internal table
    for title in processed:
        del Xmlc[title]

    # dbg
    print(" (%d in internal xml cache)" % len(Xmlc))
    return
def _replace_whole_word(text, word, replacement):
    """Return text with each standalone occurrence of word replaced.

    An occurrence counts only when it is not directly preceded or followed
    by a word character, so "cats" is left alone inside "bobcats".
    """
    pattern = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)', re.U)
    # a callable keeps any backslashes in the replacement literal
    return pattern.sub(lambda mo: replacement, text)


def spork():
    """Wander random Wikipedia articles and offer citations for unknown words.

    Logs in to the English Wiktionary and Wikipedia, opens the XML dump if
    present, then loops forever:

    * fetch a random article and take the text of its <p>...</p> lines,
      stripped of links, images, references and some inline tags;
    * for each lower-case word of four or more letters, check the English
      cache, then the XML dump, then a likely category, then the wikt
      entry itself;
    * for a word still without an English section and without a Citations:
      page, show the sentence and ask the operator what to do:
        y     create the Citations: page
        s     skip to the next sentence
        k     skip the rest of the article
        text longer than two characters: treat it as a spelling correction
              and propose the edit on the Wikipedia article
        anything else: move on to the next word

    Never returns normally; it ends when an exception (such as
    KeyboardInterrupt) propagates out.
    """
    socket.setdefaulttimeout(30)
    sysop = True  # use my regular login to create Citations: pages

    # make sure we are logged in
    site = wikipedia.getSite("en", "wiktionary")
    site.forceLogin(sysop = sysop)
    pedia = wikipedia.getSite("en", "wikipedia")
    pedia.forceLogin()

    initxml()

    # set some regex
    refirst = re.compile(r'<h1 class="firstHeading">(.*?)</h1>')
    resent = re.compile(r'[A-Z].*?[a-z\)]\.')
    # general pattern we are interested in
    reword = re.compile(r"[-\w']+", re.U)
    reartid = re.compile(r'var wgArticleId = "(\d*)"')
    # strip tags (plain <sup>/<sub> without attributes are left in place)
    reanchor = re.compile(r'<a .*?>(.*?)</a>')
    reimg = re.compile(r'<img .*?>')
    rerefs = re.compile(r'<ref.*?>(.*?)</ref>')
    resup2 = re.compile(r'<sup .*?>(.*?)</sup>')
    rebold = re.compile(r'<b>(.*?)</b>')
    reital = re.compile(r'<i>(.*?)</i>')
    respan = re.compile(r'<span .*?>(.*?)</span>')
    respan2 = re.compile(r'</span>')
    restrong = re.compile(r'<strong .*?>(.*?)</strong>')

    # now pick up random 'pedia articles
    hits = 0
    misses = 0
    while True:
        # (the small constant avoids division by zero)
        print("(cache hit ratio %.4f)" % (hits/(hits + misses + 0.0001)))

        try:
            art = pedia.getUrl("/w/index.php?title=Special:Random")
        except wikipedia.NoPage:
            print("Can't get random article from wikipedia")
            art = ''
            sleep(30)
            continue
        except KeyboardInterrupt:
            raise
        except Exception as e:
            print("some exception getting a page " + str(e))
            sleep(30)
            continue

        # find page title:
        mo = refirst.search(art)
        if not mo:
            print("can't find page title?")
            continue
        article = mo.group(1)

        mo = reartid.search(art)
        if mo:
            artid = mo.group(1)
        else:
            artid = '?'

        print("")
        print("article is %s (%s)" % (safe(article), safe(artid)))

        kflag = False   # set when the operator asks to skip the article
        warts = set()   # words already seen in this article

        # look for paragraphs:
        for line in art.splitlines():
            if not line.startswith( '<p>' ) or not line.endswith( '</p>' ):
                continue

            # strip some things
            line = reanchor.sub(r'\1', line)
            line = reimg.sub(r'', line)
            line = rerefs.sub(r'', line)
            line = resup2.sub(r'', line)
            line = rebold.sub(r'\1', line)
            line = reital.sub(r'\1', line)
            # several times, as spans may be nested
            for _ in range(4):
                line = respan.sub(r'\1', line)
            # clean up end tags
            line = respan2.sub(r'', line)
            line = restrong.sub(r'\1', line)

            # sentences (line[3:-4] drops the <p> and </p>):
            for sentence in resent.findall(line[3:-4]):
                print(safe(sentence))

                for word in reword.findall(sentence):
                    if not word.islower(): continue
                    if word[0:1].isdigit(): continue
                    if '-' in word: continue  # at least for now
                    if word.endswith("'s"): continue
                    if word.startswith("'"): word = word[1:]
                    if word.endswith("'"): word = word[:-1]
                    if not word: continue

                    # skip short words, not likely any three letter words
                    # are missing, and we don't need to look at (e.g.) "cm"
                    if len(word) < 4: continue

                    if word in warts: continue  # seen already in this article
                    warts.add(word)

                    sw = safe(word)  # make valid DB key

                    # continue if the word is present, and has English section
                    if sw in English and English[sw]:
                        hits += 1
                        Xmlc.pop(word, None)  # discard, not useful
                        continue
                    misses += 1

                    # candidate
                    print("  %s" % sw)

                    readxml(word)  # read some entries if an XML
                    if sw in English and English[sw]: continue

                    herdcats(word, site)  # possibly add from cat
                    if sw in English and English[sw]: continue

                    # check for page in wikt (may be non-English, or since created)
                    page = wikipedia.Page(site, word)
                    try:
                        wt = getwikitext(page)
                        if '==English==' in wt:
                            print(" (entry %s exists)" % sw)
                            English[sw] = True
                            continue
                        print(" (no English in entry)")
                        English[sw] = False
                    except wikipedia.NoPage:
                        English[sw] = False
                    except wikipedia.IsRedirectPage:
                        # [should treat as missing, but skip for now]
                        print(" (redirect)")
                        English[sw] = False  # no English in entry ...
                        continue
                    except Exception as e:
                        print("some exception getting page:  " + str(e))
                        continue

                    English.sync()  # reasonable place to do this

                    # check for citations page
                    cite = wikipedia.Page(site, "Citations:" + word)
                    try:
                        ctext = getwikitext(cite)
                    except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                        ctext = ''
                    except Exception as e:
                        print("some exception getting page:  " + str(e))
                        ctext = ''
                        continue

                    if ctext:
                        print(" (citations page exists)")
                        continue

                    # maybe good to go?  bold the word in the passage, but
                    # only where it stands alone, not inside longer words
                    sent = _replace_whole_word(sentence, word,
                                               "'''" + word + "'''")
                    ctext = ("{{citation}}\n"
                             "* {{cite wikipedia\n"
                             "|year={{subst:CURRENTYEAR}}\n"
                             "|article=%s\n"
                             "|passage=%s\n"
                             "}}\n") % (article, sent)

                    # ask if it is okay to write:
                    print("citation for %s: %s" % (safe(word), safe(sentence)))
                    answer = wikipedia.input("add (y, n, s, k, wp replacement)?")

                    if answer == 'y':
                        try:
                            wikipedia.setAction("citation from Wikipedia")
                            try:
                                # re-init for create, a lot of time may have passed
                                cite = wikipedia.Page(site, "Citations:" + word)
                                cite.get(sysop = sysop)
                                print(" (citations page exists now?)")
                                continue
                            except wikipedia.NoPage:
                                pass  # expected
                            except wikipedia.IsRedirectPage:
                                pass  # will overwrite
                            cite.put(ctext, sysop=sysop, minorEdit=False)
                        except Exception as e:
                            print("some exception writing page:  " + str(e))

                    if answer == 's': break  # skip to next sentence

                    if answer == 'k':
                        kflag = True
                        break

                    # fix entry on 'pedia!
                    if len(answer) > 2:
                        wpage = wikipedia.Page(pedia, article)
                        try:
                            wtext = getedit(wpage)
                            # whole words only, so other words that merely
                            # contain the misspelling are not touched
                            wnew = _replace_whole_word(wtext, word, answer)
                            wikipedia.showDiff(wtext, wnew)
                            a2 = wikipedia.input("okay?")
                            if a2 != 'y': continue
                            wikipedia.setAction("spelling %s to %s" % (word, answer))
                            wpage.put(wnew)
                        except Exception as e:
                            print("some exception writing wikipedia:  " + str(e))

                    # end of word

                if kflag: break
                # end of sentence

            if kflag: break
            # end of line

        # end of article loop
        continue

    # done
def main():
    """Run spork() until interrupted, then flush the on-disk word cache.

    A KeyboardInterrupt is the normal way to stop and is absorbed here.  Any
    other exception from spork() still propagates, but the cache is synced
    first so the words collected during the run are not lost.
    """
    try:
        spork()
    except KeyboardInterrupt:
        print("(syncing cache)")
    finally:
        # runs on every exit path, not only on Ctrl-C
        English.sync()

    # (xml is open for read, let process rundown handle it)
    print("(exit)")
    return


if __name__ == "__main__":
    try:
        main()
    finally:
        # always release the framework, whatever main() did
        wikipedia.stopme()