User:Robert Ullmann/Prologue
This is a Wiktionary page format experiment.
Further explanation will be added (I hope and intend ;-)
Examples
Very preliminary.
Note: if you look at the wikitext of these pages, be aware that it has all been expanded in order to remove categories; the eventual product wikitext in real entries would be perfectly normal. These are mock-ups.
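For clarity, category removal just means deleting the [[Category:...]] links from the expanded wikitext. Here is a minimal standalone sketch of that step, using the same category pattern as the script below (strip_cats is an illustrative name; the real decat() function in the script additionally collapses the whitespace the deleted links leave behind):

import re

# same category pattern the script below uses
recat = re.compile(r'\[\[\s*[Cc]ategory\s*:.*?\]\]')

def strip_cats(wikitext):
    # delete [[Category:...]] links outright; note this does not tidy
    # the line breaks left between deleted links, as decat() below does
    return recat.sub('', wikitext)

print(strip_cats(u"A strong tackle.[[Category:Nautical]]"))
# -> A strong tackle.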
examples:
- Robert Ullmann/Prologue/examples/bog
- Robert Ullmann/Prologue/examples/cat
- Robert Ullmann/Prologue/examples/compute
- Robert Ullmann/Prologue/examples/is
- Robert Ullmann/Prologue/examples/mama
- Robert Ullmann/Prologue/examples/manga
- Robert Ullmann/Prologue/examples/monocline
- Robert Ullmann/Prologue/examples/simple
- Robert Ullmann/Prologue/examples/slag
- Robert Ullmann/Prologue/examples/to
- Robert Ullmann/Prologue/examples/trivia
- Robert Ullmann/Prologue/examples/馬
Code
#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/Prologue/code

"""
Generates prologue (section 0) examples
"""

import wikipedia
import sys
import re
import socket
import urllib
from iwiktmwapi import getwikitext, getedit, putedit, readapi

def srep(s):
    return repr(u'' + s)[2:-1]

def lkey(l):
    # language sort key
    n = l.strip('[]')
    if not n: return n
    if n == 'Translingual': return '3' + n # at end for now
    if n == 'English': return '1' + n
    # handle names like !Kung and 'Auhelawa: move (one) non-alpha to the end of key
    if not n[0:1].isalpha(): n = n[1:] + n[0:1]
    return '2' + n

relink = re.compile(r'\[\[(.+?)\]\]')
reh2 = re.compile(r'==([^=]+)==')
rehead = re.compile(r'=+([^=]+)=+')

# match language "tags" on defn lines
retag = re.compile(r"[(']+([^)]+)[)']+ (.*)")

rexpand = re.compile(r'<expandtemplates.*?>(.*)</expandtemplates>', re.S)

# find a context span in expanded text, at start of definition:
# [this relies on context working exactly one way ... might be improved]
respan = re.compile(r'<span class="ib-brac"><span class="qualifier-brac">'
                    r'\(</span></span><span class="ib-content"><span class="qualifier-content">'
                    r'(.*?)'
                    r'</span></span><span class="ib-brac"><span class="qualifier-brac">\)</span></span>'
                    r'(.*)')
# example:
# <span class="ib-brac"><span class="qualifier-brac">(</span></span><span class="ib-content"><span class="qualifier-content">[[nautical]][[Category:Nautical]]</span></span><span class="ib-brac"><span class="qualifier-brac">)</span></span> A strong tackle used to hoist an anchor to the [[cathead]] of a ship.

recomma = re.compile(r'<span class="ib-comma"><span class="qualifier-comma">,</span></span>')

def expand(text, title = ''):

    site = wikipedia.getSite("en", "wiktionary")

    # call expandtemplates:
    # parameters (and do a POST op)
    par = urllib.urlencode([ ('text', text.encode("UTF-8")), ('title', title.encode("UTF-8")) ])
    rawt = readapi(site, "action=expandtemplates&format=xml", mode = "POST", par = par)

    mo = rexpand.search(rawt)
    if not mo:
        print " can't expand templates?"
        print repr(rawt)
        return ''

    return wikipedia.unescape(mo.group(1))

recat = re.compile(r'\[\[\s*[Cc]ategory\s*:.*?\]\]')
recattag = re.compile(r'<!--XCAT-->\s*<!--XCAT-->', re.S)

def decat(t):
    """ remove categories from text. not simple, as the general parser is complex:
        a cat at the end of a line should be removed without removing the line break,
        but line breaks and even blank lines in between cats should be removed;
        blank lines after (or before?) cats should be removed if multiple, but we
        don't handle that case.
    """

    # replace all cats with a uniform tag; the tag is an HTML comment, so if it did
    # by chance occur in the wikitext it would be gone anyway
    tot = recat.sub('<!--XCAT-->', t)

    # now replace any pairs of tags around whitespace with singlets
    k = 1
    while k: tot, k = recattag.subn('<!--XCAT-->', tot)

    # and remove tags
    tot = tot.replace('<!--XCAT-->', '')

    return tot

def main():

    socket.setdefaulttimeout(70)

    # read list of the pages we should set up as examples

    site = wikipedia.getSite("en", "wiktionary")
    site.forceLogin()

    page = wikipedia.Page(site, "User:Robert Ullmann/Prologue/feedme")
    feed = getwikitext(page)

    # test:
    # feed = '[[bog]] [[cat]] [[prolog]] [[mama]]'
    # feed = '[[' + sys.argv[1] + ']]'

    for title in relink.findall(feed):

        print "%s:" % srep(title)

        try:
            page = wikipedia.Page(site, title)
            text = getwikitext(page)
        except Exception, e:
            print " exception getting page", repr(e)
            text = ''
            continue

        # now find language sections, POS, defs
        # lang is the language, pos is the last header (which may very well not be a POS)

        lang = ''
        pos = ''
        # defs is a dict of lang to list of (POS, def) tuples
        defs = { }

        for line in text.splitlines():
            mo = reh2.match(line)
            if mo:
                lang = mo.group(1)
                pos = ''
                continue
            if not lang: continue
            mo = rehead.match(line)
            if mo:
                pos = mo.group(1)
                continue
            if line[:2] != '# ': continue
            # skip {defn} and {defn-form}
            if '{{defn' in line: continue
            # def line, add into list
            if lang not in defs: defs[lang] = [ ]
            defs[lang].append( (pos.lower(), line[2:]) )

        # (that takes care of extracting the basic info)
        # print repr(defs)

        # now we have to reprocess the "Serbo-Croatian" drek:
        # the following is an approximation; doing it "correctly" is not possible,
        # as the forced merger discards information and the format is not tractable

        if "Serbo-Croatian" in defs:
            if "Croatian" in defs or "Serbian" in defs or "Bosnian" in defs \
                    or "Montenegrin" in defs:
                pass # use standard language entries
            else:
                dlist = defs["Serbo-Croatian"]
                for lang in [ "Serbian", "Croatian", "Bosnian", "Montenegrin" ]: defs[lang] = [ ]
                for pos, defn in dlist:
                    # look for tags
                    mo = retag.match(defn)
                    if mo and ("Croatian" in mo.group(1) or "Serbian" in mo.group(1) \
                            or "Bosnian" in mo.group(1) or "Montenegrin" in mo.group(1)):
                        # add remainder of def to each language tagged:
                        for lang in [ "Serbian", "Croatian", "Bosnian", "Montenegrin" ]:
                            if lang in mo.group(1): defs[lang].append( (pos, mo.group(2)) )
                    else:
                        # use default based on script
                        if ord(title[0]) >= 0x0400 and ord(title[0]) < 0x0530:
                            defs["Serbian"].append( (pos, defn) )
                        else:
                            defs["Croatian"].append( (pos, defn) )
                # now drop blanks
                for lang in [ "Serbian", "Croatian", "Bosnian", "Montenegrin" ]:
                    if not defs[lang]: del defs[lang]
                del defs["Serbo-Croatian"]

        # done with crap

        # consolidate defs ...
        # keeping order is the trick
        # use four lists, generated in parallel

        langs = [ ]
        poss = [ ]
        defns = [ ]
        ctxs = [ ]
        for lang in sorted(defs, key=lkey):
            dlist = defs[lang]
            for pos, defn in dlist:
                if pos == "han character": pos = "Han character" # fix, should be capitalized

                # do a number of things to clean up defn
                # [remove defdate, ref tags, etc, etc]

                ctx = ''
                if defn.startswith('{{'):
                    # try finding a context ...
                    exp = expand(defn, title)
                    print "(expand def)"
                    mo = respan.match(exp)
                    if mo:
                        ctx = decat(recomma.sub(',', mo.group(1))).replace('&#32;', ' ')
                        defn = mo.group(2).lstrip()
                        print "matched context"

                # [at some point might expand the whole entry first? or not bother for examples]

                # seen already? [need some fuzziness in the match!]
                i = 0
                while i < len(defns):
                    if pos == poss[i] and defn == defns[i] and ctx == ctxs[i]: break
                    i += 1
                if i >= len(defns):
                    langs.append(lang)
                    poss.append(pos)
                    defns.append(defn)
                    ctxs.append(ctx)
                else:
                    langs[i] += ', ' + lang

        # (re)generate prologue:
        # this is easier because we are working from NS:0 entries, which don't have
        # the prologue in them, and we don't need to reprocess defs as much for examples;
        # harder because we need to expand and kill cats

        newtext = '{{tocright}}\n'

        # first copy the existing stuff (also templates, whatever)
        for line in text.splitlines():
            if line[:2] == '==': break
            newtext += line + '\n'

        # generate def lines
        # also need to handle several and many languages, and so on
        # section-link languages?

        for i in range(0, len(defns)):
            ln = langs[i]
            if ln == 'English': ln = ''
            else: ln += ', '
            ctx = ctxs[i]
            if ctx: ctx = ', ' + ctx
            newtext += "# (''" + ln + poss[i] + ctx + "'') " + defns[i] + '\n'
            print " # " + '(' + ln + poss[i] + ctx + ') ' + repr(defns[i])

        # append the rest of the entry: (all after first header)
        text = '\n' + text
        newtext += text[text.find('\n=='):]

        # almost there ...
        newtext = expand(newtext, title)

        # kill cats
        newtext = decat(newtext)

        # and write new page
        try:
            xpage = wikipedia.Page(site, "User:Robert Ullmann/Prologue/examples/" + title)
            # otext = getedit(xpage)
            otext = xpage.get()
            # putedit(xpage, newtext, comment = "write example")
            xpage.put(newtext, comment = "write example")
        except wikipedia.NoPage:
            # write initial version w/framework
            xpage.put(newtext, comment = "write example")
            pass
        except Exception, e:
            print " exception getting/writing example page", repr(e)
            pass

        # finished with page loop

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
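For a concrete sense of the output, each generated definition line has the form # (''Language, pos, context'') definition, with the language name omitted for English. A minimal sketch of just that formatting step, extracted from the loop above (prologue_line is an illustrative name, not part of the script; the sample definition is the one from the regex example comment):

def prologue_line(langs, pos, ctx, defn):
    # English is the default language and is not named; any context label
    # is appended after the part of speech
    ln = '' if langs == 'English' else langs + ', '
    if ctx: ctx = ', ' + ctx
    return "# (''" + ln + pos + ctx + "'') " + defn

print(prologue_line('English', 'noun', 'nautical',
                    'A strong tackle used to hoist an anchor to the [[cathead]] of a ship.'))
# -> # (''noun, nautical'') A strong tackle used to hoist an anchor to the [[cathead]] of a ship.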