# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Tbot/code/createflw

"""
Create a simple foreign word entry in the en.wikt.

Append a language section to the entry if the section is not already present.
"""


import wikipedia
import catlib
import sys
import re
import pickle
import socket
from time import time, sleep
import shelve
from mwapi import getwikitext, getedit

from __main__ import cache, logpage, plock

def safe(s):
    """Return pickle.dumps(s) with its first character and last five characters removed.

    NOTE(review): presumably relies on the text (protocol 0) pickle layout of
    Python 2, where the first character is a type marker and the trailer is
    five characters long, to obtain an escaped, printable form of s -- confirm
    before running under another pickle protocol.
    """
    pickled = pickle.dumps(s)
    without_trailer = pickled[:-5]
    return without_trailer[1:]

def log(s):
    """Print the safe() form of s, with leading/trailing quote characters stripped.

    The output lock plock is held while the line is formatted and printed.
    """
    quote_chars = "'" + '"'
    with plock:
        # single parenthesised argument: prints the same thing as the print statement
        print(safe(s).strip(quote_chars))

# entries we've already seen that exist [not looking at sections yet]:
# NOTE(review): nothing in this file reads or adds to this set (the only
# reference is a commented-out Exists.add(flw) at the end of createFLentry);
# check importers before removing it.

Exists = set()

# Pre-compiled regexes used by createFLentry to mine the FL wikt page text
# and to clean up glosses.

# File links in the default (English-named) namespaces only; group 1 is the
# file name, i.e. everything after "image:" / "media:" up to the first | or ].
# Both are case-insensitive.
reimage = re.compile(r'\[\[image:(.*?)[\|\]]', re.I)
reaudio = re.compile(r'\[\[media:(.*?)[\|\]]', re.I)
# catch image by the |thumb| parameter ;-)
# group 1 is the link target before the first |, namespace prefix included
rethumb = re.compile(r'\[\[(.*?)\|[^\]]*thumb[\|\]]')
# image by .jpg or .png: group 1 is the file name, group 2 the extension;
# case-sensitive, so upper-case extensions are not matched
rejpg = re.compile(r'[\|\{\[=]([^\|\{\[=]*?\.(jpg|png))[\|\]}]')
# and perhaps an ogg file in a template, as in en.wikt?
# group 1 is the file name; it must be followed by | or }
reogg = re.compile(r'[\|\{\[=]([^\|\{\[=]*?\.ogg)[\|\}]')

# IPA string: the first /.../ or [...] delimited run following "IPA" on the
# same line (group 1 includes the delimiters)
reIPA = re.compile(r'IPA.*?([/\[][^\{\}\|\[\]]+?[/\]])')
# first argument of an {{IPA|...}} template (group 1, no delimiters added)
reIPAt = re.compile(r'\{\{IPA\|([^\{\}\|]+?)[\}\|]')

# fix glosses, context at start, (1) sense number at end should be removed, each should be subbed with space
# regloss1: leading ''(context)''   regloss2: leading (''context'')
# regloss3: trailing (N) sense number
regloss1 = re.compile(r"^''\(.*?\)''")
regloss2 = re.compile(r"^\(''.*?''\)")
regloss3 = re.compile(r"\(\d+\)$")

# a {{jump|...}} template anywhere in the gloss; also replaced by a space
rejump = re.compile(r'\{\{jump\|[^}]*}}')

# need only do once on load
# site is the wiki entries are written to; csite (Commons) is where borrowed
# image and audio file names are checked for existence.

site = wikipedia.getSite("en", "wiktionary")
csite = wikipedia.getSite("commons", "commons")

# Per-language regex prefix that opens a translation-table line on that
# language's wikt; languages not listed here use a leading "*".
# Written as a dict literal so that the "is" key (a Python keyword, hence
# unusable as a dict() keyword argument) sits with the others.
Tlist = {
    'ru': r'\s*\|en=',
    'uk': r'\s*\|en=',
    'nl': r':?\*',
    'sq': r'<br>\{\{en}}',
    # careful here, next char is | which must match \W
    'ga': r'\{\{aistr\|en',
    'lt': r'\{\{env1}}',
    'yi': r'\|EN=',
    'tr': r':?\*\{\{en}}:',
    'mn': r':\*\{\{en}}:',
    'is': r'\|en=',
}

# Wikipedia-link templates, keyed by language code.
# NOTE(review): the hu value ends in a single "}" (the original carried the
# remark "one arg?"); it is kept exactly as found -- confirm it is intended.
Wlist = {
    'cs': "{{Wikipedie}}",
    'hu': "{{wp1}",
    'vi': "{{-info-}}",
}

# Pronunciation template names that carry IPA, keyed by language code.
# The names are spliced into a regex, so use . for diacritics etc.
Plist = {
    'de': "Lautschrift",
    'fr': "pron",
}

# File names that turn up in page furniture rather than in the entry itself
# (e.g. the first two on pt.wikt); never borrowed as the entry image.
Istops = set((
    'LuisdeCamoes4.jpg',
    'Os Lusadas.jpg',
    'Wikipedia.png',
))

# running count of cache entries written by createFLentry; every 20th write
# syncs the cache
cis = 0

def createFLentry(flw, lang, lc, pos, title, gloss, mod):
    """Create, or append a language section to, the en.wikt entry for flw.

    The entry is derived from the word's page on the lc Wiktionary: that page
    must exist and mention title on a translation-style line.  An image, an
    audio file, an IPA string and a Wikipedia link are borrowed from the FL
    page when they can be found.

    Parameters:
      flw   -- the foreign-language word; page title on both wikis
      lang  -- language name, used for the ==lang== section header
      lc    -- language code (FL wikt site code and interwiki prefix)
      pos   -- part-of-speech header text, e.g. "Noun" or "Proper noun"
      title -- title of the English entry the translation came from
      gloss -- gloss text; leading context labels, trailing sense numbers
               and {{jump}} templates are stripped before use
      mod   -- dict of optional attributes; the keys read here are
               alt, tra, g, g2, g3 and scr

    Returns True when flw == title, when the FL wikt page exists (whether or
    not an entry was written: FL redirect, title or translation line not
    found, section already present, redirect left alone, unusable gloss) and
    after a successful save.  Per the inline remarks the caller takes True to
    mean the translation can be converted to t+.  Returns False when the word
    was checked within the last 110 days, when the FL page is missing or
    could not be read, and when saving failed.

    Exceptions other than NoPage / IsRedirectPage raised while reading the
    local page are not caught here.
    """
    global cis

    # for now, don't add to the same page (would cause edit conflict anyway?)
    if flw == title:
        # doesn't matter because not called with title == flw and return value used (see tbot.py)
        return True

    # check cache
    # records last time we tried this word, don't try again for 110 days
    # may need to disable sometimes for debugging!
    retry_after = 110 * 24 * 3600
    ckey = lc + ':' + flw
    if ckey in cache and cache[ckey] > time() - retry_after:
        return False
    cache[ckey] = time()  # assume we will complete check now ...
    cis += 1
    if cis % 20 == 0: cache.sync()

    log("createFL %s: %s[%s] %s, %s (%s)" % (flw, lang, lc, pos, title, gloss))

    # get the FL.wikt page

    # fix codes WMF hasn't yet (or has, but we still don't have set correctly :-)
    # (no yue wikt as yet, hopefully will be created as yue, not zh-yue as in pedia)
    zlc = {'nb': 'no', 'cmn': 'zh', 'nan': 'zh-min-nan'}.get(lc, lc)

    try:
        flsite = wikipedia.getSite(zlc, "wiktionary")
        flpage = wikipedia.Page(flsite, flw)
        fltext = getwikitext(flpage)
        if fltext:
            with plock: print("FL page exists ...")
    except wikipedia.NoPage:
        with plock: print("page not in FL wikt")
        return False
    except wikipedia.IsRedirectPage:
        with plock: print("FL wikt entry is a redirect")
        return True  # can change to t+
    except KeyboardInterrupt:
        raise
    except Exception:
        with plock: print("some exception getting page from FL wikt")
        return False
    if not fltext:
        with plock: print("page not in FL wikt")
        return False

    # see if English word in FL page, presumably as a translation

    if title not in fltext:
        with plock: print("FL wikt page does not contain title")
        return True  # we want to insert t+ template, even though not adding entry

    # nl.wikt uses ":*", will be other variations,
    # ru.wikt uses |en= ... etc etc:
    tpre = Tlist.get(lc, r'\*')
    retrans = re.compile(r'^' + tpre + r'.*\W' + re.escape(title) + r'(\W|$)', re.M)

    # look for a line that may be a trans line, with title surrounded by non-word characters
    mo = retrans.search(fltext)
    if not mo:
        with plock: print("title not in translation line?")
        logpage.add("[[%s]] entry [[:%s:%s]] exists, pattern not matched" % (title, lc, flw))
        return True  # we want to insert t+ template, even though not adding entry

    # truncate fltext at that line, so we don't get extra stuff from following
    # sections; the full text is kept for the 'pedia link check below
    fltextall = fltext
    fltext = fltext[:mo.start()]

    # now reconfirm local existence and section absent, get text

    seealso = ''
    addc = 'created %s entry ' % lang
    try:
        log("getting local page %s" % flw)
        page = wikipedia.Page(site, flw)
        text = getedit(page)
        # check language section ...
        if re.search(r'^==\s*\[*' + re.escape(lang) + r'\]*\s*==', text, re.M):
            log("page %s and section %s already exists" % (flw, lang))
            return True  # meaning there is a page and section there now, so convert to t+

        # special case until Norwegian and Nynorsk are sorted out:
        # any "==Norwegian ..." header counts as the section being present
        if lang == "Norwegian" and '==Norwegian ' in text:
            log("page %s and some Norwegian section already exists" % flw)
            return True  # meaning there is a page and section there now, so convert to t+

        # another temporary special case: a Serbo-Croatian section stands in
        # for Croatian, Bosnian and Serbian
        if lang in ['Croatian', 'Bosnian', 'Serbian'] and '==Serbo-' in text:
            log("page %s and some Serbo- section already exists" % flw)
            return True  # meaning there is a page and section there now, so convert to t+

        addc = 'added %s section ' % lang
    except wikipedia.NoPage:
        # usual case when entry is new
        text = ''
    except wikipedia.IsRedirectPage:
        # overwrite a redirect if present
        text = ''
        addc = 'replaced redirect with %s entry ' % lang
        seealso = page.getRedirectTarget()
        # limit to case redirects, simple case for now (so we don't "fix" Hebrew)
        if flw.lower() != seealso.lower():
            log("page %s is a redirect to %s, not replaced" % (flw, seealso))
            return True

    # see if we can "borrow" image or audio

    image = ''
    mo = reimage.search(fltext)
    if not mo and '|thumb|' in fltext: mo = rethumb.search(fltext)
    if not mo: mo = rejpg.search(fltext)
    if mo:
        img = mo.group(1)
        # drop a namespace prefix, whatever the local name for it is
        if ':' in img: img = img.split(':')[1]
        if img and img not in Istops:
            log("found image: %s" % img)
            ipage = wikipedia.Page(csite, "Image:" + img)
            try:
                # only fetched to prove the file exists on commons
                getwikitext(ipage)
                image = '[[Image:%s|thumb|%s]]\n' % (img, flw)
                with plock: print("found on commons")
            except wikipedia.NoPage:
                with plock: print("not found on commons")
            except Exception:
                with plock: print("other exception looking for commons image")

    audio = ''
    mo = reaudio.search(fltext)
    if not mo: mo = reogg.search(fltext)
    if mo:
        aud = mo.group(1)
        if ':' in aud: aud = aud.split(':')[1]
        # NOTE(review): only the first two characters are compared, so a
        # three-letter lc can never match -- confirm that is intended.
        if aud[0:2].lower() != lc:
            log("audio file name %s does not match language %s" % (aud, lc))
            aud = ''
        if aud:
            log("found audio: %s" % aud)
            apage = wikipedia.Page(csite, "Image:" + aud)
            try:
                getwikitext(apage)
                audio = '* {{audio|%s|%s}}\n' % (aud, flw)
                with plock: print("found on commons")
            except wikipedia.NoPage:
                with plock: print("not found on commons")
            except Exception:
                with plock: print("other exception looking for commons audio")

    ipa = ''
    # NOTE(review): the loop bodies were absent from the reviewed source;
    # collecting every match into the set is the reconstruction -- confirm.
    ipas = set()  # so repeats don't bother us
    ipas.update(reIPA.findall(fltext))
    ipas.update(reIPAt.findall(fltext))
    if lc in Plist:
        rp = re.compile(r'\{\{' + Plist[lc] + r'\|(.*?)\}\}')
        ipas.update(rp.findall(fltext))
    if len(ipas) == 1:
        i = ipas.pop().strip()
        # normalise delimiters: keep [..] if given, otherwise use /../
        if i.startswith('/'): i = '/' + i.strip('[] /') + '/'
        elif i.startswith('['): i = '[' + i.strip('[] /') + ']'
        elif i: i = '/' + i.strip('[] /') + '/'
        # empty or placeholder transcriptions are not worth copying
        if i in ('//', '[]', '/.../', '[...]'): i = ''
        if i:
            ipa = "* {{IPA|%s|lang=%s}}\n" % (i, lc)
            log("found IPA %s" % i)
    elif len(ipas) > 1:
        with plock: print("more than one IPA?")

    if audio or ipa:
        pron = '\n===Pronunciation===\n' + ipa + audio
    else:
        pron = ''

    # 'pedia link? look at all original text; often follow trans table

    wplink = ''
    wpmarks = ["{{wikipedia}}", "{{wikipedia|" + flw + '}}']
    if lc in Wlist:
        wpmarks += [Wlist[lc], Wlist[lc][:-2] + '|' + flw + '}}']
    if any(mark in fltextall for mark in wpmarks):
        wplink = '{{wikipedia|lang=%s}}\n' % lc
        with plock: print("added wikipedia link")

    # set up additional infl params from attribute dict:
    aip = ''
    for key, param in (('alt', 'head'), ('tra', 'tr'), ('g', 'g'),
                       ('g2', 'g2'), ('g3', 'g3'), ('scr', 'sc')):
        if key in mod and mod[key]:
            aip += '|' + param + '=' + mod[key]

    gwas = gloss
    gloss = gloss.strip()
    for rx in (regloss1, regloss2, regloss3, rejump):
        gloss = rx.sub(' ', gloss)
    gloss = gloss.strip()
    if not gloss:
        log("nothing left to gloss ...")
        return True  # as FL wikt page does exist
    # decap gloss (some people insist on capitalizing it, which is wrong) and fix, this is almost always right:
    for cap in ('Of ', 'Country ', 'Person '):
        if gloss.startswith(cap): gloss = cap.lower() + gloss[len(cap):]
    gl = gloss.lower()
    if "translation" in gl:
        log("word 'translation' in gloss, skipped")
        return True  # FL wikt page exists
    if gl[1:] != gloss[1:]: gl = gloss       # caps in string after first, so probably okay
    if gloss.startswith(title): gl = gloss   # Proper noun, e.g. "French language"
    if gl != gwas:
        log("gloss changed %s -> %s" % (gwas, gl))

    # change Proper noun to Noun if lower case; usually the right answer
    if pos == "Proper noun" and flw[0:1].islower():
        log("changed Proper noun to Noun")
        pos = "Noun"

    # build the new language section
    # NOTE(review): the reviewed source had six placeholders for these
    # thirteen arguments; the "%s%s%s", "===%s===" and "{{infl|...}}" lines
    # are reconstructed from the argument order -- confirm the exact layout.
    section = """==%s==
%s%s%s
===%s===
{{infl|%s|%s%s}}

# [[%s]] (%s)

{{tbot entry|%s|%s|{{subst:CURRENTYEAR}}|{{subst:CURRENTMONTHNAME}}|%s}}

""" % (lang, wplink, image, pron, pos, lc, pos.lower(), aip, title, gl, lang, title, lc)

    # other special things (no reason not to ;-)
    # applied to the new section only, so {{tbot entries already on the page
    # in other language sections are left untouched
    if lc == 'fr' and pos == 'Verb':
        section = section.replace("{{tbot", "{{rfinfl|type=conjugation|lang=fr}}\n{{tbot")
    # [ others as desired ]

    # add to or create entry text:
    if text: text += '\n\n----\n{{rfc-auto|sort languages}}\n'
    text += section

    # add interwiki, let AutoFormat and Interwicket sort things as needed
    iw = '[[%s:%s]]' % (lc, flw)
    if iw not in text: text += iw + '\n'

    # if overwriting redirect, add see
    if seealso: text = '{{also|' + seealso + '}}\n' + text

    try:
        with plock:
            page.put(text,
                     comment=addc + "from translation at [[%s]] and [[:%s:%s]]" % (title, lc, flw),
                     minorEdit=False)
    except wikipedia.PageNotSaved:
        with plock: print("failed to save page")
        return False
    except socket.timeout:
        with plock: print("socket timeout, maybe not saving page")
        return False
    except socket.error:
        with plock: print("socket error, maybe not saving page")
        return False
    except Exception as e:
        with plock: print("some exception saving page " + repr(e))
        return False

    # Exists.add(flw)
    return True