User:Visviva/Tracking/Code
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Functions for use in generating lists of words lacking Wiktionary coverage.
The only function normally called directly is aggregate(path).

Downloaders are not included. The assumption is that the source files are
already in place in a dedicated directory on the hard drive
(e.g. "C:\Feedme\HaroldStarr_2009-01-01"). It is also assumed that the
working list of English titles, named "en_titles.txt", is already present
in the script directory. A minimal usage sketch appears at the end of this
module.

This code is currently quite rough and is being debugged only gradually,
through painful experience. Use at your own risk.
'''
import datetime
import locale
import os
import re
import time
from copy import copy
from htmlentitydefs import name2codepoint
punctuation="!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~’”“"
scriptdir="C:\Code"
#Directory for central storage of shortcuts to candidate lists:
actiondir=""
#Currently rigged for Windows only.
class ufo: #Thanks to Robert Ullmann for this bit of simple magic.
def __init__(self, **k):
for a in k: setattr(self, a, k[a])
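# Container for one source document: its metadata fields, the regex patterns
# used to extract them, and the cleaned article text.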
class sourceobject:
file=""
sourcename=""
sourcey=""
sourcegroup=""
convertentities=True
enforce_paragraphs=False
encoding="iso-8859-1"
extracted=False
pagefromcite=False
format="html"
sourcetype="news"
year=str(datetime.date.today().year)
date=datetime.date.today().strftime("%B %d").replace(" 0"," ")
day=""
authormatchers=[]
datematchers=[]
doimatchers=False
pagematchers=False
sourcematchers=False
titlematchers=[]
urlmatchers=["\<url\>(.*?)\<\/url\>"]
volmatchers=False
issuematchers=False
includers=False
replacers=False
author=""
page=""
pages=""
text=""
title=""
toctext=""
url=""
volume=""
doi=""
issue=""
urlcore=""
headertemplate=""
getstring="\<[pP][^\>]*\>([\s\S]*?)\<\/[pP]\>"
goodtags=["sub","sup","i","em"]
authorcleaner=False
datecleaner=""
titlecleaner=""
def textprep(self):
self.text=unicode(self.text,self.encoding,errors="ignore")
if self.convertentities:
self.text=unescape(self.text).encode("utf-8")
else:
self.text=self.text.encode("utf-8")
def getdatum(self,matchlist):
datum=""
if matchlist:
for m in matchlist:
if re.search(m,self.text):
datum=re.search(m,self.text).group(1)
break
return datum
def getinfo(self):
self.getauthor()
self.getdate()
self.getdoi()
self.getissue()
self.getpage()
self.getsource()
self.gettitle()
self.geturl()
self.getvolume()
def getauthor(self):
self.author=self.getdatum(self.authormatchers)
self.authorclean()
def getdate(self):
self.date=self.getdatum(self.datematchers)
self.dateclean()
def getdoi(self):
self.doi=self.getdatum(self.doimatchers)
def getissue(self):
if self.issuematchers:
self.issue=self.getdatum(self.issuematchers)
def getpage(self):
if self.pagematchers:
self.page=self.getdatum(self.pagematchers)
        if self.toctext and self.sourcegroup=="Springer":
pagematch=re.search(re.escape(self.title)+"[\s\S]+?\ \;(.*)",self.toctext)
if pagematch:
self.pages=pagematch.group(1).strip()
elif self.sourcegroup=="Science":
if "-" in self.page:
endpoints=self.page.split("-")
self.pages=endpoints[0].strip()+"-"+endpoints[1].strip()
self.page=""
else:
self.page=self.page.strip()
def getsource(self):
if self.sourcematchers: self.sourcename=self.getdatum(self.sourcematchers)
def gettitle(self):
self.title=self.getdatum(self.titlematchers)
self.titleclean()
def geturl(self):
self.url=self.getdatum(self.urlmatchers)
if self.urlcore:
self.url=self.urlcore.replace("{{{match}}}",self.url)
def getvolume(self):
if self.volmatchers:
self.volume=self.getdatum(self.volmatchers)
def authorclean(self):
self.author=self.author.strip()
self.author=re.sub('\<sup\>.*?\<\/sup\>','',self.author)
self.author=re.sub("\<[^\>]+\>","",self.author)
        if self.sourcename and self.sourcename in self.author: self.author=""
        elif self.sourcey and self.sourcey in self.author: self.author=""
if self.authorcleaner: exec(self.authorcleaner)
if self.sourcegroup=="Guardian":
self.author=self.author.split(", ")[0].split(" in ")[0].strip()
elif self.sourcegroup=="Science":
authors=self.author.split(";")
authorparts=[]
if len(authors)>2:
authorparts=authors[0].split(",")
if len(authorparts)>1:
self.author=authorparts[1].strip()+" "+authorparts[0].strip()+" et al."
else:
self.author=authorparts[0].strip()
elif len(authors)==1 and len(authors[0].split(","))>2: #Nature uses commas only, no semicolons
authors=self.author.split(",")
if len(authors)>2:
self.author=authors[0].strip()+" et al."
elif len(authors)==2:
self.author=authors[0].strip()+" & "+authors[1].strip()
else:
try: self.author=authors[0].strip()
except IndexError:
self.author=""
elif len(authors)==2:
authorparts1=authors[0].split(",")
authorparts2=authors[1].split(",")
self.author=authorparts1[1]+" "+authorparts1[0]+" & "+authorparts2[1]+" "+authorparts2[0]
else:
try: authorparts=authors[0].split(",")
except IndexError:
self.author=""
if len(authorparts)>1:
self.author=authorparts[1].strip()+" "+authorparts[0].strip()
else:
try: self.author=authors[0].strip()
except IndexError:
self.author=""
if self.sourcename=="New York Times":
self.author=self.author.title()
def dateclean(self):
self.date=re.sub("\<.*?\>","",self.date)
# if self.datecleaner: exec(self.datecleaner)
if self.sourcename=="New York Times":
dateparts=self.date.split(",")
try: self.year=dateparts[1].strip()
except IndexError: pass
self.date=dateparts[0].strip()
elif self.sourcegroup=="Guardian":
try: parsedate=time.strptime(self.date,"%Y_%m_%d")
except ValueError:
parsedate=""
if parsedate:
self.date=time.strftime("%B %d",parsedate)
self.year=time.strftime("%Y", parsedate)
elif self.sourcename=="Toronto Star":
dateparts=self.date.split(",")
self.date=dateparts[0].replace("Jan","January").replace("Feb","February").replace("Mar","March").replace("Apr","April").replace("Jun","June").replace("Jul","July").replace("Aug","August").replace("Sep","September").replace("Oct","October").replace("Nov","November").replace("Dec","December").strip()
self.year=dateparts[1].strip()
elif self.sourcename=="Herald Sun":
self.year=self.date.split(", ")[1][0:4]
self.date=self.date.split(", ")[0]
elif self.sourcename=="Chicago Reader":
            self.year=self.date.split(", ")[1]
            self.date=self.date.split(", ")[0]
elif self.sourcegroup=="Springer":
rawdate=self.date.strip().split()
self.year=rawdate[2]
self.date=rawdate[1]+" "+rawdate[0]
elif self.sourcegroup=="Science":
try: parsedate=time.strptime(self.date,"%m/%d/%Y")
except ValueError:
try: parsedate=time.strptime(self.date,"%Y-%m-%d")
except ValueError:
parsedate=""
if parsedate:
self.date=time.strftime("%B %d",parsedate)
self.year=time.strftime("%Y", parsedate)
self.date=self.date.replace(" 0"," ")
def textclean(self):
if self.includers:
cleantext=re.search(self.includers,self.text)
if cleantext:
self.text=cleantext.group(1)
else:
self.text=""
if self.replacers:
for r in self.replacers:
self.text=re.sub(r,self.replacers[r],self.text)
self.text=re.sub("\<[\/]*i\>","''",self.text)
def titleclean(self):
for g in self.goodtags:
self.title=re.sub("\<"+g+".*?\>","&&!g",self.title)
self.title=re.sub("\<\/"+g+".*?\>","!&&g",self.title)
self.title=re.sub("\<[^\>]+\>","",self.title)
for g in self.goodtags:
self.title=re.sub("\&\&\!"+g,"<"+g+">",self.title)
self.title=re.sub("\!\&\&"+g,"</"+g+">",self.title)
if self.titlecleaner: exec(self.titlecleaner)
def getsourcedata(sourcey): # under construction
attributes={}
journals=["Science","Nature","Notices of the American Mathematical Society","Lancet","Erkenntnis","Philosophical Studies","Journal of Pragmatics","Archives of Sexual Behavior"]
aliases={
"G":"The Guardian",
"O":"The Observer",
"Observer":"The Observer",
"Guardian":"The Guardian",
"NYT":"New York Times",
"Reader":"Chicago Reader",
"HS":"Herald Sun",
"Star":"Toronto Star",
"NAMS":"Notices of the American Mathematical Society",
"CT": "Calcutta Telegraph",
"PEH":"Port Elizabeth Herald",
"AL":"Applied Linguistics",
"JPrag":"Journal of Pragmatics",
"PhilStud":"Philosophical Studies",
"ASB":"Archives of Sexual Behavior"
}
sourcegroups={
"The Guardian":"Guardian",
"The Observer":"Guardian",
"Nature":"Science",
"Erkenntnis":"Springer",
"Philosophical Studies":"Springer",
"Archives of Sexual Behavior":"Springer",
}
attributes["headertemplate"]={
"New York Times":"NYT_header",
"The Observer":"Observer header",
"The Guardian":"Guardian header"
}
attributes["authormatchers"]={
"New York Times": [
"title=\"More Articles by (.*?)\"",
"\<meta\s+name=\"byl\"\s+content=\"By\s+([^\"]*)"],
"Guardian":["\<li class=\"byline\"\>([\s\S]*?)\<\/li\>"],
"Herald Sun":["\<p class=\"author\"\>(.*?)\<\/p\>"],
"Toronto Star":[
"\<span id=.*?___Author1__\" class=\"articleAuthor\">([^<]*)",
'\<p class=\"authorByline\"\>([\s\S]*?)\<\/p'],
"Chicago Reader":['\<meta name="author" content="(.*?)"'],
"Science":["\<.*? name=\"citation_authors\" content=\"(.*)\""],
"Springer":['\<p class=\"AuthorGroup\"\>(.*)\<\/p'],
}
attributes["datematchers"]={
"New York Times": ["\<meta\s+name=\"DISPLAYDATE\"\s+content=\"([^\"]*)"],
"Guardian": ["cv\.c7\=\"(.*?)\"\;"],
"Herald Sun": ["\<p class=\"published-date\">(.*?)\<\/p\>"],
"Toronto Star": [
"\<span style=\"text-transform:capitalize;\"\> (.*?, \d+) .*",
'\<span class=\"date\">([\s\S]*?)\<\/span'],
"Chicago Reader": ['\<meta name="date" content="(.*?)"'],
"Science":["\<.*? name=\"citation_date\" content=\"(.*)\""],
"Springer":['\<strong\>Published online\: \<\/strong\>([^\<]*)'],
}
attributes["pagematchers"]={
"Nature":[
"pageNumbers=[p]+([\d\-]+)",
"\>[\d]+\<\/span\>,[\s\S]+?([\d\-]*)\n",
"\/i\>[\s]*\<b\>[\d]+\<\/b\>,[\s]*([\d\-]+)[\s]*\("],
"New York Times": ["\nPage ([\S]*)\r"],
"Science":["Home\<\/a\> .*? [\s\S]+?[p]+\.([\-\t\d\s\n\r]*[\d]+)\n"]
}
attributes["sourcematchers"]={ # Not relevant for most cases
"Springer":['\<tbody\>[\s\S]*?\<tr class=\"header\"\>[\s\S]*?\<td rowspan=\"1\" colspan=\"1\"\>([^\<]*)'],
}
attributes["titlematchers"]={
"New York Times": ["<meta\s+name=\"hdl\"\s+content=\"(.*?)\""],
"Guardian": ["\<h1\>(.*?)\<\/h1\>"],
"Herald Sun": ["\<title\>(.*?)\|"],
"Toronto Star":[
"\<span id=.*?___Title__\" class=\"headlineArticle\">([^<]*)",
"\<h4\>(.*?)\<\/h4"],
"Chicago Reader":[
'\<meta name="headline" content="(.*?)"\>',
'\<meta name="storytype" content="(.*?)"\>'],
"Science":["\<.*? name=\"citation_title\" content=\"(.*)\""],
"Springer":['\<a name=\"title\"\>\<\/a\>(.*)']
}
attributes["urlmatchers"]={ #Backup url-matching if the stamp is absent
"New York Times": ["return encodeURIComponent\(\'(.*?\.html)\'\)\;"],
"Guardian": ["\<url\>([^\<]*)"],
"Herald Sun":[re.escape("http://digg.com/submit?phase=2&url=")+"(.*?)\&\;"],
"Toronto Star":["onclick=\"addthis_url = '(.*?)'"],
"Chicago Reader":["\<title\>Reader Archive--Extract\: (.*?)\<"],
"Science": ["\<.*? name=\"citation_abstract_html_url\" content=\"(.*)\"", "\<.*? name=\"citation_.*?_url\" content=\"(.*)\""],
}
attributes["issuematchers"]={
"Science":["\<.*? name=\"citation_issue\" content=\"(.*?)\""]
}
attributes["doimatchers"]={
"Science":[
"\<.*? name=\"citation_doi\" content=\"doi\:(.*?)\"",
"\<.*? name=\"citation_doi\" content=\"(.*?)\""]
}
attributes["volmatchers"]={
"Science":["\<.*? name=\"citation_volume\" content=\"(.*?)\""]
}
attributes["replacers"]={ # Text to switch or remove before any extraction
"New York Times": {"\<person[^\>]*?\>":""},
"Herald Sun":{"&squo;":"'", "\>\>.*?\>\>":""},
"Science": {"\|\[":"&","\]\|":";"},
}
attributes["includers"]={ # Article text
"Guardian":'\<div id=\"article-wrapper\"\>([\s\S]*?)\<\/div\>',
"Herald Sun": '\<div class=\"btm20\"\>([\s\S]*?)\<\/div\>',
"Notices of the American Mathematical Society":"([\s\S]*)Degrees Conferred \n", #Avoid morass of non-delimited text
"Springer":"\<title\>([^\<]+)",
}
attributes["datecleaners"]={ # code to execute to scrub date
"New York Times":'displaydate=datematch.group(1); dateparts=displaydate.split(", "); info.year=dateparts[1]; info.date=dateparts[0]'
}
attributes["titlecleaners"]={
# "Science":"title=str(BeautifulSoup(title,convertEntities=BeautifulSoup.ALL_ENTITIES))"
"Springer":"\<[\/]*i[^\>]*?\>"
}
attributes["getstring"]={ #non-overlapping regex to find the paragraphs of actual text
"Nature":"(?i)(.*)\<P",
"Science":"(?i)(.*)\<P",
"Chicago Reader":"\<img.*?\>([\s\S]*?)\<br",
}
attributes["encoding"]={
"New York Times": "windows-1252",
"Guardian":"windows-1252",
"Toronto Star": "utf-8"
}
attributes["authorsinpedia"]={
"Chicago Reader": ["Cecil Adams","Jonathan Rosenbaum"],
"New York Times": ["Maureen Dowd", "David Brooks","William Safire"]
}
attributes["urlcore"]={
"Chicago Reader": "https://securesite.chireader.com/cgi-bin/Archive/abridged2.bat?path="
}
source=aliases.get(sourcey, sourcey)
sourcedata=sourceobject()
sourcedata.sourcename=source
sourcedata.sourcegroup=sourcegroups.get(source, source)
if sourcedata.sourcename in journals:
sourcedata.sourcetype="journal"
sourcedata.sourcey=sourcey
for a in attributes:
if sourcedata.sourcename in attributes[a]:
setattr(sourcedata, a, attributes[a][sourcedata.sourcename])
elif sourcedata.sourcegroup in attributes[a]:
setattr(sourcedata, a, attributes[a][sourcedata.sourcegroup])
if not sourcedata.headertemplate:
        sourcedata.headertemplate=(source+"_header").replace("The ","")
    if "\<url\>(.*?)\<\/url\>" not in sourcedata.urlmatchers:
        sourcedata.urlmatchers=sourcedata.urlmatchers+["\<url\>(.*?)\<\/url\>"] # copy rather than growing the shared class-level list
return sourcedata
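# Main entry point: read every eligible file in the given directory, extract
# citation metadata, collect candidate words not found in en_titles.txt, and
# write them (with formatted quotations) to candidates.txt in that directory.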
def aggregate(path,source="",today=datetime.date.today(),learning=False):
sourcetype="news"
enforce_paragraphs=False
be_suspicious=False
toctext=False
sourcey=source
if today==datetime.date.today():
sourcey,today=getpathdata(path)
if today.day:
year=today.strftime("%Y")
month=today.strftime("%m")
day=today.strftime("%d")
else:
year=today.year
month=today.month
day=""
if source=="":
if sourcey=="G":
source="The Guardian"
elif sourcey=="O":
source="The Observer"
elif sourcey=="NYT":
source="New York Times"
elif sourcey=="Reader":
source="Chicago Reader"
elif sourcey=="HS":
source="Herald Sun"
elif sourcey=="Star":
source="Toronto Star"
elif sourcey=="NAMS":
source="Notices of the American Mathematical Society"
sourcetype="journal"
else:
source=sourcey
header=""
aggregator={}
writefile=open(path+"\\"+"agg.txt","w") # force blank file
writefile.write("")
writefile.close()
blankobject=sourceobject()
files=os.listdir(path)
filedata={}
bookdata={}
skippers=["firstfile.html","agg.txt","agg-log.txt","log.txt","candidates.txt","data.txt","raw.raw","raw.txt","cookies.lwp","toc.txt"]
if "data.txt" in files: # for Gutenberg books etc., where metadata has to be added separately
datafile=open(path+"\\data.txt","r")
rawdata=datafile.read()
sourcetype="book" #default to book
for line in rawdata.split("\n"):
lineparts=line.split(":",1)
try: bookdata[lineparts[0].strip()]=lineparts[1].strip()
except IndexError: continue
datafile.close()
if "toc.txt" in files:
tocfile=open(path+"\\toc.txt","r")
toctext=tocfile.read()
tocfile.close()
for filename in files: #Read text from all eligible files in dir
if "." not in filename: continue
if ".pdf" in filename: continue
if filename in skippers: continue
thisdata=getsourcedata(source)
thisdata.file=filename
thisdata.toctext=toctext
file=open(path+"\\"+filename)
rawtext=file.read()
        if isspringer(rawtext) and not thisdata.sourcename:
thisdata=getsourcedata("Springer")
thisdata.text=rawtext
file.close()
if bookdata:
for b, v in bookdata.items():
setattr(thisdata, b, v)
if source=="New York Times":
thisdata.enforce_paragraphs=True
elif isspringer(rawtext):
thisdata.getsource()
elif source=="Notices of the American Mathematical Society":
thisdata.pagefromcite="Notices\s*of\s*the\s*AMS\s*([0-9]{3}).*\n[\s\S]*?{{{passage}}}"
thisdata.day=day
thisdata.textprep()
thisdata.path=path
thisdata.getinfo()
thisdata.textclean()
if thisdata.headertemplate:
header="{{User:Visviva/"+thisdata.headertemplate+"|"+year+"|"+month+"|"
if thisdata.day:
header+=day
            if today != datetime.date.today():
header=header+"|creation="+datetime.date.today().strftime("%Y-%m-%d")
header=header+"}}"
getthis=thisdata.getstring
if not re.search(getthis,thisdata.text):
print "No paragraphs for "+filename
if not thisdata.enforce_paragraphs:
getthis="([\s\S]+)"
filedata[thisdata.file]=copy(thisdata)
aggregator=getparas(thisdata.text,aggregator,thisdata.file,getthis,thisdata.encoding)
continue
writefile=open(path+"\\"+"agg.txt","w") # Dump all processed text into one aggregated file
for a in aggregator:
writefile.write(a)
continue
writefile.close()
stopwords=getstops()
English=set()
types=set()
lctokens=0
totalwords=0
tokencount=0
uniques=[]
kwix={}
data={}
English=gettitles()
for ag in aggregator:
if aggregator[ag] in filedata:
sentencedata=copy(filedata[aggregator[ag]])
else:
print "No data for "+aggregator[ag]
sentencedata=copy(blankobject)
newuniques,newkwix,newdata,tokencount,lctokens,types=getconcordance(ag,sentencedata,English,uniques,stopwords,tokencount,lctokens,types,be_suspicious)
uniques.extend(newuniques)
kwix.update(newkwix)
data.update(newdata)
continue
uniques.sort()
if learning:
return uniques
ufile=open(path+"\\"+"candidates.txt","w")
ufile.write(header+"\n\n")
ufile.write("{{User:Visviva/wordhunt stats")
ufile.write("\n|tokens="+str(tokencount))
ufile.write("\n|goodtokens="+str(lctokens))
ufile.write("\n|types="+str(len(types)))
ufile.write("\n|newwords="+str(len(uniques))+"\n}}\n\n")
try:
if not today==datetime.date(int(year),int(month),int(day)):
isodate=year+"-"+month+"-"+day
else:
isodate=today.isoformat()
except ValueError:
isodate=year+"-"+month
ufile.write("=="+isodate+"==\n\n")
for u in uniques:
citationtemplate="{{User:Visviva/quote-"+data[u].sourcetype+"-special"
if data[u].sourcetype=="journal":
citationstring="# [["+u+"]]\n#*"+citationtemplate+"|pagetitle="+u+"|year="+data[u].year+"|date="+data[u].date+"|author="+data[u].author+"|title="+data[u].title+"|work="+source+"|doi="+data[u].doi+"|volume="+data[u].volume+"|issue="+data[u].issue
if data[u].url:
citationstring+="|url="+data[u].url
if data[u].pages:
citationstring+="|pages="+data[u].pages
if data[u].page:
citationstring+="|page="+data[u].page
citationstring=citationstring+"\n|passage="+kwix[u].strip()+" }}\n"
else:
if data[u].sourcetype=="book":
citationstring="# [["+u+"]]\n#*{{User:Visviva/quote-book-special|pagetitle="+u
for b in bookdata:
citationstring+="|"+b+"="+bookdata[b]
citationstring+="\n|passage="+kwix[u].strip()+"}}\n"
citationstring=citationstring.replace("{{{file}}}",data[u].file.split(".")[0])
elif "Lancet" in source or data[u].sourcename=="":
citationstring="# [["+u+"]]\n#:''"+kwix[u].strip()+"''\n"
else:
citationstring="# [["+u+"]]\n#*"+citationtemplate+"|pagetitle="+u+"|year="+data[u].year+"|date="+data[u].date+"|author="+data[u].author+"|title="+data[u].title+"|work="+source+"|url="+data[u].url
if data[u].page:
citationstring=citationstring+"|page="+data[u].page
citationstring=citationstring+"\n|passage="+kwix[u].strip()+"}}\n"
ufile.write(citationstring)
continue
ufile.write("\n===Sequestered===\n\n")
ufile.close()
if actiondir:
actionfilename=os.path.split(path)[1]+"_candidates.txt"
meh=os.system("fsutil hardlink create "+actiondir+"\\"+actionfilename+" "+path.replace("\\\\","\\")+"\candidates.txt")
    homework=open(scriptdir+"\\homework.txt","a") # To be put into the daily stopword-acquisition cycle
    homework.write("\n"+path)
    homework.close()
def getparas(rawtext,aggregator,filename,getstring="(.*)",encoding="iso-8859-1"): # grabs paragraphs using either a standard pattern or a custom one
verybadtags=["script","style","object","iframe"]
for v in verybadtags:
rawtext=re.sub("(?i)\<"+v+"[\s\S]*?\<\/"+v+"\>","",rawtext)
rawtext=re.sub("\<\!\-\-[\s\S]*?\-\-\>","",rawtext)
for m in re.finditer(getstring,rawtext):
try: n=m.group(1)
except IndexError: continue
aggregator[n]=filename
continue
return aggregator
def gettitles(): # grabs titles from file
English=set()
listfile=open(scriptdir+"\\en_titles.txt","r")
for l in listfile:
English.add(l.strip())
continue
return English
def getstops(): # grabs stopwords from file
stopfile=open(scriptdir+"\\stop.txt","r")
stopwords=set()
for s in stopfile:
stopwords.add(s.strip())
continue
return stopwords
def badbreak(sentence,next=""): # checks whether the next block is actually part of the same sentence.
sentence=sentence.strip()
next=next.strip()
neverbreaks=["vs.","e.g.", "e. g.","i.e.","i. e.","Mr.","Mrs.","Dr.","Prof.","Ms.","Sen.","Rep.","fig.","figs.","Fig.","Figs."]
for n in neverbreaks:
if sentence.endswith(n) or next.startswith(n):
return True
alwaysbreaks=["“"]
for a in alwaysbreaks:
if next.startswith(a):
return False
elif sentence.endswith(a):
return len(a)
if next=="": return False
try: lastchar=sentence[-1]
except IndexError: return False
if next[0]=="," or next[0]==";" or next[0]==":": return True
try:
if re.match("[a-z]",next):
return True
except IndexError: pass
maths=sentence.split("&&math&&") #Avoid breaking in middle of math expression
if len(maths)>1:
if "&&/math&&" not in maths[-1]:
return True
if lastchar=="." and len(sentence)>2:
if re.match("\d",next):
return True
elif re.match(".*[A-Z][a-z]{0,2}\.",sentence.split(" ")[-1]) or re.match("[A-Z]",sentence[-2]) or re.match("[A-Z][a-z]\.",sentence[-3:]):
return True
return False
def getpathdata(path): #gets date and source alias from name of a directory in the form "<source alias>_<date>"
today=datetime.date.today()
pathparts=path.split("\\")
sourcey=""
dirparts=pathparts[-1].split("_")
if len(dirparts)==2:
sourcey=dirparts[0]
datey=dirparts[1]
if len(datey)==10:
todayraw=time.strptime(datey,"%Y-%m-%d")
today=datetime.date(todayraw.tm_year,todayraw.tm_mon,todayraw.tm_mday)
elif len(datey)==8:
try:
todayraw=time.strptime(datey,"%Y%m%d")
today=datetime.date(todayraw.tm_year,todayraw.tm_mon,todayraw.tm_mday)
except ValueError: pass
elif len(datey)==6:
todayraw=time.strptime(datey,"%Y%m")
today=ufo()
today.year=str(todayraw.tm_year)
today.month="%02d" % todayraw.tm_mon
today.day=""
return sourcey,today
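# Split a block of extracted text into sentences, skip stopwords, known titles
# and suspicious tokens, and return the new candidate words together with their
# keyword-in-context (KWIC) lines and per-word source data.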
def getconcordance(ag="",sentencedata=sourceobject(),English=set(),uniques=[],stopwords=set(),tokencount=0,lctokens=0,types=[],be_suspicious=False):
# ag=ag.decode("utf-8")
ag=re.sub("\<a[^\>]+\>","", ag)
ag=re.sub("\<img .*? alt=\"\$\$(?P<group>[^\"]*?)\$\$\"[^>]*?\>","&&math&&\g<group>&&/math&&",ag) # Math text for Springer
ag=ag.replace(".&&/math&&","&&/math&&.") #Prevent periods from being caught inside <math>
ag=re.sub("\<img[^\>]+\>"," ",ag)
ag=re.sub("[\n\r\t]+"," ",ag) #Trim down whitespace
x=0
closequote=unicode("”",encoding='utf-8')
closequoteparts=""
for x in range(len(closequote)):
closequoteparts+=re.escape(closequote[x].encode("utf-8"))
openquote=unicode("“",encoding='utf-8')
openquoteparts=""
for x in range(len(openquote)):
openquoteparts+=re.escape(openquote[x].encode("utf-8"))
pieces=re.split("([\.\?\!\|]+[\s\"\'\)\]\’"+closequoteparts+openquoteparts+"]*)",ag)
sentences=[]
newuniques=[]
newkwix={}
newdata={}
    x=0 # start from the first split piece (the quote-handling loops above reused x)
    for p in pieces: #Recombine sentences with their trailing separators
# p=p.encode("utf-8")
if x%2:
sentences.append(pieces[x-1]+p)
x+=1
sentences.append(pieces[-1])
x=0
for s in sentences:
rawsentence=s
x+=1
try: next=sentences[x]
except IndexError: next=""
try:
while re.match("[\)\]\’]",next.strip()[0]): #cleanup
s+=next[0]
sentences[x]=next[1:]
next=sentences[x]
except IndexError: pass
while badbreak(s,next):
status=badbreak(s,next)
if type(status)==int:
try:
sentences[x]=s[-status:]+next
s=s[:-status]
except IndexError:
break
next=sentences[x]
continue
s+=next
            try: del sentences[x] # delete by index; remove() could hit an earlier duplicate sentence
            except IndexError: break
try: next=sentences[x]
except IndexError: break
sentence=s
sentence=re.sub("\<[^\>]*\>"," ",sentence) # Throw out any tags in KWIC line
sentence=sentence.replace("&&math&&","<math>")
sentence=sentence.replace("&&/math&&","</math>")
sentence=sentence.replace("|","{{!}}") # Make template-safe
s=" ".join(re.split("[a-z\/\:\.\%\+]*\.[a-z][a-z\s\/\_\?\=\.\%\+]+",s)) # remove web addresses, even if they have spaces
s=" ".join(re.split("\<i.*?\>[\s\S]*?\<\/i\>",s)) # remove anything in italics
s=re.sub("\<em\>.*?\<\/em\>"," ",s)
s=re.sub("\<span style=\"font-style\: italic;\"\>.*?\<\/span\>"," ",s) # especially annoying fancy ones
s=re.sub("\<span class=\"italic\"\>[^\>]*"," ",s)
s=" ".join(re.split("\<[^\>]*\>",s)) # Now, remove all tags
s=" ".join(re.split("\S*@[^A-Z]*",s)) # remove emails, and anything until the next capitalized word
s=" ".join(re.split("\S*\{at\}[^A-Z]*",s)) # remove obfuscated emails
s=" ".join(re.split("\S+[\-\'\’]+\S*",s)) # remove hyphenated and apostrophated words
words=re.split("[\s"+re.escape(punctuation)+"]+",s)
y=0
badwords=["com","org","gov","uk","ca","au","nl","fr"]
for word in words:
word=word.strip()
tokencount+=1
if not word.islower() or word in stopwords or re.search("[^a-z]+",word): # Currently throws out all words with diacritics
y+=1
continue
try:
if words[y+1].strip() in badwords : # Any extra domain names that weren't filtered out above for any reason (spacing etc.)
y+=1
continue
if words[y-1]+" "+word in English or word+" "+words[y+1] in English:
y+=1
continue
except IndexError:
pass
if be_suspicious:
if issuspicious(word,words,y,English):
y+=1
continue
w=word.strip()
lctokens+=1
types.add(w)
if w not in English and w not in uniques and w not in newuniques:
if y>0 and words[y]==w and re.match("[A-Z]{1} ",words[y-1]+" "): # Has the first letter of a sentence been split off?
y+=1
continue
if (words[y-1]+w).lower() in English or words[y-1]+w in English:
y+=1
continue
newuniques.append(w)
if sentencedata.pagefromcite:
pagefromcite=sentencedata.pagefromcite.replace("{{{passage}}}",w)
pagematch=re.search(pagefromcite,sentencedata.text)
if not pagematch:
print "No page for "+word
else:
sentencedata.page=pagematch.group(1)
newkwix[w]=re.sub("[\n\r\t\s]+"," ",sentence.replace(w,"'''"+w+"'''",1).strip())
newdata[w]=copy(sentencedata)
y+=1
continue
continue
return newuniques,newkwix,newdata,tokencount,lctokens,types
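# Heuristic check for words that are probably fragments or run-together forms
# of known English words.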
def issuspicious(word,words,y,English):
try:
if word+words[y+1].strip() in English or words[y-1].strip()+word in English:
return True
for z, letter in enumerate(word):
if word[:z] in English and word[z:] in English:
return True
except IndexError:
pass
return False
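# Compare a directory's freshly generated candidate list with the manually
# reviewed candidates.txt, and add the words that were removed during review
# to the stopword list.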
def learn(path):
print "Learning from "+path+"..."
unchecked=aggregate(path,learning=True)
try: checkedfile=open(path+"\\candidates.txt","r")
except IOError: return False
checkedtext=checkedfile.read()
checkedfile.close()
checked=[]
added=[]
stopfile=open(scriptdir+"\\stop.txt","r")
allthestops=stopfile.read()
stopfile.close()
stoplist=re.split("[\r\n\s]+",allthestops)
for chunk in checkedtext.split("# [["):
if "{" in chunk.split("\n")[0]: continue
checkmatch=re.match("([^\]]*?)\]\]",chunk)
if checkmatch:
checked.append(checkmatch.group(1))
for u in unchecked:
if u in stoplist: continue
if u in checked: continue
stoplist.append(u)
added.append(u)
stoplist.sort()
    logfile=open(scriptdir+"\\stoplog.txt","a")
    logfile.write("\n\n"+str(datetime.date.today())+"\n"+path+"\n"+str(added))
    logfile.close()
stopfile=open(scriptdir+"\\stop.txt","w")
stopfile.write("\n\n")
for s in stoplist:
if not s.strip(): continue
stopfile.write(s+"\n")
stopfile.close()
return added
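# Run learn() on every path queued in homework.txt, then clear the queue.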
def learnfromfile():
file=open(scriptdir+"\\homework.txt","r")
list=file.read()
file.close()
done=set()
for l in list.split("\n"):
path=l.strip()
if path in done:
continue
done.add(path)
learn(path)
blankthis=open(scriptdir+"\\homework.txt","w")
blankthis.write("")
blankthis.close()
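# Rough test for Springer article HTML, keyed to the title anchor those pages contain.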
def isspringer(text):
match=re.search('\<a name=\"title\"\>\<\/a\>',text)
return match is not None
def unescape(text):
#From code by Fredrik Lundh at http://effbot.org/zone/re-sub.htm#unescape-html
# Licensed to the public domain at http://effbot.org/zone/copyright.htm
# Seems to work better than BeautifulSoup for this purpose
def fixup(m):
text = m.group(0)
if text.startswith("&#"):
try:
if text.startswith("&#x"):
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
try:
text = unichr(name2codepoint[text[1:-1]])
except KeyError:
pass
return text
return re.sub("\&\#?\w+\;", fixup, text)
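# A minimal usage sketch, not part of the original workflow: the directory name
# and source alias below are hypothetical examples, and the script still expects
# en_titles.txt and stop.txt to be present in scriptdir as noted in the docstring.
if __name__ == "__main__":
    # Process one dated directory of saved articles (alias "G" = The Guardian,
    # dated 2009-01-01) and write candidates.txt into that same directory.
    aggregate("C:\\Feedme\\G_2009-01-01")
    # After the candidate list has been reviewed by hand, the rejected words
    # can be folded back into the stopword list:
    # learn("C:\\Feedme\\G_2009-01-01")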