User:Visviva/Tracking/Code
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Functions for use in generating lists of words lacking Wiktionary coverage.
The only function normally called directly is aggregate(path).

Downloaders are not included. The assumption is that the source files are
already in place in a dedicated directory on the hard drive
(e.g. "C:\Feedme\HaroldStarr_2009-01-01"). It is also assumed that the
working list of English titles, named "en_titles.txt", is already present
in the script directory. A minimal usage sketch appears at the end of this
module.

This code is currently quite rough and is being debugged only gradually,
through painful experience. Use at your own risk.
'''
import datetime
import locale
import os
import re
import time
from copy import copy
from htmlentitydefs import name2codepoint
punctuation="!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~’”“"
scriptdir="C:\Code"
#Directory for central storage of shortcuts to candidate lists:
actiondir=""
#Currently rigged for Windows only.
class ufo: #Thanks to Robert Ullmann for this bit of simple magic.
def __init__(self, **k):
for a in k: setattr(self, a, k[a])
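# Container for one source document: its metadata fields, the regex patterns
# used to extract them, and the cleaned article text.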
class sourceobject:
file=""
sourcename=""
sourcey=""
sourcegroup=""
convertentities=True
enforce_paragraphs=False
encoding="iso-8859-1"
extracted=False
pagefromcite=False
format="html"
sourcetype="news"
year=str(datetime.date.today().year)
date=datetime.date.today().strftime("%B %d").replace(" 0"," ")
day=""
authormatchers=[]
datematchers=[]
doimatchers=False
pagematchers=False
sourcematchers=False
titlematchers=[]
urlmatchers=["\<url\>(.*?)\<\/url\>"]
volmatchers=False
issuematchers=False
includers=False
replacers=False
author=""
page=""
pages=""
text=""
title=""
toctext=""
url=""
volume=""
doi=""
issue=""
urlcore=""
headertemplate=""
getstring="\<[pP][^\>]*\>([\s\S]*?)\<\/[pP]\>"
goodtags=["sub","sup","i","em"]
authorcleaner=False
datecleaner=""
titlecleaner=""
def textprep(self):
self.text=unicode(self.text,self.encoding,errors="ignore")
if self.convertentities:
self.text=unescape(self.text).encode("utf-8")
else:
self.text=self.text.encode("utf-8")
def getdatum(self,matchlist):
datum=""
if matchlist:
for m in matchlist:
if re.search(m,self.text):
datum=re.search(m,self.text).group(1)
break
return datum
def getinfo(self):
self.getauthor()
self.getdate()
self.getdoi()
self.getissue()
self.getpage()
self.getsource()
self.gettitle()
self.geturl()
self.getvolume()
def getauthor(self):
self.author=self.getdatum(self.authormatchers)
self.authorclean()
def getdate(self):
self.date=self.getdatum(self.datematchers)
self.dateclean()
def getdoi(self):
self.doi=self.getdatum(self.doimatchers)
def getissue(self):
if self.issuematchers:
self.issue=self.getdatum(self.issuematchers)
def getpage(self):
if self.pagematchers:
self.page=self.getdatum(self.pagematchers)
        if self.toctext and self.sourcegroup=="Springer":
pagematch=re.search(re.escape(self.title)+"[\s\S]+?\ \;(.*)",self.toctext)
if pagematch:
self.pages=pagematch.group(1).strip()
elif self.sourcegroup=="Science":
if "-" in self.page:
endpoints=self.page.split("-")
self.pages=endpoints[0].strip()+"-"+endpoints[1].strip()
self.page=""
else:
self.page=self.page.strip()
def getsource(self):
if self.sourcematchers: self.sourcename=self.getdatum(self.sourcematchers)
def gettitle(self):
self.title=self.getdatum(self.titlematchers)
self.titleclean()
def geturl(self):
self.url=self.getdatum(self.urlmatchers)
if self.urlcore:
self.url=self.urlcore.replace("{{{match}}}",self.url)
def getvolume(self):
if self.volmatchers:
self.volume=self.getdatum(self.volmatchers)
def authorclean(self):
self.author=self.author.strip()
self.author=re.sub('\<sup\>.*?\<\/sup\>','',self.author)
self.author=re.sub("\<[^\>]+\>","",self.author)
        if self.sourcename and self.sourcename in self.author: self.author=""
        elif self.sourcey and self.sourcey in self.author: self.author=""
if self.authorcleaner: exec(self.authorcleaner)
if self.sourcegroup=="Guardian":
self.author=self.author.split(", ")[0].split(" in ")[0].strip()
elif self.sourcegroup=="Science":
authors=self.author.split(";")
authorparts=[]
if len(authors)>2:
authorparts=authors[0].split(",")
if len(authorparts)>1:
self.author=authorparts[1].strip()+" "+authorparts[0].strip()+" et al."
else:
self.author=authorparts[0].strip()
elif len(authors)==1 and len(authors[0].split(","))>2: #Nature uses commas only, no semicolons
authors=self.author.split(",")
if len(authors)>2:
self.author=authors[0].strip()+" et al."
elif len(authors)==2:
self.author=authors[0].strip()+" & "+authors[1].strip()
else:
try: self.author=authors[0].strip()
except IndexError:
self.author=""
elif len(authors)==2:
authorparts1=authors[0].split(",")
authorparts2=authors[1].split(",")
self.author=authorparts1[1]+" "+authorparts1[0]+" & "+authorparts2[1]+" "+authorparts2[0]
else:
try: authorparts=authors[0].split(",")
except IndexError:
self.author=""
if len(authorparts)>1:
self.author=authorparts[1].strip()+" "+authorparts[0].strip()
else:
try: self.author=authors[0].strip()
except IndexError:
self.author=""
if self.sourcename=="New York Times":
self.author=self.author.title()
def dateclean(self):
self.date=re.sub("\<.*?\>","",self.date)
# if self.datecleaner: exec(self.datecleaner)
if self.sourcename=="New York Times":
dateparts=self.date.split(",")
try: self.year=dateparts[1].strip()
except IndexError: pass
self.date=dateparts[0].strip()
elif self.sourcegroup=="Guardian":
try: parsedate=time.strptime(self.date,"%Y_%m_%d")
except ValueError:
parsedate=""
if parsedate:
self.date=time.strftime("%B %d",parsedate)
self.year=time.strftime("%Y", parsedate)
elif self.sourcename=="Toronto Star":
dateparts=self.date.split(",")
self.date=dateparts[0].replace("Jan","January").replace("Feb","February").replace("Mar","March").replace("Apr","April").replace("Jun","June").replace("Jul","July").replace("Aug","August").replace("Sep","September").replace("Oct","October").replace("Nov","November").replace("Dec","December").strip()
self.year=dateparts[1].strip()
elif self.sourcename=="Herald Sun":
self.year=self.date.split(", ")[1][0:4]
self.date=self.date.split(", ")[0]
elif self.sourcename=="Chicago Reader":
            self.year=self.date.split(", ")[1]
            self.date=self.date.split(", ")[0]
elif self.sourcegroup=="Springer":
rawdate=self.date.strip().split()
self.year=rawdate[2]
self.date=rawdate[1]+" "+rawdate[0]
elif self.sourcegroup=="Science":
try: parsedate=time.strptime(self.date,"%m/%d/%Y")
except ValueError:
try: parsedate=time.strptime(self.date,"%Y-%m-%d")
except ValueError:
parsedate=""
if parsedate:
self.date=time.strftime("%B %d",parsedate)
self.year=time.strftime("%Y", parsedate)
self.date=self.date.replace(" 0"," ")
def textclean(self):
if self.includers:
cleantext=re.search(self.includers,self.text)
if cleantext:
self.text=cleantext.group(1)
else:
self.text=""
if self.replacers:
for r in self.replacers:
self.text=re.sub(r,self.replacers[r],self.text)
self.text=re.sub("\<[\/]*i\>","''",self.text)
def titleclean(self):
for g in self.goodtags:
self.title=re.sub("\<"+g+".*?\>","&&!g",self.title)
self.title=re.sub("\<\/"+g+".*?\>","!&&g",self.title)
self.title=re.sub("\<[^\>]+\>","",self.title)
for g in self.goodtags:
self.title=re.sub("\&\&\!"+g,"<"+g+">",self.title)
self.title=re.sub("\!\&\&"+g,"</"+g+">",self.title)
if self.titlecleaner: exec(self.titlecleaner)
def getsourcedata(sourcey): # under construction
attributes={}
journals=["Science","Nature","Notices of the American Mathematical Society","Lancet","Erkenntnis","Philosophical Studies","Journal of Pragmatics","Archives of Sexual Behavior"]
aliases={
"G":"The Guardian",
"O":"The Observer",
"Observer":"The Observer",
"Guardian":"The Guardian",
"NYT":"New York Times",
"Reader":"Chicago Reader",
"HS":"Herald Sun",
"Star":"Toronto Star",
"NAMS":"Notices of the American Mathematical Society",
"CT": "Calcutta Telegraph",
"PEH":"Port Elizabeth Herald",
"AL":"Applied Linguistics",
"JPrag":"Journal of Pragmatics",
"PhilStud":"Philosophical Studies",
"ASB":"Archives of Sexual Behavior"
}
sourcegroups={
"The Guardian":"Guardian",
"The Observer":"Guardian",
"Nature":"Science",
"Erkenntnis":"Springer",
"Philosophical Studies":"Springer",
"Archives of Sexual Behavior":"Springer",
}
attributes["headertemplate"]={
"New York Times":"NYT_header",
"The Observer":"Observer header",
"The Guardian":"Guardian header"
}
attributes["authormatchers"]={
"New York Times": [
"title=\"More Articles by (.*?)\"",
"\<meta\s+name=\"byl\"\s+content=\"By\s+([^\"]*)"],
"Guardian":["\<li class=\"byline\"\>([\s\S]*?)\<\/li\>"],
"Herald Sun":["\<p class=\"author\"\>(.*?)\<\/p\>"],
"Toronto Star":[
"\<span id=.*?___Author1__\" class=\"articleAuthor\">([^<]*)",
'\<p class=\"authorByline\"\>([\s\S]*?)\<\/p'],
"Chicago Reader":['\<meta name="author" content="(.*?)"'],
"Science":["\<.*? name=\"citation_authors\" content=\"(.*)\""],
"Springer":['\<p class=\"AuthorGroup\"\>(.*)\<\/p'],
}
attributes["datematchers"]={
"New York Times": ["\<meta\s+name=\"DISPLAYDATE\"\s+content=\"([^\"]*)"],
"Guardian": ["cv\.c7\=\"(.*?)\"\;"],
"Herald Sun": ["\<p class=\"published-date\">(.*?)\<\/p\>"],
"Toronto Star": [
"\<span style=\"text-transform:capitalize;\"\> (.*?, \d+) .*",
'\<span class=\"date\">([\s\S]*?)\<\/span'],
"Chicago Reader": ['\<meta name="date" content="(.*?)"'],
"Science":["\<.*? name=\"citation_date\" content=\"(.*)\""],
"Springer":['\<strong\>Published online\: \<\/strong\>([^\<]*)'],
}
attributes["pagematchers"]={
"Nature":[
"pageNumbers=[p]+([\d\-]+)",
"\>[\d]+\<\/span\>,[\s\S]+?([\d\-]*)\n",
"\/i\>[\s]*\<b\>[\d]+\<\/b\>,[\s]*([\d\-]+)[\s]*\("],
"New York Times": ["\nPage ([\S]*)\r"],
"Science":["Home\<\/a\> .*? [\s\S]+?[p]+\.([\-\t\d\s\n\r]*[\d]+)\n"]
}
attributes["sourcematchers"]={ # Not relevant for most cases
"Springer":['\<tbody\>[\s\S]*?\<tr class=\"header\"\>[\s\S]*?\<td rowspan=\"1\" colspan=\"1\"\>([^\<]*)'],
}
attributes["titlematchers"]={
"New York Times": ["<meta\s+name=\"hdl\"\s+content=\"(.*?)\""],
"Guardian": ["\<h1\>(.*?)\<\/h1\>"],
"Herald Sun": ["\<title\>(.*?)\|"],
"Toronto Star":[
"\<span id=.*?___Title__\" class=\"headlineArticle\">([^<]*)",
"\<h4\>(.*?)\<\/h4"],
"Chicago Reader":[
'\<meta name="headline" content="(.*?)"\>',
'\<meta name="storytype" content="(.*?)"\>'],
"Science":["\<.*? name=\"citation_title\" content=\"(.*)\""],
"Springer":['\<a name=\"title\"\>\<\/a\>(.*)']
}
attributes["urlmatchers"]={ #Backup url-matching if the stamp is absent
"New York Times": ["return encodeURIComponent\(\'(.*?\.html)\'\)\;"],
"Guardian": ["\<url\>([^\<]*)"],
"Herald Sun":[re.escape("http://digg.com/submit?phase=2&url=")+"(.*?)\&\;"],
"Toronto Star":["onclick=\"addthis_url = '(.*?)'"],
"Chicago Reader":["\<title\>Reader Archive--Extract\: (.*?)\<"],
"Science": ["\<.*? name=\"citation_abstract_html_url\" content=\"(.*)\"", "\<.*? name=\"citation_.*?_url\" content=\"(.*)\""],
}
attributes["issuematchers"]={
"Science":["\<.*? name=\"citation_issue\" content=\"(.*?)\""]
}
attributes["doimatchers"]={
"Science":[
"\<.*? name=\"citation_doi\" content=\"doi\:(.*?)\"",
"\<.*? name=\"citation_doi\" content=\"(.*?)\""]
}
attributes["volmatchers"]={
"Science":["\<.*? name=\"citation_volume\" content=\"(.*?)\""]
}
attributes["replacers"]={ # Text to switch or remove before any extraction
"New York Times": {"\<person[^\>]*?\>":""},
"Herald Sun":{"&squo;":"'", "\>\>.*?\>\>":""},
"Science": {"\|\[":"&","\]\|":";"},
}
attributes["includers"]={ # Article text
"Guardian":'\<div id=\"article-wrapper\"\>([\s\S]*?)\<\/div\>',
"Herald Sun": '\<div class=\"btm20\"\>([\s\S]*?)\<\/div\>',
"Notices of the American Mathematical Society":"([\s\S]*)Degrees Conferred \n", #Avoid morass of non-delimited text
"Springer":"\<title\>([^\<]+)",
}
attributes["datecleaners"]={ # code to execute to scrub date
"New York Times":'displaydate=datematch.group(1); dateparts=displaydate.split(", "); info.year=dateparts[1]; info.date=dateparts[0]'
}
attributes["titlecleaners"]={
# "Science":"title=str(BeautifulSoup(title,convertEntities=BeautifulSoup.ALL_ENTITIES))"
"Springer":"\<[\/]*i[^\>]*?\>"
}
attributes["getstring"]={ #non-overlapping regex to find the paragraphs of actual text
"Nature":"(?i)(.*)\<P",
"Science":"(?i)(.*)\<P",
"Chicago Reader":"\<img.*?\>([\s\S]*?)\<br",
}
attributes["encoding"]={
"New York Times": "windows-1252",
"Guardian":"windows-1252",
"Toronto Star": "utf-8"
}
attributes["authorsinpedia"]={
"Chicago Reader": ["Cecil Adams","Jonathan Rosenbaum"],
"New York Times": ["Maureen Dowd", "David Brooks","William Safire"]
}
attributes["urlcore"]={
"Chicago Reader": "https://securesite.chireader.com/cgi-bin/Archive/abridged2.bat?path="
}
source=aliases.get(sourcey, sourcey)
sourcedata=sourceobject()
sourcedata.sourcename=source
sourcedata.sourcegroup=sourcegroups.get(source, source)
if sourcedata.sourcename in journals:
sourcedata.sourcetype="journal"
sourcedata.sourcey=sourcey
for a in attributes:
if sourcedata.sourcename in attributes[a]:
setattr(sourcedata, a, attributes[a][sourcedata.sourcename])
elif sourcedata.sourcegroup in attributes[a]:
setattr(sourcedata, a, attributes[a][sourcedata.sourcegroup])
if not sourcedata.headertemplate:
        sourcedata.headertemplate=(source+"_header").replace("The ","")
    if "\<url\>(.*?)\<\/url\>" not in sourcedata.urlmatchers:
        sourcedata.urlmatchers=sourcedata.urlmatchers+["\<url\>(.*?)\<\/url\>"] # copy rather than growing the shared class-level list
return sourcedata
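# Main entry point: read every eligible file in the given directory, extract
# citation metadata, collect candidate words not found in en_titles.txt, and
# write them (with formatted quotations) to candidates.txt in that directory.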
def aggregate(path,source="",today=datetime.date.today(),learning=False):
sourcetype="news"
enforce_paragraphs=False
be_suspicious=False
toctext=False
sourcey=source
if today==datetime.date.today():
sourcey,today=getpathdata(path)
if today.day:
year=today.strftime("%Y")
month=today.strftime("%m")
day=today.strftime("%d")
else:
year=today.year
month=today.month
day=""
if source=="":
if sourcey=="G":
source="The Guardian"
elif sourcey=="O":
source="The Observer"
elif sourcey=="NYT":
source="New York Times"
elif sourcey=="Reader":
source="Chicago Reader"
elif sourcey=="HS":
source="Herald Sun"
elif sourcey=="Star":
source="Toronto Star"
elif sourcey=="NAMS":
source="Notices of the American Mathematical Society"
sourcetype="journal"
else:
source=sourcey
header=""
aggregator={}
writefile=open(path+"\\"+"agg.txt","w") # force blank file
writefile.write("")
writefile.close()
blankobject=sourceobject()
files=os.listdir(path)
filedata={}
bookdata={}
skippers=["firstfile.html","agg.txt","agg-log.txt","log.txt","candidates.txt","data.txt","raw.raw","raw.txt","cookies.lwp","toc.txt"]
if "data.txt" in files: # for Gutenberg books etc., where metadata has to be added separately
datafile=open(path+"\\data.txt","r")
rawdata=datafile.read()
sourcetype="book" #default to book
for line in rawdata.split("\n"):
lineparts=line.split(":",1)
try: bookdata[lineparts[0].strip()]=lineparts[1].strip()
except IndexError: continue
datafile.close()
if "toc.txt" in files:
tocfile=open(path+"\\toc.txt","r")
toctext=tocfile.read()
tocfile.close()
for filename in files: #Read text from all eligible files in dir
if "." not in filename: continue
if ".pdf" in filename: continue
if filename in skippers: continue
thisdata=getsourcedata(source)
thisdata.file=filename
thisdata.toctext=toctext
file=open(path+"\\"+filename)
rawtext=file.read()
        if isspringer(rawtext) and not thisdata.sourcename:
thisdata=getsourcedata("Springer")
thisdata.text=rawtext
file.close()
if bookdata:
for b, v in bookdata.items():
setattr(thisdata, b, v)
if source=="New York Times":
thisdata.enforce_paragraphs=True
elif isspringer(rawtext):
thisdata.getsource()
elif source=="Notices of the American Mathematical Society":
thisdata.pagefromcite="Notices\s*of\s*the\s*AMS\s*([0-9]{3}).*\n[\s\S]*?{{{passage}}}"
thisdata.day=day
thisdata.textprep()
thisdata.path=path
thisdata.getinfo()
thisdata.textclean()
if thisdata.headertemplate:
header="{{User:Visviva/"+thisdata.headertemplate+"|"+year+"|"+month+"|"
if thisdata.day:
header+=day
            if today != datetime.date.today():
header=header+"|creation="+datetime.date.today().strftime("%Y-%m-%d")
header=header+"}}"
getthis=thisdata.getstring
if not re.search(getthis,thisdata.text):
print "No paragraphs for "+filename
if not thisdata.enforce_paragraphs:
getthis="([\s\S]+)"
filedata[thisdata.file]=copy(thisdata)
aggregator=getparas(thisdata.text,aggregator,thisdata.file,getthis,thisdata.encoding)
continue
writefile=open(path+"\\"+"agg.txt","w") # Dump all processed text into one aggregated file
for a in aggregator:
writefile.write(a)
continue
writefile.close()
stopwords=getstops()
English=set()
types=set()
lctokens=0
totalwords=0
tokencount=0
uniques=[]
kwix={}
data={}
English=gettitles()
for ag in aggregator:
if aggregator[ag] in filedata:
sentencedata=copy(filedata[aggregator[ag]])
else:
print "No data for "+aggregator[ag]
sentencedata=copy(blankobject)
newuniques,newkwix,newdata,tokencount,lctokens,types=getconcordance(ag,sentencedata,English,uniques,stopwords,tokencount,lctokens,types,be_suspicious)
uniques.extend(newuniques)
kwix.update(newkwix)
data.update(newdata)
continue
uniques.sort()
if learning:
return uniques
ufile=open(path+"\\"+"candidates.txt","w")
ufile.write(header+"\n\n")
ufile.write("{{User:Visviva/wordhunt stats")
ufile.write("\n|tokens="+str(tokencount))
ufile.write("\n|goodtokens="+str(lctokens))
ufile.write("\n|types="+str(len(types)))
ufile.write("\n|newwords="+str(len(uniques))+"\n}}\n\n")
try:
if not today==datetime.date(int(year),int(month),int(day)):
isodate=year+"-"+month+"-"+day
else:
isodate=today.isoformat()
except ValueError:
isodate=year+"-"+month
ufile.write("=="+isodate+"==\n\n")
for u in uniques:
citationtemplate="{{User:Visviva/quote-"+data[u].sourcetype+"-special"
if data[u].sourcetype=="journal":
citationstring="# [["+u+"]]\n#*"+citationtemplate+"|pagetitle="+u+"|year="+data[u].year+"|date="+data[u].date+"|author="+data[u].author+"|title="+data[u].title+"|work="+source+"|doi="+data[u].doi+"|volume="+data[u].volume+"|issue="+data[u].issue
if data[u].url:
citationstring+="|url="+data[u].url
if data[u].pages:
citationstring+="|pages="+data[u].pages
if data[u].page:
citationstring+="|page="+data[u].page
citationstring=citationstring+"\n|passage="+kwix[u].strip()+" }}\n"
else:
if data[u].sourcetype=="book":
citationstring="# [["+u+"]]\n#*{{User:Visviva/quote-book-special|pagetitle="+u
for b in bookdata:
citationstring+="|"+b+"="+bookdata[b]
citationstring+="\n|passage="+kwix[u].strip()+"}}\n"
citationstring=citationstring.replace("{{{file}}}",data[u].file.split(".")[0])
elif "Lancet" in source or data[u].sourcename=="":
citationstring="# [["+u+"]]\n#:''"+kwix[u].strip()+"''\n"
else:
citationstring="# [["+u+"]]\n#*"+citationtemplate+"|pagetitle="+u+"|year="+data[u].year+"|date="+data[u].date+"|author="+data[u].author+"|title="+data[u].title+"|work="+source+"|url="+data[u].url
if data[u].page:
citationstring=citationstring+"|page="+data[u].page
citationstring=citationstring+"\n|passage="+kwix[u].strip()+"}}\n"
ufile.write(citationstring)
continue
ufile.write("\n===Sequestered===\n\n")
ufile.close()
if actiondir:
actionfilename=os.path.split(path)[1]+"_candidates.txt"
meh=os.system("fsutil hardlink create "+actiondir+"\\"+actionfilename+" "+path.replace("\\\\","\\")+"\candidates.txt")
    homework=open(scriptdir+"\\homework.txt","a") # To be put into the daily stopword-acquisition cycle
    homework.write("\n"+path)
    homework.close()
def getparas(rawtext,aggregator,filename,getstring="(.*)",encoding="iso-8859-1"): # grabs paragraphs using either a standard pattern or a custom one
verybadtags=["script","style","object","iframe"]
for v in verybadtags:
rawtext=re.sub("(?i)\<"+v+"[\s\S]*?\<\/"+v+"\>","",rawtext)
rawtext=re.sub("\<\!\-\-[\s\S]*?\-\-\>","",rawtext)
for m in re.finditer(getstring,rawtext):
try: n=m.group(1)
except IndexError: continue
aggregator[n]=filename
continue
return aggregator
def gettitles(): # grabs titles from file
English=set()
listfile=open(scriptdir+"\\en_titles.txt","r")
for l in listfile:
English.add(l.strip())
continue
return English
def getstops(): # grabs stopwords from file
stopfile=open(scriptdir+"\\stop.txt","r")
stopwords=set()
for s in stopfile:
stopwords.add(s.strip())
continue
return stopwords
def badbreak(sentence,next=""): # checks whether the next block is actually part of the same sentence.
sentence=sentence.strip()
next=next.strip()
neverbreaks=["vs.","e.g.", "e. g.","i.e.","i. e.","Mr.","Mrs.","Dr.","Prof.","Ms.","Sen.","Rep.","fig.","figs.","Fig.","Figs."]
for n in neverbreaks:
if sentence.endswith(n) or next.startswith(n):
return True
alwaysbreaks=["“"]
for a in alwaysbreaks:
if next.startswith(a):
return False
elif sentence.endswith(a):
return len(a)
if next=="": return False
try: lastchar=sentence[-1]
except IndexError: return False
if next[0]=="," or next[0]==";" or next[0]==":": return True
try:
if re.match("[a-z]",next):
return True
except IndexError: pass
maths=sentence.split("&&math&&") #Avoid breaking in middle of math expression
if len(maths)>1:
if "&&/math&&" not in maths[-1]:
return True
if lastchar=="." and len(sentence)>2:
if re.match("\d",next):
return True
elif re.match(".*[A-Z][a-z]{0,2}\.",sentence.split(" ")[-1]) or re.match("[A-Z]",sentence[-2]) or re.match("[A-Z][a-z]\.",sentence[-3:]):
return True
return False
def getpathdata(path): #gets date and source alias from name of a directory in the form "<source alias>_<date>"
today=datetime.date.today()
pathparts=path.split("\\")
sourcey=""
dirparts=pathparts[-1].split("_")
if len(dirparts)==2:
sourcey=dirparts[0]
datey=dirparts[1]
if len(datey)==10:
todayraw=time.strptime(datey,"%Y-%m-%d")
today=datetime.date(todayraw.tm_year,todayraw.tm_mon,todayraw.tm_mday)
elif len(datey)==8:
try:
todayraw=time.strptime(datey,"%Y%m%d")
today=datetime.date(todayraw.tm_year,todayraw.tm_mon,todayraw.tm_mday)
except ValueError: pass
elif len(datey)==6:
todayraw=time.strptime(datey,"%Y%m")
today=ufo()
today.year=str(todayraw.tm_year)
today.month="%02d" % todayraw.tm_mon
today.day=""
return sourcey,today
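# Split a block of extracted text into sentences, skip stopwords, known titles
# and suspicious tokens, and return the new candidate words together with their
# keyword-in-context (KWIC) lines and per-word source data.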
def getconcordance(ag="",sentencedata=sourceobject(),English=set(),uniques=[],stopwords=set(),tokencount=0,lctokens=0,types=[],be_suspicious=False):
# ag=ag.decode("utf-8")
ag=re.sub("\<a[^\>]+\>","", ag)
ag=re.sub("\<img .*? alt=\"\$\$(?P<group>[^\"]*?)\$\$\"[^>]*?\>","&&math&&\g<group>&&/math&&",ag) # Math text for Springer
ag=ag.replace(".&&/math&&","&&/math&&.") #Prevent periods from being caught inside <math>
ag=re.sub("\<img[^\>]+\>"," ",ag)
ag=re.sub("[\n\r\t]+"," ",ag) #Trim down whitespace
x=0
closequote=unicode("”",encoding='utf-8')
closequoteparts=""
for x in range(len(closequote)):
closequoteparts+=re.escape(closequote[x].encode("utf-8"))
openquote=unicode("“",encoding='utf-8')
openquoteparts=""
for x in range(len(openquote)):
openquoteparts+=re.escape(openquote[x].encode("utf-8"))
pieces=re.split("([\.\?\!\|]+[\s\"\'\)\]\’"+closequoteparts+openquoteparts+"]*)",ag)
sentences=[]
newuniques=[]
newkwix={}
newdata={}
    x=0 # start from the first split piece (the quote-handling loops above reused x)
    for p in pieces: #Recombine sentences with their trailing separators
# p=p.encode("utf-8")
if x%2:
sentences.append(pieces[x-1]+p)
x+=1
sentences.append(pieces[-1])
x=0
for s in sentences:
rawsentence=s
x+=1
try: next=sentences[x]
except IndexError: next=""
try:
while re.match("[\)\]\’]",next.strip()[0]): #cleanup
s+=next[0]
sentences[x]=next[1:]
next=sentences[x]
except IndexError: pass
while badbreak(s,next):
status=badbreak(s,next)
if type(status)==int:
try:
sentences[x]=s[-status:]+next
s=s[:-status]
except IndexError:
break
next=sentences[x]
continue
s+=next
            try: del sentences[x] # delete by index; remove() could hit an earlier duplicate sentence
            except IndexError: break
try: next=sentences[x]
except IndexError: break
sentence=s
sentence=re.sub("\<[^\>]*\>"," ",sentence) # Throw out any tags in KWIC line
sentence=sentence.replace("&&math&&","<math>")
sentence=sentence.replace("&&/math&&","</math>")
sentence=sentence.replace("|","{{!}}") # Make template-safe
s=" ".join(re.split("[a-z\/\:\.\%\+]*\.[a-z][a-z\s\/\_\?\=\.\%\+]+",s)) # remove web addresses, even if they have spaces
s=" ".join(re.split("\<i.*?\>[\s\S]*?\<\/i\>",s)) # remove anything in italics
s=re.sub("\<em\>.*?\<\/em\>"," ",s)
s=re.sub("\<span style=\"font-style\: italic;\"\>.*?\<\/span\>"," ",s) # especially annoying fancy ones
s=re.sub("\<span class=\"italic\"\>[^\>]*"," ",s)
s=" ".join(re.split("\<[^\>]*\>",s)) # Now, remove all tags
s=" ".join(re.split("\S*@[^A-Z]*",s)) # remove emails, and anything until the next capitalized word
s=" ".join(re.split("\S*\{at\}[^A-Z]*",s)) # remove obfuscated emails
s=" ".join(re.split("\S+[\-\'\’]+\S*",s)) # remove hyphenated and apostrophated words
words=re.split("[\s"+re.escape(punctuation)+"]+",s)
y=0
badwords=["com","org","gov","uk","ca","au","nl","fr"]
for word in words:
word=word.strip()
tokencount+=1
if not word.islower() or word in stopwords or re.search("[^a-z]+",word): # Currently throws out all words with diacritics
y+=1
continue
try:
if words[y+1].strip() in badwords : # Any extra domain names that weren't filtered out above for any reason (spacing etc.)
y+=1
continue
if words[y-1]+" "+word in English or word+" "+words[y+1] in English:
y+=1
continue
except IndexError:
pass
if be_suspicious:
if issuspicious(word,words,y,English):
y+=1
continue
w=word.strip()
lctokens+=1
types.add(w)
if w not in English and w not in uniques and w not in newuniques:
if y>0 and words[y]==w and re.match("[A-Z]{1} ",words[y-1]+" "): # Has the first letter of a sentence been split off?
y+=1
continue
if (words[y-1]+w).lower() in English or words[y-1]+w in English:
y+=1
continue
newuniques.append(w)
if sentencedata.pagefromcite:
pagefromcite=sentencedata.pagefromcite.replace("{{{passage}}}",w)
pagematch=re.search(pagefromcite,sentencedata.text)
if not pagematch:
print "No page for "+word
else:
sentencedata.page=pagematch.group(1)
newkwix[w]=re.sub("[\n\r\t\s]+"," ",sentence.replace(w,"'''"+w+"'''",1).strip())
newdata[w]=copy(sentencedata)
y+=1
continue
continue
return newuniques,newkwix,newdata,tokencount,lctokens,types
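# Heuristic check for words that are probably fragments or run-together forms
# of known English words.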
def issuspicious(word,words,y,English):
try:
if word+words[y+1].strip() in English or words[y-1].strip()+word in English:
return True
for z, letter in enumerate(word):
if word[:z] in English and word[z:] in English:
return True
except IndexError:
pass
return False
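# Compare a directory's freshly generated candidate list with the manually
# reviewed candidates.txt, and add the words that were removed during review
# to the stopword list.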
def learn(path):
print "Learning from "+path+"..."
unchecked=aggregate(path,learning=True)
try: checkedfile=open(path+"\\candidates.txt","r")
except IOError: return False
checkedtext=checkedfile.read()
checkedfile.close()
checked=[]
added=[]
stopfile=open(scriptdir+"\\stop.txt","r")
allthestops=stopfile.read()
stopfile.close()
stoplist=re.split("[\r\n\s]+",allthestops)
for chunk in checkedtext.split("# [["):
if "{" in chunk.split("\n")[0]: continue
checkmatch=re.match("([^\]]*?)\]\]",chunk)
if checkmatch:
checked.append(checkmatch.group(1))
for u in unchecked:
if u in stoplist: continue
if u in checked: continue
stoplist.append(u)
added.append(u)
stoplist.sort()
    logfile=open(scriptdir+"\\stoplog.txt","a")
    logfile.write("\n\n"+str(datetime.date.today())+"\n"+path+"\n"+str(added))
    logfile.close()
stopfile=open(scriptdir+"\\stop.txt","w")
stopfile.write("\n\n")
for s in stoplist:
if not s.strip(): continue
stopfile.write(s+"\n")
stopfile.close()
return added
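# Run learn() on every path queued in homework.txt, then clear the queue.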
def learnfromfile():
file=open(scriptdir+"\\homework.txt","r")
list=file.read()
file.close()
done=set()
for l in list.split("\n"):
path=l.strip()
if path in done:
continue
done.add(path)
learn(path)
blankthis=open(scriptdir+"\\homework.txt","w")
blankthis.write("")
blankthis.close()
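# Rough test for Springer article HTML, keyed to the title anchor those pages contain.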
def isspringer(text):
match=re.search('\<a name=\"title\"\>\<\/a\>',text)
return match is not None
def unescape(text):
#From code by Fredrik Lundh at http://effbot.org/zone/re-sub.htm#unescape-html
# Licensed to the public domain at http://effbot.org/zone/copyright.htm
# Seems to work better than BeautifulSoup for this purpose
def fixup(m):
text = m.group(0)
if text.startswith("&#"):
try:
if text.startswith("&#x"):
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
try:
text = unichr(name2codepoint[text[1:-1]])
except KeyError:
pass
return text
return re.sub("\&\#?\w+\;", fixup, text)
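# A minimal usage sketch, not part of the original workflow: the directory name
# and source alias below are hypothetical examples, and the script still expects
# en_titles.txt and stop.txt to be present in scriptdir as noted in the docstring.
if __name__ == "__main__":
    # Process one dated directory of saved articles (alias "G" = The Guardian,
    # dated 2009-01-01) and write candidates.txt into that same directory.
    aggregate("C:\\Feedme\\G_2009-01-01")
    # After the candidate list has been reviewed by hand, the rejected words
    # can be folded back into the stopword list:
    # learn("C:\\Feedme\\G_2009-01-01")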