# User:Commander Keane/Audio workflow/xFileGenBot10.py
#
# From Wiktionary, the free dictionary
# Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-
# dumps lists from Categories into C:\Users\jim\pywikitest

from __future__ import absolute_import, unicode_literals

import io
import os
import os.path
import re

import pywikibot
from pywikibot import pagegenerators


class fileGenBot():
    """Build word-list text files from the members of a wiki category.

    The bot collects the page titles of a category, optionally drops titles
    whose page text already mentions an Ogg audio file, writes the remaining
    titles to a text file and finally splits that file into smaller chunks.

    Configuration is published as module-level globals through the ``global``
    statements below, because code outside the class (``main``) reads
    ``wikiCheck`` by its bare name. Two further globals are read but not
    defined here: ``fileTitle`` (set by ``main``) and ``petScanFile`` (must be
    set by whoever calls ``petScan2simpleListFile`` without an explicit path).
    """

    # Value handed to CategorizedPageGenerator's ``recurse`` argument.
    global recurseGlobal
    recurseGlobal = 0  # was 5
    # Number of lines written to each chunk file by splitBigFile.
    global lines_per_file
    lines_per_file = 400  # 300
    # Look up each article to see if audio already exists.
    global wikiCheck
    wikiCheck = False

    def __init__(self):
        """Announce on stdout that the bot object has been created."""
        print()
        print("fileGenBot Initialsed")
        print()

    def visitCat2(self, catNameOnly):
        """Return the titles of the pages in a category on English Wiktionary.

        Args:
            catNameOnly: category name handed to ``pywikibot.Category``.

        Returns:
            A list with one title string per page yielded by
            ``CategorizedPageGenerator`` (recursion set by ``recurseGlobal``).
        """
        site = pywikibot.Site("en", "wiktionary")  # leave blank for en.wp
        print("Cat name only: " + str(catNameOnly))
        cat = pywikibot.Category(site, catNameOnly)
        members = pagegenerators.CategorizedPageGenerator(
            cat, recurse=recurseGlobal)
        # Ask the page for its title instead of slicing a fixed 16-character
        # prefix and 2-character suffix off str(page): that slicing is only
        # correct when the link text is exactly "[[wiktionary:en:...]]".
        return [page.title() for page in members]

    def printListToFile(self, listGenerated):
        """Write the titles, one per line, into a newly created folder.

        The folder and the file are both named after the module-level global
        ``fileTitle``; when a folder of that name already exists a counter
        (2, 3, ...) is appended until an unused name is found.

        Args:
            listGenerated: iterable of title strings.

        Returns:
            The path of the text file that was written.
        """
        filePathBeginning = 'C:\\Users\\jim\\pywikitest\\'
        title = str(fileTitle)

        # Find the first "<title>", "<title>2", "<title>3", ... not in use.
        baseName = title
        dirNum = 1
        while os.path.isdir(filePathBeginning + baseName):
            dirNum += 1
            baseName = title + str(dirNum)

        newDir = filePathBeginning + baseName
        # makedirs also creates the root folder on a first run.
        os.makedirs(newDir)
        newFileName = newDir + '\\' + baseName + '.txt'

        # The file object does the UTF-8 encoding, so the text is written
        # directly rather than being encoded and decoded again per line.
        with io.open(newFileName, "a", encoding='utf8') as outFile:
            for line in listGenerated:
                outFile.write(line + '\n')
        return newFileName

    def checkArticle(self, bigList, torF):
        """Optionally drop titles whose page text mentions an Ogg file.

        Args:
            bigList: list of page titles.
            torF: when false the list is returned untouched and no page is
                fetched; when true every page is looked up.

        Returns:
            ``bigList`` itself when ``torF`` is false, otherwise a new list
            of the titles whose text contains neither ".ogg" nor ".oga".
        """
        if not torF:
            return bigList

        # Raw string: "\." in a normal literal is an invalid escape sequence.
        audioPattern = re.compile(r'\.og[ga]')
        # NOTE(review): this uses the default site from the user's pywikibot
        # configuration, while visitCat2 names en.wiktionary explicitly --
        # confirm that both are meant to be the same wiki.
        site = pywikibot.Site()
        newBigList = []
        for line in bigList:
            page = pywikibot.Page(site, line)
            if audioPattern.search(page.text) is None:
                newBigList.append(line)
        return newBigList

    def petScan2simpleListFile(self, petScanPath=None):
        """Print every line of a PetScan export that starts with "| [[".

        Args:
            petScanPath: file to read. When omitted, the module-level global
                ``petScanFile`` is used, which the caller must have defined.
        """
        if petScanPath is None:
            petScanPath = petScanFile
        with open(petScanPath, "r") as longFile:
            for line in longFile:
                # The marker is four characters long; the former three
                # character slice could never be equal to it.
                if line.startswith("| [["):
                    print(line)

    def splitBigFile(self, newFileName, shortcutsFile=None):
        """Split a text file into chunks of ``lines_per_file`` lines.

        Chunks are written next to the input as ``<name>_sf_<N>.txt`` where
        ``<name>`` is the input path without its last four characters and
        ``<N>`` is the chunk's first line index plus ``lines_per_file``.
        The name of the last chunk (no folder, no extension) is appended to
        the shortcuts file, followed by a space and a newline.

        Args:
            newFileName: path of the UTF-8 file to split.
            shortcutsFile: file that receives the name of the last chunk;
                defaults to the original hard-coded location.
        """
        if shortcutsFile is None:
            shortcutsFile = (
                r'C:\Users\jim\pywikibot\Command shortcuts\Generated lists.txt')

        startOfFileName = newFileName[:-4]  # assumes a 4-character extension
        small_filename = None
        smallfile = None
        try:
            with io.open(newFileName, encoding='utf8') as bigfile:
                for lineno, line in enumerate(bigfile):
                    if lineno % lines_per_file == 0:
                        if smallfile:
                            smallfile.close()
                        small_filename = '{}_sf_{}.txt'.format(
                            startOfFileName, lineno + lines_per_file)
                        smallfile = io.open(
                            small_filename, "w", encoding='utf8')
                    smallfile.write(line)
        finally:
            # Close the last chunk even when reading or writing failed.
            if smallfile:
                smallfile.close()

        if small_filename is None:
            # Empty input: no chunk was produced, so nothing to record.
            return

        # Accept either path separator when stripping the folder part.
        nameStart = max(small_filename.rfind('\\'),
                        small_filename.rfind('/')) + 1
        folderNameStripped = small_filename[nameStart:-4]
        with io.open(shortcutsFile, "a", encoding='utf8') as folderNames:
            folderNames.write(folderNameStripped + ' \n')
           


def main(*args):
    """Generate the split word-list files for one category.

    The first argument left over after pywikibot has consumed its own
    options is taken as the category name. It is also stored in the
    module-level global ``fileTitle``, which ``printListToFile`` uses to name
    the output folder and file.

    Args:
        *args: command-line arguments, passed to ``pywikibot.handle_args``.
    """
    global fileTitle

    local_args = pywikibot.handle_args(args)
    if not local_args:
        # Without this guard a missing category surfaced as a bare IndexError.
        print("Usage: xFileGenBot10.py <category name>")
        return

    argCatName = local_args[0]
    print("arg cat name: " + str(argCatName))
    fileTitle = argCatName

    bot = fileGenBot()
    listFromCat = bot.visitCat2(argCatName)
    # wikiCheck False means the titles are not checked for existing audio.
    list2 = bot.checkArticle(listFromCat, wikiCheck)
    fileName1 = bot.printListToFile(list2)
    bot.splitBigFile(fileName1)
   
   

# Run the bot only when this file is executed as a script, not when imported.
if __name__ == '__main__':

   main()