# User:Mpaa/Sandbox3 (from Wikisource)
# -*- coding: utf-8 -*-
import query
import wikipedia as pywikibot
import i18n
import time
import datetime
import sys
import re
import query_ext


class PreloadAuthorPages(object):
    """Collect the members of "Category:Authors-X" and of its subcategories.

    The category title is built from a single letter.  Every member of that
    category whose namespace is 14 (the Category namespace in MediaWiki) is
    treated as a subcategory "Category:Authors-X..." and its own members
    (the Author pages) are listed in turn.
    """

    def __init__(self, letter=u'Z', getinfo=True):
        """
        Constructor. Parameters:
            @param letter: The initial of Author pages to get
            @param getinfo: get also Author info or only page list?
        """
        self.letter = letter
        self.title = u'Category:Authors-' + self.letter.upper()
        self.getinfo = getinfo

    def get(self):
        """
        Build and return the author dictionary for the configured letter.

        @return: {category title: {subcategory title: members}} where
            members is {page title: page title} when getinfo is false and
            {sortkey: author info dict} when getinfo is true.  The top-level
            key is always present, even when no subcategory was found, so
            consumers can index it without a KeyError.
        """
        # Members of "Category:Authors-X", i.e. the subcategories.
        catDict = {}
        letterDict = {self.title: catDict}

        # NOTE(review): 500 is presumably the per-request batch size of
        # query_ext._PreloadingCategory - confirm against query_ext.
        for item in query_ext._PreloadingCategory(self.title, 500):
            # pywikibot.output copes with non-ASCII titles on any console,
            # unlike a bare print of a unicode object.
            pywikibot.output(item[u'title'])
            if item[u'ns'] != 14:
                continue

            cat = item[u'title']
            # Members of "Category:Authors-X...", i.e. the Author pages.
            subcatDict = {}
            for subitem in query_ext._PreloadingCategory(cat, 500):
                pywikibot.output(u'%s SubItem: %s' % (cat, subitem[u'title']))
                subcatDict[subitem[u'title']] = subitem[u'title']
            catDict[cat] = subcatDict

        if self.getinfo:
            # Replace each plain title listing with the Author info mapping.
            for cat in list(catDict):
                catDict[cat] = self._loadAuthors(catDict[cat])

        return letterDict

    def _loadAuthors(self, titles):
        """Return {sortkey: author info} for the given Author page titles.

        Pages for which GetAuthorInfo.get() returns None are left out.
        """
        authorsDict = {}
        for author in titles:
            info = GetAuthorInfo(author).get()
            if info is None:
                continue
            info[u'author'] = author

            key = info[u'sortkey']
            if key in authorsDict:
                # Two authors can share a sortkey (same name); qualify the
                # key with the page title instead of overwriting the first.
                key = u'%s (%s)' % (key, author)
            authorsDict[key] = info
        return authorsDict


class GetAuthorInfo(object):
    """Read selected template parameters from a single Author page."""

    # HTML comment; an unterminated one is cut up to the end of the value.
    _COMMENT_RE = re.compile(r'<!--.*?(?:-->|$)')
    # Template call, optionally containing one nested template.
    _TEMPLATE_RE = re.compile(r'{{.*?({{.*)?}}')
    # <ref>...</ref>, <ref name="x">...</ref> and self-closing <ref ... />.
    _REF_RE = re.compile(r'<ref\b[^>]*?(?:/>|>.*?</ref>)')

    def __init__(self, author, paramAuthors=None):
        """
        Constructor. Parameters:
            @param author: Author page title
            @param paramAuthors: Author template fields; None selects the
                default field list below
        """
        self.author = author
        if paramAuthors is None:
            self.paramAuthors = [u'lastname', u'firstname', u'birthyear',
                                 u'deathyear', u'last_initial',
                                 u'description', u'defaultsort']
        else:
            self.paramAuthors = paramAuthors

        # Trace of what was requested and what will actually be read.
        pywikibot.output(u'%s %s %s'
                         % (self.author, paramAuthors, self.paramAuthors))

    def get(self):
        """
        Get Author info and set sortkey.

        @return: dict with one entry per field in paramAuthors plus
            u'sortkey', or None when the title is a subpage (contains '/')
            or load() returned no text (missing page or redirect).
        """
        page = pywikibot.Page(pywikibot.getSite(), self.author)
        if '/' in self.author:
            pywikibot.output(u'Warning - page %s is a subpage; skipping.'
                             % page.title(asLink=True))
            return None

        pywikibot.output(u'Getting page: %s' % page.title(asLink=True))
        pagetext = self.load(page)
        if pagetext is None:
            return None

        # Only a warning: the fields are still looked up (and come back
        # empty when they are not found).
        if '{{author' not in pagetext.lower():
            pywikibot.output(u'Warning - no template: ' + self.author)

        author_dict = {}
        for param in self.paramAuthors:
            author_dict[param] = self.get_param_value(param, pagetext)
        author_dict[u'sortkey'] = self.set_sortkey(author_dict)
        return author_dict

    def load(self, page):
        """
        Return the text of the given page.

        @return: the page text, or None (after logging) when the page does
            not exist or is a redirect.
        """
        try:
            # Load the page
            text = page.get(throttle=False)
        except pywikibot.NoPage:
            pywikibot.output(u'Page %s does not exist; skipping.'
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u'Page %s is a redirect; skipping.'
                             % page.title(asLink=True))
        else:
            return text
        return None

    def get_param_value(self, param, wikitext):
        """
        Get the value of template parameter param from the page text.

        The value is the rest of the line after "| param =".  HTML comments
        (any number of them), templates and <ref> tags are removed from it.

        @param param: template parameter name
        @param wikitext: page text
        @return: the cleaned value, or u'' when the parameter is not set
        """
        # Anchor on "| name =" so the same word appearing earlier in the
        # page (e.g. inside a description) is not mistaken for the parameter.
        match = re.search(r'\|\s*' + re.escape(param) + r'\s*=([^\n]*)',
                          wikitext)
        if match is None:
            return u''

        output = match.group(1).strip()
        # Same order as before: comments first, then templates, then refs.
        for pattern in (self._COMMENT_RE, self._TEMPLATE_RE, self._REF_RE):
            output = pattern.sub(u'', output).strip()
        return output

    def set_sortkey(self, author_dict):
        """
        Return the sort key for an author.

        An explicit u'defaultsort' wins; otherwise "lastname, firstname",
        or just the first name when there is no last name.  Missing keys
        are treated as empty so a custom paramAuthors list cannot raise
        KeyError here.
        """
        defaultsort = author_dict.get(u'defaultsort', u'')
        if defaultsort:
            return defaultsort

        lastname = author_dict.get(u'lastname', u'')
        firstname = author_dict.get(u'firstname', u'')
        if not lastname:
            return firstname
        return lastname + u', ' + firstname



class PutAuthors(object):
    """Render an author dictionary as wiki text and write it to a file."""

    def __init__(self, letterDict, letter, filename='debug_authors_dict.txt', getinfo=True):
        """
        Constructor. Parameters:
            @param letterDict: Author dictionary
            @param letter: which letter to do
            @param filename: path of the output file
            @param getinfo: does letterDict carry Author info (True) or only
                page titles (False)?
        """
        self.letterDict = letterDict
        self.letter = letter
        self.getinfo = getinfo
        self.title = u'Category:Authors-' + letter.upper()
        self.filename = filename
        self.template = u'{{author index page|' + letter + u'}}'

    def formatAuthor(self, author):
        """
        Format one author info dict as a list line for the output file.

        @param author: dict with u'author' (page title) and, optionally,
            u'lastname', u'firstname', u'birthyear', u'deathyear' and
            u'description'; missing fields are treated as empty.
        @return: "*[[page|name]], (birth - death), description" with the
            date and description parts left out when empty.
        """
        authorlink = author[u'author']
        last = author.get(u'lastname', u'')
        first = author.get(u'firstname', u'')
        birth = author.get(u'birthyear', u'')
        death = author.get(u'deathyear', u'')
        description = author.get(u'description', u'')

        if description:
            description = u', ' + description

        if last and first:
            namestring = u'%s, %s' % (last, first)
        else:
            # Occasionally only one of the names is specified.  With neither,
            # fall back to the page title without its namespace prefix rather
            # than failing on an unset name.
            namestring = last or first or authorlink.split(u':', 1)[-1]

        datestring = u''
        if birth or death:
            ndash = u'\u2013'
            datestring = u', (%s %s %s)' % (birth, ndash, death)

        return u'*[[%s|%s]]%s%s' % (authorlink, namestring, datestring,
                                    description)

    def _formatEntry(self, entry):
        """Format one subcategory member: full line or plain link."""
        if isinstance(entry, dict):
            if self.getinfo:
                return self.formatAuthor(entry)
            entry = entry[u'author']
        # Title-only data (collected without Author info): plain link.
        return u'*[[%s]]' % entry

    def generateText(self):
        """
        Generate the page text.

        One "==Xy==" section per subcategory (heading = last two characters
        of the subcategory title), sections and entries in sorted key order.
        An empty or missing letter entry yields just the header template.
        """
        subcats = self.letterDict.get(self.title, {})

        page_output = [self.template + u'\n']
        for cat in sorted(subcats):
            page_output.append(u'==' + cat[-2:] + u'==')
            members = subcats[cat]
            for key in sorted(members):
                page_output.append(self._formatEntry(members[key]))
            page_output.append(u'\n')

        return u'\n'.join(page_output)

    def writeFile(self, page_text):
        """Write page_text to self.filename, encoded as UTF-8."""
        import io

        if isinstance(page_text, bytes):
            # io text files only accept unicode text.
            page_text = page_text.decode('utf-8')
        # The with-block closes the file even when the write fails.
        with io.open(self.filename, 'w', encoding='utf-8') as f:
            f.write(page_text)

    def run(self):
        """Generate the page text and write it to the output file."""
        page_text = self.generateText()
        self.writeFile(page_text)
        



# Run configuration.
letter = u'z'
# NOTE: paramAuthors is not passed on - PreloadAuthorPages takes no such
# argument, so GetAuthorInfo uses its own (identical) default field list.
paramAuthors = [u'lastname', u'firstname', u'birthyear', u'deathyear',
                u'last_initial', u'description', u'defaultsort']
getinfo = True
filename = 'debug_authors_dict.txt'

# Collect the authors for the letter, then write the formatted index to
# file.  getinfo and filename are handed over explicitly so that editing
# the configuration above actually takes effect.
authorDictionary = PreloadAuthorPages(letter, getinfo)
letterDict = authorDictionary.get()
out2file = PutAuthors(letterDict, letter, filename, getinfo)
out2file.run()