User:Visviva/authors.py

# -*- coding: utf-8 -*-  # required: collate() embeds a literal en dash below
import xml.etree.cElementTree as ET
import urllib2
import urllib
import time

rooturl = "http://en.wikisource.org/w/api.php?"

class AuthorUpdater:
        def __init__(self):
                pass
        
        def update_authors(self,all_authors=False):
                # get wikitext of all author-index pages, following
                # query-continue in case there are more than 50 of them
                querystring = "action=query&generator=allpages&gaplimit=50&gapfilterredir=nonredirects&gapprefix=Authors-&prop=revisions&gapnamespace=4&rvprop=content&format=xml"
                url = rooturl+querystring
                page = ET.tostring(recursive_download(url), "utf-8")
                # create set of linked authors
                lines = [x for x in page.split("\n") if "Author:" in x]
                self.indexed_authors = set()
                for line in lines:
                        # pull the bare name out of an [[Author:Name|Display]] link
                        author = line.split("Author:")[1].split("|")[0].split("]")[0]
                        self.indexed_authors.add(author)

                # get list of all Author: pages
                namespace = "102" # the Author: namespace on English Wikisource
                querystring = "action=query&list=allpages&apnamespace=%s&aplimit=500&apfilterredir=nonredirects&format=xml" % namespace
                url = rooturl+querystring
                node = recursive_download(url).find("query").find("allpages")
                self.all_authors = set(x.get("title").encode("utf-8").split("Author:")[1] for x in node)

                # create set of unlinked authors
                if all_authors:
                        self.unlinked = self.all_authors
                else:
                        self.unlinked = self.all_authors - self.indexed_authors
                
                self.process_lists(self.unlinked)

                # digest wikitext for surname, firstname, birthyear, deathyear
                # dict of fullname:4-tuple pairs
                self.dixion = {}
                for authorname, text in self.texts.items():
                        if "/" in authorname: continue # subpage
                        if "{{author" not in text.lower(): continue # no author template
                        out_list = []
                        for param in ["lastname","firstname","birthyear","deathyear"]:
                                out_list.append(self.get_param_value(param,text))
                        self.dixion[authorname] = tuple(out_list)
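                # at this point self.dixion maps full name to a 4-tuple,
                # e.g. (illustrative): "John Smith" -> ("Smith","John","1900","1980")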

                # generate wikitext list of missing authors, sorted by
                # surname and then forename
                self.sortable = [(data[0].upper()+data[1].upper(),name) for name,data in self.dixion.items()]
                self.sortable.sort()
                self.sorted = [x[1] for x in self.sortable]
                self.out_lines = self.collate(self.sorted,self.dixion)
                self.output = "\n".join(self.out_lines)
                return self.output

        def process_lists(self,unlinked):
                # get wikitext of all unlinked authors
                # have to do this in chunks
                thelist=list(unlinked)
                thelist=["Author:"+x for x in thelist]
                self.texts={}
                while thelist:
                        print len(thelist)
                        chunk = [urllib.quote(x) for x in thelist[:50]]
                        thelist = thelist[50:]
                        batch = "|".join(chunk)
                        try: print batch
                        except: print "unprintable"
                        querystring = "action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % batch
                        url = rooturl+querystring
                        self.url = url
                        page = download_page(url)
                        for result in ET.XML(page).find("query").find("pages"):
                                revisions = result.find("revisions")
                                if revisions is None: continue # page missing or deleted
                                text = revisions.find("rev").text
                                if not text: continue
                                text = text.encode("utf-8")
                                title = result.get("title").encode("utf-8").split("Author:")[1]
                                self.texts[title] = text
                        time.sleep(60) # be gentle with the servers
                return self.texts
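
        # For reference (illustrative titles, not from an original run):
        # each batch above produces a request of the form
        #   http://en.wikisource.org/w/api.php?action=query&prop=revisions&titles=Author:John%20Smith|Author:Jane%20Doe&rvprop=content&format=xml
        # with up to 50 URL-quoted titles separated by "|".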

                
        def collate(self,sorted_names,dixion):
                lines=[]
                firstletter_before=""
                for key in sorted_names:
                        namestring = key
                        datestring = "" # defaults
                        data = dixion[key]
                        if data[0] and data[1]:
                                namestring = "%s, %s" % (data[0],data[1])
                        elif data[0] and not data[1]: # occasionally users specify only the surname explicitly
                                if data[0] in key:
                                        firstname = key.replace(data[0],"").strip()
                                        namestring = "%s, %s" % (data[0],firstname)
                        elif data[1] and not data[0]: # does this ever happen?
                                if data[1] in key:
                                        surname = key.replace(data[1],"").strip()
                                        namestring = "%s, %s" % (surname,data[1])
                        
                        if data[2] or data[3]:
                                datestring = "(%s – %s)" % (data[2],data[3])
                        line = "*[[Author:%s|%s]] %s" % (key,namestring,datestring)
                        # open a new alphabetical section when the first letter changes
                        firstletter_now = namestring[0].upper()
                        if firstletter_before != firstletter_now:
                                print firstletter_before, firstletter_now
                                extraline = "\n==%s==\n" % firstletter_now
                                lines.append(extraline)
                        firstletter_before = firstletter_now
                        lines.append(line)
                return lines
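
        # Illustrative sample of the wikitext collate() emits (names made up):
        #
        #   ==S==
        #
        #   *[[Author:John Smith|Smith, John]] (1900 – 1980)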
                
        def get_param_value(self,param,wikitext):
                # grab whatever sits between the "=" after the parameter name
                # and the next "|" or newline
                if param not in wikitext: return ""
                try:
                        output=wikitext.split(param)[1].split("\n")[0].split("=")[1].split("|")[0].strip()
                        # strip any HTML comment an editor left inside the value
                        if "<!--" in output:
                                if "-->" in output:
                                        output=output.split("<!--")[0]+output.split("-->")[1]
                                else:
                                        output=output.split("<!--")[0]
                                output=output.strip()
                        return output
                except IndexError: # parameter present but not in the expected shape
                        return ""
        
def recursive_download(baseurl):
        # download an API result, following query-continue links and
        # merging each new batch of items into the first result tree
        node = ET.XML(download_page(baseurl))
        continuer = node.find("query-continue")
        while continuer is not None:
                # the continue element carries one attribute, e.g. apfrom="..."
                param, startfrom = list(continuer)[0].items()[0]
                startfrom = urllib.quote(startfrom.encode("utf-8"))
                url = baseurl+"&"+param+"="+startfrom
                print startfrom, str(len(list(node.find("query"))[0]))
                newnode = ET.XML(download_page(url))
                continuer = newnode.find("query-continue")
                for item in list(list(newnode.find("query"))[0]):
                        list(node.find("query"))[0].append(item)
                time.sleep(60) # be gentle with the servers
        return node
                

def download_page(url):
        # fetch a URL, retrying once a minute until something comes back
        page = ""
        while not page:
                try:
                        page = urllib2.urlopen(url).read()
                        if not page: # empty response: give up rather than loop
                                break
                except IOError: # urllib2.URLError is a subclass of IOError
                        time.sleep(60)
                        continue
        return page
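

# Example usage (a sketch: the original page does not show how the class
# was driven, and the output filename here is hypothetical):
if __name__ == "__main__":
        updater = AuthorUpdater()
        wikitext = updater.update_authors()
        outfile = open("missing_authors.txt","w")
        outfile.write(wikitext)
        outfile.close()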