User:Visviva/authors.py
Jump to navigation
Jump to search
"""Build a wikitext index of Wikisource ``Author:`` pages.

The script asks the MediaWiki API at ``rooturl`` for the wikitext of the
``Authors-*`` index pages and for the list of all pages in namespace 102,
works out which authors are not linked from any index page, downloads their
wikitext, extracts name/date parameters and renders a sorted wikitext list.

Strings are handled as the interpreter's native ``str`` throughout: UTF-8
encoded bytes on Python 2 (as the original script did) and text on Python 3.
"""

import re
import time
import urllib

try:
    import xml.etree.cElementTree as ET  # C implementation where available
except ImportError:
    import xml.etree.ElementTree as ET

try:
    import urllib2  # Python 2
    from urllib import quote as _quote
except ImportError:
    # Python 3: urllib.request offers the urlopen() this module needs.
    import urllib.request as urllib2
    from urllib.parse import quote as _quote

rooturl = "http://en.wikisource.org/w/api.php?"

# Seconds to wait between API requests and before retrying a failed one.
REQUEST_DELAY = 60
# Number of page titles sent in a single "titles=" query.
TITLES_PER_REQUEST = 50


def _native(value):
    """Return *value* as a native ``str``.

    Python 2 ``unicode`` is encoded to UTF-8 bytes, Python 3 ``bytes`` is
    decoded from UTF-8, and a native ``str`` is returned unchanged.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value.encode("utf-8")


# Kept out of a literal so the source stays pure ASCII; a raw en dash would
# need a coding declaration on Python 2.
_EN_DASH = _native(u"\u2013")


def _first_letter(text):
    """Return the upper-cased first character of *text* ("" if empty)."""
    if isinstance(text, bytes):
        # Python 2 native str holds UTF-8 bytes: slice a whole character,
        # not the first byte of a multi-byte sequence.
        return _native(text.decode("utf-8", "replace")[:1].upper())
    return text[:1].upper()


class AuthorUpdater:
    """Collects author pages and renders the list of (unindexed) authors."""

    def __init__(self):
        pass

    def update_authors(self, all_authors=False):
        """Download author data and return the generated wikitext list.

        If ``all_authors`` is ``False`` only authors that are not linked
        from an index page are listed; any other value lists every author.

        Side effects: sets ``indexed_authors``, ``all_authors``,
        ``unlinked``, ``texts``, ``dixion``, ``textion``, ``sortable``,
        ``sorted``, ``out_lines`` and ``output`` on the instance.
        """
        self.indexed_authors = self._fetch_indexed_authors()
        self.all_authors = self._fetch_all_authors()

        # create set of unlinked authors
        if all_authors is False:
            self.unlinked = self.all_authors - self.indexed_authors
        else:
            self.unlinked = self.all_authors
        self.process_lists(self.unlinked)

        self.dixion = self._digest_texts()

        # generate wikitext list of missing authors, ordered by
        # upper-cased surname followed by upper-cased first name
        self.sortable = sorted(
            (self.dixion[name][0].upper() + self.dixion[name][1].upper(), name)
            for name in self.dixion
        )
        self.sorted = [pair[1] for pair in self.sortable]
        self.out_lines = self.collate(self.sorted, self.dixion)
        self.output = "\n".join(self.out_lines)
        return self.output

    def _fetch_indexed_authors(self):
        """Return the set of author names linked from the index pages."""
        querystring = "action=query&generator=allpages&gaplimit=50&gapfilterredir=nonredirects&gapprefix=Authors-&prop=revisions&gapnamespace=4&rvprop=content&format=xml"
        page = _native(download_page(rooturl + querystring))
        indexed = set()
        for line in page.split("\n"):
            if "Author:" not in line:
                continue
            # take the link target: text after "Author:" up to "|" or "]"
            author = line.split("Author:")[1].split("|")[0].split("]")[0]
            indexed.add(author)
        return indexed

    def _fetch_all_authors(self):
        """Return the set of titles (without prefix) of all Author: pages."""
        namespace = "102"
        querystring = "action=query&list=allpages&apnamespace=%s&aplimit=500&apfilterredir=nonredirects&format=xml" % namespace
        node = recursive_download(rooturl + querystring).find("query").find("allpages")
        return set(
            _native(entry.attrib["title"]).split("Author:")[1] for entry in node
        )

    def _digest_texts(self):
        """Map each usable author in ``self.texts`` to a 4-tuple.

        The tuple is (lastname, firstname, birthyear, deathyear).  Subpages
        and pages whose text does not contain "{{author" are skipped.
        """
        digest = {}
        for authorname in self.texts.keys():
            if "/" in authorname:
                continue  # subpage
            self.textion = self.texts[authorname]
            if "{{author" not in self.textion.lower():
                continue
            digest[authorname] = tuple(
                self.get_param_value(param, self.textion)
                for param in ["lastname", "firstname", "birthyear", "deathyear"]
            )
        return digest

    def process_lists(self, unlinked):
        """Download the wikitext of every author in *unlinked*.

        Titles are requested in chunks of ``TITLES_PER_REQUEST``.  The
        result, a dict mapping author name to wikitext, is stored in
        ``self.texts`` and returned.  Pages that come back without a
        revision or with empty text are skipped.
        """
        pending = ["Author:" + name for name in unlinked]
        self.texts = {}
        while pending:
            print(len(pending))
            chunk = [_quote(title) for title in pending[:TITLES_PER_REQUEST]]
            pending = pending[TITLES_PER_REQUEST:]
            batch = "|".join(chunk)
            try:
                print(batch)
            except Exception:
                print("unprintable")
            querystring = "action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % batch
            url = rooturl + querystring
            self.url = url
            page = download_page(url)
            for result in ET.XML(page).find("query").find("pages"):
                revisions = result.find("revisions")
                rev = revisions.find("rev") if revisions is not None else None
                if rev is None or not rev.text:
                    # nothing to digest for this entry (the original code
                    # crashed when the <revisions> element was absent)
                    continue
                title = _native(result.attrib["title"]).split("Author:")[1]
                self.texts[title] = _native(rev.text)
            # NOTE(review): indentation was lost in the source this was
            # recovered from; the pause is assumed to be once per request.
            time.sleep(REQUEST_DELAY)
        return self.texts

    def collate(self, sorted, dixion):
        """Render the author keys in *sorted* as wikitext list lines.

        *dixion* maps each key to (lastname, firstname, birthyear,
        deathyear).  A "==X==" heading line is inserted whenever the first
        letter of the displayed name changes.  Returns the list of lines.
        """
        lines = []
        firstletter_before = ""
        for key in sorted:
            namestring = key
            datestring = ""  # defaults
            data = dixion[key]
            surname, firstname = data[0], data[1]
            if surname and firstname:
                namestring = "%s, %s" % (surname, firstname)
            elif surname:
                # occasionally users choose to only explicitly specify surname
                if surname in key:
                    namestring = "%s, %s" % (surname, key.replace(surname, "").strip())
            elif firstname:
                # only the first name was given: derive the surname from the key
                if firstname in key:
                    namestring = "%s, %s" % (key.replace(firstname, "").strip(), firstname)
            if data[2] or data[3]:
                datestring = "(%s %s %s)" % (data[2], _EN_DASH, data[3])
            line = "*[[Author:%s|%s]] %s" % (key, namestring, datestring)
            firstletter_now = _first_letter(namestring)
            if firstletter_before != firstletter_now:
                print("%s %s" % (firstletter_before, firstletter_now))
                lines.append("\n==%s==\n" % firstletter_now)
                firstletter_before = firstletter_now
            lines.append(line)
        return lines

    def get_param_value(self, param, wikitext):
        """Return the value given for template parameter *param*.

        The value is the text between the first "=" after the first
        occurrence of *param* and the next "|" or end of line, stripped,
        with an HTML comment removed.  Returns "" when *param* does not
        occur or no "=" follows it on the same line.
        """
        if param not in wikitext:
            return ""
        try:
            output = (
                wikitext.split(param)[1]
                .split("\n")[0]
                .split("=")[1]
                .split("|")[0]
                .strip()
            )
        except IndexError:
            return ""
        if "<!--" in output:
            if "-->" in output:
                output = output.split("<!--")[0] + output.split("-->")[1]
            else:
                output = output.split("<!--")[0]
            output = output.strip()
        return output


def recursive_download(baseurl):
    """Download *baseurl* and all continuation pages; return the merged tree.

    The first response is parsed and, while a response carries a
    "query-continue" element with a child, the next page is fetched and the
    children of its first result element are appended to the first result
    element of the initial tree, which is returned.

    NOTE(review): this relies on the legacy "query-continue" response
    element; confirm the API still emits it without extra parameters.
    """
    node = ET.XML(download_page(baseurl))
    continuer = node.find("query-continue")
    # explicit test instead of bool(element), which ElementTree deprecates
    while continuer is not None and len(continuer):
        param, startfrom = list(continuer[0].items())[0]
        startfrom = _quote(_native(startfrom))
        url = baseurl + "&" + param + "=" + startfrom
        collected = node.find("query")[0]
        print("%s %s" % (startfrom, len(collected)))
        newnode = ET.XML(download_page(url))
        continuer = newnode.find("query-continue")
        for item in list(newnode.find("query")[0]):
            collected.append(item)
        # NOTE(review): indentation was lost in the source this was
        # recovered from; the pause is assumed to be once per request.
        time.sleep(REQUEST_DELAY)
    return node


def download_page(url):
    """Return the raw body of *url*, retrying until the request succeeds.

    A failed request is retried after ``REQUEST_DELAY`` seconds, forever.
    An empty body is returned as is.  Only ``Exception`` subclasses are
    retried so that Ctrl-C and interpreter exit still stop the script.
    """
    while True:
        try:
            response = urllib2.urlopen(url)
            try:
                return response.read()
            finally:
                response.close()
        except Exception:
            time.sleep(REQUEST_DELAY)