User:TalBot/header check.py

From Wikisource
Jump to: navigation, search
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Check for pages without a header
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
# run with standard args "-log -putthrottle:xx"
#
 
import catlib, pagegenerators, wikipedia
 
# Set the read-throttle delay to 5 (presumably seconds between page fetches
# -- TODO confirm the unit against the framework).
wikipedia.get_throttle.setDelay(5)

# Whatever handleArgs() hands back is not used by this script, so each
# leftover argument is only reported as a warning.  The loop variable must be
# `arg`, the name the message formats; binding it as `args` made the first
# leftover argument raise NameError.
for arg in wikipedia.handleArgs():
	wikipedia.output(u'(WWW) ignoring unrecognised argument: %s' % arg)
 
# Basic stuff
 
# Site object used for every namespace, template and category lookup below.
site = wikipedia.getSite()

# Namespaces whose pages are checked (u'' is presumably the main namespace
# -- confirm against site.getNamespaceIndex).
namespaces = [
	u'',
	u'Author',
	u'Help',
	u'Portal',
	u'Wikisource',
]

# Templates whose inclusion on a page counts as "has a header".
template_names = [
	u'Archive header',
	u'Author',
	u'EB1911',
	u'Header',
	u'Header2',
	u'Process header',
]

# Categories whose members are excluded from the report.
category_names = [
	u'Soft redirects',
	u'Protected deleted pages',
]
 
# generate page titles
 
# Three sets of page titles:
#   allpages     - every non-redirect page in the checked namespaces
#   templaterefs - pages that include one of the header templates
#   catpages     - pages in (or below) one of the excluded categories
allpages = set()
templaterefs = set()
catpages = set()

# The fetch loops run inside try/finally so that wikipedia.stopme() is called
# even when a query fails or the run is interrupted; previously an exception
# in any loop skipped it.
try:
	for namespace in namespaces:
		ns_index = site.getNamespaceIndex(namespace)
		ns_pages = pagegenerators.AllpagesPageGenerator(namespace = ns_index, includeredirects = False)
		allpages.update(page.title() for page in ns_pages)

	for template in template_names:
		template_page = wikipedia.Page(site, u'Template:' + template)
		# onlyTemplateInclusion = True: only pages that include the template
		# are wanted, not other kinds of reference to it (as the parameter
		# name suggests).
		including = template_page.getReferences(onlyTemplateInclusion = True)
		templaterefs.update(page.title() for page in including)

	for category in category_names:
		cat = catlib.Category(site, u'Category:' + category)
		# recurse = True also collects articles from subcategories.
		catpages.update(page.title() for page in cat.articles(recurse = True))
finally:
	# No further wiki queries are made by the rest of the script.
	wikipedia.stopme()
 
#
# List pages with no header
#
 
# Start from every listed page, drop those that include a header template,
# then drop those in an excluded category; sort so the report order is stable.
unheaded = allpages.difference(templaterefs).difference(catpages)
noheaders = list(unheaded)
noheaders.sort()

wikipedia.output(u'(III) Pages with no headers:')

# One wiki-link bullet per remaining title.
for title in noheaders:
	bullet = u'* [[%s]]' % title
	wikipedia.output(bullet)