User:TalBot/header check.py
From Wikisource
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Check for pages without a header
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
# run with standard args "-log -putthrottle:xx"
#
"""List pages that carry no header template.

Collects the titles of all non-redirect pages in the configured
namespaces, subtracts every page that references one of the known header
templates or belongs to one of the exempt categories, and prints the
remaining titles as a wiki list.
"""

import catlib
import pagegenerators
import wikipedia

wikipedia.get_throttle.setDelay(5)

# Anything handleArgs() hands back was not consumed by the framework;
# this script takes no arguments of its own, so just warn about each one.
for arg in wikipedia.handleArgs():
    wikipedia.output(u'(WWW) ignoring unrecognised argument: %s' % arg)

# Basic stuff

site = wikipedia.getSite()

# Namespaces whose pages are checked (u'' is the main namespace).
namespaces = [u'', u'Author', u'Help', u'Portal', u'Wikisource']

# Templates whose use counts as "having a header".
template_names = [
    u'Archive header',
    u'Author',
    u'EB1911',
    u'Header',
    u'Header2',
    u'Process header',
]

# Categories whose members are exempt from the check.
category_names = [u'Soft redirects', u'Protected deleted pages']

# generate page titles

allpages = set()
templaterefs = set()
catpages = set()

try:
    for namespace in namespaces:
        generator = pagegenerators.AllpagesPageGenerator(
            namespace=site.getNamespaceIndex(namespace),
            includeredirects=False)
        allpages |= set(page.title() for page in generator)

    for template in template_names:
        template_page = wikipedia.Page(site, u'Template:' + template)
        references = template_page.getReferences(onlyTemplateInclusion=True)
        templaterefs |= set(page.title() for page in references)

    for category in category_names:
        cat = catlib.Category(site, u'Category:' + category)
        catpages |= set(page.title() for page in cat.articles(recurse=True))
finally:
    # All site queries are done (or have failed): always sign off from the
    # framework, even when one of the queries above raised.
    wikipedia.stopme()

#
# List pages with no header
#

noheaders = sorted(allpages - templaterefs - catpages)

wikipedia.output(u'(III) Pages with no headers:')
for title in noheaders:
    wikipedia.output(u'* [[%s]]' % title)