User:TalBot/xo pp check.py

From Wikisource
Jump to: navigation, search
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Check for extra stuff before the header of Executive Orders and Presidential
# Proclamations (report only; no page is modified)
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#

import pagegenerators, re, wikipedia

# Set the delay of the framework's "get" throttle to 5 (presumably seconds
# between read requests -- TODO confirm against the throttle implementation)
wikipedia.get_throttle.setDelay(5)

# Handle args
#
# handleArgs() gets the command line first; whatever it hands back is treated
# by this script as unrecognised and is only reported, never acted upon.
# (Presumably the "-log" and "-putthrottle:xx" options named in the file
# header are consumed inside handleArgs() -- verify.)

args = wikipedia.handleArgs()

for arg in args:
        wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

# Basic text tokens

# NOTE(review): summ reads like an edit summary, but nothing in this file
# uses it -- no page is ever saved here.
summ = u'Removing garbage before header'

# Regexes

# Matches the opening of a header template call: "{{", optional whitespace,
# then "Header" or "header". Used with search(), so it may match anywhere
# in the page text.
header_xp = re.compile(r'\{\{\s*[Hh]eader')

# page generators

# One generator per title prefix to be checked
xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')

# Procedure to check extra stuff before header

def check_stuff_before_header(page):
        """Report whatever text precedes the header template on a page.

        Redirect pages, and pages on which header_xp finds no header
        template, are reported as such and otherwise skipped. For every
        other page the text from the start of the page up to (but not
        including) the first header match is written to the output; this is
        an empty string when the header is the very first thing on the page.

        page -- page object providing title(), isRedirectPage() and get()

        Returns None. The page is never modified; all results go through
        wikipedia.output().
        """
        wikipedia.output(u'(III) Checking [[%s]]' % page.title())
        if page.isRedirectPage():
                wikipedia.output(u'   (XXX) This page is a redirect')
                return
        text = page.get()
        match = header_xp.search(text)
        # None is a singleton, so test identity rather than equality
        if match is None:
                wikipedia.output(u'   (XXX) This page does not have a header')
                return
        # Everything in front of the first header match is the "extra stuff"
        wikipedia.output(u'   (XXX) Text before header:\n   %s' % text[:match.start()])

# check pages: everything from xo_pages first, then everything from pp_pages

for generator in (xo_pages, pp_pages):
        for page in generator:
                check_stuff_before_header(page)
Personal tools
Namespaces
Variants
Actions
Navigation
Toolbox
Print/export