User:TalBot/xo pp fix.py
From Wikisource
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Fix extra stuff before Executive Orders and Presidential Proclamations
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
import pagegenerators, re, wikipedia
# Set the delay of the read throttle to 5 (presumably seconds -- confirm
# against the pywikipedia throttle implementation).
wikipedia.get_throttle.setDelay(5)

# Handle args
# Whatever handleArgs() hands back is treated as unrecognised by this
# script: each leftover argument is only reported, never acted upon.
args = wikipedia.handleArgs()
for arg in args:
    wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)
# Basic text tokens
# Edit summary handed to page.put() for every page this script rewrites.
summ = u'Removing garbage before {{header}}'
# Regexes
# Matches the opening of a header template call: "{{", optional whitespace,
# then "Header" or "header". Nothing anchors the end of the name, so longer
# template names that merely begin with "header" match as well.
header_xp = re.compile(r'\{\{\s*[Hh]eader')
# page generators
# One generator per title prefix (presumably yields every page whose title
# starts with the given string -- confirm against
# pagegenerators.PrefixingPageGenerator).
xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')
# Procedure to fix extra stuff before header
def fix_stuff_before_header(page):
    """Remove everything that precedes the first header template on a page.

    The page text is searched with the module-level ``header_xp`` regex.
    If a match is found at an offset greater than zero, the text is
    truncated so that it starts at the match, and the result is saved
    with the module-level edit summary ``summ`` as a non-minor edit.

    Nothing is saved when:
      * the page is a redirect,
      * the text contains no header match,
      * the header match already sits at the very start of the text.

    page -- page object; must provide title(), isRedirectPage(), get()
            and put(text, summary, minorEdit=...).

    Returns None. Any exception raised by page.get() or page.put() is
    not caught here and propagates to the caller.
    """
    wikipedia.output(u'(III) Checking [[%s]]' % page.title())
    if page.isRedirectPage():
        wikipedia.output(u' (III) Skipping page, redirect')
        return
    text = page.get()
    match = header_xp.search(text)
    # Identity test: None is a singleton, so "is" is the correct comparison
    if match is None:
        wikipedia.output(u' (III) Skipping page, no header')
        return
    # Keep the text from the start of the header onwards
    newtext = text[match.start():]
    # Only save when something was actually cut off, to avoid null edits
    if newtext != text:
        wikipedia.output(u' (III) Removing garbage before header')
        page.put(newtext, summ, minorEdit=False)
# check pages
# Executive Order pages are processed first, then Proclamation pages;
# both sets receive exactly the same treatment.
for generator in (xo_pages, pp_pages):
    for page in generator:
        fix_stuff_before_header(page)