User:TalBot/rm-soft-redir.py

From Wikisource
Jump to: navigation, search
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Remove the soft redirects of a specified month after fixing the pages linking to them
#
# Copyright © 2006—2010, GrafZahl (en.wikisource.org user)
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
#
# Uses some ideas from wikipedia.py by Rob W.W. Hooft, Andre Engels, which is
# distributed under the terms of the MIT licence.
#
# run with standard args "-log -putthrottle:xx"
#
# Further arguments:
#
#       -cat:xxx
#               Specifies the category for which soft redirects should be
#               removed, for example: -cat:'Soft redirects/August 2006'
#               (replace the single quotes with whatever is appropriate for
#               your shell)
#
#       -dumplinks
#               Write all pages linking to a soft redirect page for the given
#               month to a file
#
#       -delete
#               Actually try to delete the pages (assumes sysop privileges!).
#               Otherwise the to-be-deleted page will be logged with
#               [to-be-deleted] prefix.
#
#       -xlink:xxx
#               Specifies a set of pages to be excluded from link correction
#               as a regular expression. For example, to exclude all
#               discussion archives, specify -xlink:'.*/Archive.*' (replace
#               the single quotes with whatever is appropriate for your
#               shell).
#
#       -nopipe:xxx
#               Specifies a set of soft redirects as a regular expression.
#               These redirects will not be added to corrected links as
#               pipes. Pipes that already exist will not be altered.
#
#
 
import codecs
import re
import sys

import catlib
import wikipedia
 
wikipedia.get_throttle.setDelay(5)

# Command line handling: wikipedia.handleArgs() returns the arguments that
# are left for this script to interpret.

def _add_pattern(patterns, expression):
        """Compile expression and append it to patterns; warn if it is invalid."""
        try:
                patterns.append(re.compile(expression))
        except re.error:
                wikipedia.output(u'(WWW) Ignoring invalid regular expression %s' % expression)

args = wikipedia.handleArgs()

# Defaults; month stays False until a -cat: argument is seen
month = False
delete = False
dumplinks = False
xlinks = []
nopipe = []

for arg in args:
        if arg.startswith(u'-cat:'):
                month = arg[len(u'-cat:'):]
        elif arg == u'-delete':
                delete = True
        elif arg == u'-dumplinks':
                dumplinks = True
        elif arg.startswith(u'-xlink:'):
                _add_pattern(xlinks, arg[len(u'-xlink:'):])
        elif arg.startswith(u'-nopipe:'):
                _add_pattern(nopipe, arg[len(u'-nopipe:'):])
        else:
                wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

# Nothing can be done without a category (an empty -cat: counts as missing)
if not month:
        wikipedia.output(u'(FFF) No category given (-cat:xxx)')
        sys.exit(1)
 
# basic text tokens, etc.

# Full title of the category given with -cat:
cattitle = u'Category:%s' % month
# Wikitext of a hard redirect; %s is the target title
base_redirover = u'#REDIRECT[[%s]]'
# Edit summary used when a link-corrected page is saved
summ = u'[bot] shortcutting redirect(s)'
# Deletion summary; %s is the title of the soft redirect's target
base_delsumm = u'[bot] deleting old soft redirect to [[%s]]'
# Regions that are stripped from the text before links are extracted.
# (?ms) lets "." span line breaks; ".*?" stops at the first closing marker.
comment_re = re.compile(r'(?ms)<!--.*?-->')
inconly_re = re.compile(r'(?ms)<includeonly>.*?</includeonly>')
nowiki_re = re.compile(r'(?ms)<nowiki>.*?</nowiki>')
# A wiki link [[title#section|pipe]]. The named groups are:
#   title       - everything up to the first "]", "|" or "#" (may be empty)
#   sectionlink - optional, includes the leading "#"
#   pipe        - optional, includes the leading "|"
link_re = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<sectionlink>#[^\]\|]*)?(?P<pipe>\|[^\]]*)?\]\]')
 
# Function to count instances of a substring in a string, with possible overlap
 
def count_overlap(string, substring):
        """Count the occurrences of substring in string, overlaps included.

        Unlike str.count(), the search resumes one character after the start
        of each hit, so count_overlap('aaa', 'aa') is 2.
        """
        hits = 0
        position = string.find(substring)
        while position != -1:
                hits += 1
                # Step only one character forward so overlapping hits are seen
                position = string.find(substring, position + 1)
        return hits
 
def get_page(pagecache, site, title):
        """Return the cached Page object for title, creating it on first use.

        pagecache: dictionary mapping titles to Page objects; it is updated
                   in place when the title is not cached yet
        site: site the page belongs to
        title: title of the wanted page

        The cache key is the title as reported by Page.title(), not the raw
        title argument.
        """
        candidate = wikipedia.Page(site, title)
        key = candidate.title()
        if key not in pagecache:
                pagecache[key] = candidate
        return pagecache[key]
 
# Function to extract all links to a given non-redirect page
 
def _relative_title(pagetitle, newtitle):
        """Return newtitle as a subpage-relative link target seen from pagetitle.

        Titles are compared level by level (split at '/'), never character by
        character, so "Work/Chapter 10" is not mistaken for a subpage of
        "Work/Chapter 1".

        Returns None if no relative form exists: the titles share no leading
        level (a link cannot climb above the root page), or they are equal.
        """
        pagelevels = pagetitle.split(u'/')
        newlevels = newtitle.split(u'/')
        common = 0
        for pagelevel, newlevel in zip(pagelevels, newlevels):
                if pagelevel != newlevel:
                        break
                common += 1
        remainder = u'/'.join(newlevels[common:])
        if common == len(pagelevels):
                # newtitle lies below the page itself: use the "/Sub" form
                if not remainder:
                        return None
                return u'/' + remainder
        if common == 0:
                return None
        # Climb up to the deepest shared level, then descend again. The
        # remainder is empty if newtitle is an ancestor of the page, which
        # yields the plain "../" form.
        return (u'../' * (len(pagelevels) - common)) + remainder

def make_search_replace_list(pagetext, pagetitle, oldtitle, newtitle, dontpipe = False):
        """Build the link replacements that retarget oldtitle to newtitle.

           pagetext: Text to be searched
           pagetitle: Title of page to be searched (must not be a redirect page)
           oldtitle: title to be found, generated by wikipedia.Page.title()
           newtitle: New title to link to
           dontpipe: do not add pipe text if pipe is missing

           pagetitle, oldtitle and newtitle should be mutually different

           Returns list of (search, replace) tuples without duplicates, in
           no particular order. search is the complete old link as found in
           the text. replace is, if possible, a relative link, if search is
           a relative link; otherwise it is an absolute link.

           Links inside comments, <nowiki> and <includeonly> are ignored, as
           are interwiki links, internal anchors and malformed relative
           links.

           Piping:
           - Existing pipes are not altered
           - When no pipe exists, the old link will be used as pipes by
             default
           - When no pipe exists and dontpipe == True, no pipe will be
             inserted

           Uses the module-level names site, comment_re, nowiki_re,
           inconly_re and link_re.
        """
        text = pagetext
        result = []
        # The following code is similar to wikipedia.Page.linkedPages
        ### Kill all comments, nowiki and includeonly
        text = comment_re.sub(u'', text)
        text = nowiki_re.sub(u'', text)
        text = inconly_re.sub(u'', text)
        ### Extract all links
        for match in link_re.finditer(text):
                # Extract title and calculate replacement if it is equivalent to newtitle
                oldlink = match.group(0)
                title = match.group('title')
                sectionlink = match.group('sectionlink')
                if sectionlink is None:
                        sectionlink = u''
                pipetext = match.group('pipe')
                wtitle = title.strip()
                if len(wtitle) == 0: # Internal anchor
                        continue
                # Check if the link begins with a colon
                if wtitle[0] == u':':
                        colon = u':'
                else:
                        colon = u''
                ### Ignore links to another wiki
                if site.isInterwikiLink(wtitle):
                        continue
                ### Handle relative links
                relative = False
                nestlevel = count_overlap(wtitle, u'/../')
                if wtitle.startswith(u'../'):
                        relative = True
                        nestlevel += 1
                # Every "../" must sit at the very start of the link, and the
                # page must be nested deeply enough to climb that far
                if (not wtitle.startswith(u'../' * nestlevel)) or (pagetitle.count(u'/') < nestlevel):
                        # not a valid link
                        continue
                wpagetitle = pagetitle
                ##### Calculate absolute link
                for _ in range(nestlevel):
                        wpagetitle = wpagetitle[:wpagetitle.rfind(u'/')]
                        wtitle = wtitle[3:]
                if relative:
                        wtitle = wpagetitle + u'/' + wtitle
                        # If the calculated title ends with /, it is stripped.
                        # Bug in MediaWiki?
                        if wtitle.endswith(u'/'):
                                wtitle = wtitle[:-1]
                if wtitle.startswith(u'/'):
                        wtitle = wpagetitle + wtitle
                        # Also a form of a relative link
                        relative = True
                ### Normalise title
                try:
                        wtitle = wikipedia.Page(site, wtitle).title()
                except wikipedia.Error:
                        # Something wrong with the title
                        wikipedia.output(u'(DDD) Title %s caused exception (pagetitle=%s, oldtitle=%s, newtitle=%s, oldlink=%s, extracted title=%s)' % (wtitle, pagetitle, oldtitle, newtitle, oldlink, title))
                        continue
                if wtitle != oldtitle:
                        # It's some other link
                        continue
                ### Replace link with new link
                wnewtitle = newtitle
                if relative:
                        # Keep the link relative where that is possible,
                        # otherwise fall back to the absolute title
                        relativetitle = _relative_title(pagetitle, newtitle)
                        if relativetitle is not None:
                                wnewtitle = relativetitle
                if wnewtitle.startswith(u':'):
                        # The new title already carries its own leading colon;
                        # adding the old link's colon would give "[[::...]]"
                        colon = u''
                if pipetext is None:
                        if dontpipe:
                                pipetext = u''
                        else:
                                pipetext = u'|' + title
                newlink = u'[[%s%s%s%s]]' % ( colon, wnewtitle, sectionlink, pipetext )
                result.append((oldlink, newlink))
        return list(set(result))
 
# Start operation
 
site = wikipedia.getSite()
cat = catlib.Category(site, cattitle)
articles = list(cat.articles())

# Map the title of every soft redirect to the titles of the pages that
# reference it. Referring pages whose whole title matches one of the -xlink
# patterns are left out and remembered in excluded.

pagecache = {}
linksdict = {}
included = set()
excluded = set()

for page in articles:
        redirtitle = page.title()
        linkers = []
        linksdict[redirtitle] = linkers
        for ref in page.getReferences():
                reftitle = ref.title()
                if reftitle in excluded:
                        continue
                for xlink in xlinks:
                        hit = xlink.match(reftitle)
                        # match() only anchors at the start, so make sure
                        # the pattern covers the entire title
                        if hit is not None and hit.group() == reftitle:
                                excluded.add(reftitle)
                                break
                else:
                        # No exclusion pattern applied
                        linkers.append(reftitle)
                        included.add(reftitle)

included = sorted(included)
excluded = sorted(excluded)

# Report both groups

wikipedia.output(u'(III) The following pages will be link-corrected:')
for title in included:
        wikipedia.output(u'* [[%s]]' % title)

wikipedia.output(u'(III) The following pages will be EXCLUDED from link correction:')
for title in excluded:
        wikipedia.output(u'* [[%s]]' % title)
 
# Now check which links are deemed unpipeable
 
# Soft redirects whose whole title matches a -nopipe pattern

dontpipe = set()

for page in articles:
        candidate = page.title()
        for pattern in nopipe:
                hit = pattern.match(candidate)
                # Accept only matches that span the entire title
                if hit is not None and hit.group() == candidate:
                        dontpipe.add(candidate)
                        break

dontpipe = sorted(dontpipe)

wikipedia.output(u'(III) The following old links will not have pipes added:')
for title in dontpipe:
        wikipedia.output(u'[[%s]]' % title)
 
# Dump links
 
if dumplinks:
        # Ask for a file name until a new (or empty) file could be opened
        while True:
                fname = wikipedia.input(u'File name for list of pages linking to soft redirects?')
                try:
                        # Titles may contain non-ASCII characters, so the
                        # file has to encode what is written to it
                        f = codecs.open(fname, 'a', 'utf-8')
                        # The position reported right after opening in
                        # append mode differs between platforms; move to
                        # the end (whence 2) to learn the real file size
                        f.seek(0, 2)
                        if f.tell() != 0:
                                wikipedia.output(u'(EEE) File %s already exists. Please choose another file name.' % fname)
                                f.close()
                        else:
                                break
                except IOError:
                        wikipedia.output(u'(EEE) IO Error during operation with %s. Please try again or choose another file name.' % fname)
        # Write links in Wiki markup. Exceptions terminate process, but the
        # file is closed in any case.
        try:
                for title, links in linksdict.iteritems():
                        for link in links:
                                f.write(u'* [[%s]] links to [[%s]]\n' % ( link, title ))
        finally:
                f.close()
        wikipedia.output(u'Links written to file %s' % fname)
 
wikipedia.input(u'Press RET to commence link correction, or C-C to abort.')

# Calculate the corrected text of every linking page. Nothing is saved yet:
# a page may link to several soft redirects, so its text is accumulated in
# changedict and written later.
#
#   oldtextdict: page title -> text as first fetched
#   changedict:  page title -> text with all corrections so far
#   backrefdict: page title -> titles of the soft redirects it links to
#   deleteset:   titles of the soft redirects to be deleted

oldtextdict = {}
changedict = {}
backrefdict = {}
deleteset = set()

for title, links in linksdict.iteritems():
        wikipedia.output(u'(III) Calculating updated links to %s' % title)
        softredir = get_page(pagecache, site, title)
        # Check if someone confused soft and hard redirs
        if softredir.isRedirectPage():
                wikipedia.output(u'(EEE) %s is a hard redirect, not a soft one' % title)
                continue
        # The new target is the one and only normal link on the soft redirect
        newlist = softredir.linkedPages()
        if len(newlist) != 1:
                wikipedia.output(u'(EEE) No unambiguous target for soft redirect %s' % title)
                continue
        new = newlist[0]
        newtitle = new.title()
        # HACK! Targets in namespaces 6 and 14 get a leading colon
        # (presumably File and Category, where a plain link would embed or
        # categorise instead of linking - confirm against the wiki's
        # namespace numbering)
        if new.namespace() in (6, 14):
                newtitle = u':' + newtitle
        # Only needed by the disabled special case for redirect pages below
        redirover = base_redirover % newtitle
        override_pipe = title in dontpipe
        # Correct links for each page individually
        for pagetitle in links:
                page = get_page(pagecache, site, pagetitle)
                # Back link
                backrefdict.setdefault(pagetitle, []).append(title)
                # FIXME: Linking pages that are redirects themselves are not
                # treated specially. Overwriting them with redirover would
                # not handle section links properly. Attempt to subsume
                # under general case.
                try:
                        # Continue from earlier corrections of the same page
                        if pagetitle in changedict:
                                text = changedict[pagetitle]
                        else:
                                text = page.get()
                                oldtextdict[pagetitle] = text
                        replacements = make_search_replace_list(text, pagetitle, title, newtitle, override_pipe)
                        for search, replacement in replacements:
                                text = wikipedia.replaceExcept(text, re.escape(search), replacement, ['comment', 'math', 'nowiki', 'pre'])
                        changedict[pagetitle] = text
                except wikipedia.Error:
                        wikipedia.output(u'(EEE) Unable to process %s' % pagetitle)
        # Add soft redirect page to the set of to-be-deleted pages
        if delete:
                deleteset.add(title)
        else:
                wikipedia.output(u'(III) [to-be-deleted] %s' % title)
 
# Now update all links
 
for title, change in changedict.iteritems():
        wikipedia.output(u'(III) Updating links in page %s' % title)
        # Get current version of page
        page = wikipedia.Page(site, title)
        updated = False
        # Update only if page wasn't edited since
        try:
                # Check if text has changed
                # Comparison of permalinks would be more efficient, but
                # unfortunately, pywikipedia's permalink feature is somewhat
                # broken
                if page.get() == oldtextdict[title]:
                        page.put(change, summ)
                        updated = True
                else:
                        wikipedia.output(u'(EEE) Not updating [[%s]]: Page was edited since' % title)
        except wikipedia.Error:
                wikipedia.output(u'(EEE) Unable to edit [[%s]]' % title)
        if not updated:
                # Don't delete soft redirects that still have issues: this
                # page still contains its uncorrected links to them.
                # backrefdict holds plain title strings, which are exactly
                # the members of deleteset.
                for backlink in backrefdict[title]:
                        deleteset.discard(backlink)
 
# Lastly, delete the soft redirects
 
# Every title still in deleteset is deleted; the deletion summary names the
# first page the soft redirect links to.
for title in deleteset:
        try:
                page = get_page(pagecache, site, title)
                targets = page.linkedPages()
                # targets[0] raises IndexError if the page has no links
                page.delete(base_delsumm % targets[0].title(), False)
        except IndexError:
                wikipedia.output(u'(EEE) Not deleting %s: Unable to find redirect target' % title)
        except wikipedia.Error:
                wikipedia.output(u'(EEE) Unable to delete %s' % title)