User:TalBot/replace-link.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Update links in a given list of pages.
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
# Reuses some stuff from rm-soft-redir.py by the same author.
#
# run with standard args "-log -putthrottle:xx"
#
# Further arguments:
#
#	-oldlink:xxx
#		Link to be replaced; must be a valid page title. May be
#		given multiple times.
#
#	-newlink:xxx
#		Link to replace the old link with. Must be given exactly
#		as often as -oldlink, in matching order.
#
#	-pagelist:xxx
#		File containing a list of newline separated page titles
#		in which the replacement should take place. Must be
#		UTF-8 encoded.
#
#	-summary:xxx
#		Edit summary
#
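#	Example invocation (hypothetical titles and file name):
#
#	    python replace-link.py -log -putthrottle:5 \
#	        -oldlink:"The Raven" -newlink:"The Raven (Poe)" \
#	        -pagelist:pages.txt -summary:"The Raven moved"
#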
#	WARNING: This bot script was written for the English Wikisource, which
#		is a UTF-8 wiki. For non UTF-8 wikis you must change the
#		explicit UTF-8 conversion below to suit your needs. Remember
#		that this is GPL software, so there is NO WARRANTY OF ANY
#		KIND, TO THE EXTENT ALLOWED BY APPLICABLE LAW.
#

import re, sys, wikipedia

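# Wait at least five seconds between successive page reads; write delays
# are controlled separately via the standard -putthrottle argument.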
wikipedia.get_throttle.setDelay(5)

# Handle args

args = wikipedia.handleArgs()

oldtitles = []
newtitles = []
pagefilename = None
summary = None

for arg in args:
	if arg.startswith(u'-oldlink:'):
		oldtitles.append(arg[9:])
	elif arg.startswith(u'-newlink:'):
		newtitles.append(arg[9:])
	elif arg.startswith(u'-pagelist:'):
		pagefilename = arg[10:]
	elif arg.startswith(u'-summary:'):
		summary = arg[9:]
	else:
		wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

if len(oldtitles) == 0:
	wikipedia.output(u'(FFF) no old link title given (-oldlink:xxx)')
	sys.exit(1)

if len(newtitles) != len(oldtitles):
	wikipedia.output(u'(FFF) You must specify -newlink:xxx exactly as often as -oldlink:xxx')
	sys.exit(1)

if not pagefilename:
	wikipedia.output(u'(FFF) no file with page list given (-pagelist:xxx)')
	sys.exit(1)

if not summary:
	wikipedia.output(u'(WWW) No edit summary given (-summary:xxx)')
	summary = u'(no summary)'

# basic text templates

summ = u'[bot] Automatic link replacement: %s' % summary
comment_re = re.compile(r'(?ms)<!--.*?-->')
inconly_re = re.compile(r'(?ms)<includeonly>.*?</includeonly>')
nowiki_re = re.compile(r'(?ms)<nowiki>.*?</nowiki>')
link_re = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<sectionlink>#[^\]\|]*)?(?P<pipe>\|[^\]]*)?\]\]')
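# Illustrative example of what link_re captures: for u'[[Foo/Bar#Sec|text]]'
# the groups are title=u'Foo/Bar', sectionlink=u'#Sec' and pipe=u'|text';
# absent parts are None.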

# Function to count instances of a substring in a string, with possible overlap

def count_overlap(string, substring):
        count = 0
        start = string.find(substring) + 1
        while start:
                count += 1
                start = string.find(substring, start) + 1
        return count
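
# For example (illustrative): count_overlap(u'aaaa', u'aa') returns 3,
# whereas str.count, which does not count overlapping matches, returns 2.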

# Function to extract all links to a given non-redirect page

def make_search_replace_list(pagetext, pagetitle, oldtitle, newtitle, dontpipe = False):
        """pagetext: Text to be searched
           pagetitle: Title of page to be searched (must not be a redirect page)
           oldtitle: title to be found, generated by wikipedia.Page.title()
           newtitle: New title to link to
           dontpipe: do not add pipe text if pipe is missing

           pagetitle, oldtitle and newtitle should be mutually different

           Returns list of (search, replace) tuples, where replace is, if
           possible, a relative link, if search is a relative link

           Piping:
           - Existing pipes are not altered
           - When no pipe exists, the old link will be used as pipes by
             default
           - When no pipe exists and dontpipe == True, no pipe will be
             inserted
        """
        text = pagetext
        result = []
        # The following code is similar to wikipedia.Page.linkedPages
        ### Kill all comments, nowiki and includeonly
        text = re.sub(comment_re, r'', text)
        text = re.sub(nowiki_re, r'', text)
        text = re.sub(inconly_re, r'', text)
        ### Extract all links
        for match in link_re.finditer(text):
                # Extract the title and calculate a replacement if it is
                # equivalent to oldtitle
                oldlink = match.group(0)
                title = match.group('title')
                sectionlink = match.group('sectionlink')
                if sectionlink is None:
                        sectionlink = u''
                pipetext = match.group('pipe')
                wtitle = title.strip()
                # Check whether the link begins with a colon
                if len(wtitle) > 0 and wtitle[0] == u':':
                        colon = u':'
                else:
                        colon = u''
                ### Ignore links to another wiki
                if site.isInterwikiLink(wtitle):
                        continue
                ### Handle relative links
                relative = False
                nestlevel = count_overlap(wtitle, u'/../')
                if wtitle.startswith(u'../'):
                        relative = True
                        nestlevel += 1
                if not wtitle.startswith(u'../' * nestlevel) or pagetitle.count(u'/') < nestlevel:
                        # not a valid link
                        continue
                wpagetitle = pagetitle
                ##### Calculate absolute link
                for i in range(nestlevel):
                        wpagetitle = wpagetitle[:wpagetitle.rfind(u'/')]
                        wtitle = wtitle[3:]
                if relative:
                        wtitle = wpagetitle + u'/' + wtitle
                        # If the calculated title ends with /, it is stripped.
                        # Bug in MediaWiki?
                        if wtitle.endswith(u'/'):
                                wtitle = wtitle[:-1]
                if wtitle.startswith(u'/'):
                        wtitle = wpagetitle + wtitle
                        # Also a form of a relative link
                        relative = True
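                # Illustrative example: on page u'A/B/C', a link to
                # u'../../X' has nestlevel 2 and resolves to u'A/X', while
                # a link to u'/D' resolves to u'A/B/C/D'.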
                ### Normalise title
                try:
                        wtitle = wikipedia.Page(site, wtitle).title()
                except wikipedia.Error:
                        # Something wrong with the title
                        wikipedia.output(u'(DDD) Title %s caused exception' % wtitle)
                        continue
                if wtitle != oldtitle:
                        # It's some other link
                        continue
                ### Replace link with new link
                wnewtitle = newtitle
                if relative:
                        # Make it a relative link again, if possible
                        if wnewtitle.startswith(pagetitle + u'/'):
                                ### The new title is a subpage of the current page
                                wnewtitle = wnewtitle[len(pagetitle):]
                        else:
                                ### How many leading path segments do the new
                                ### title and the current page have in common?
                                newparts = wnewtitle.split(u'/')
                                pageparts = pagetitle.split(u'/')
                                commonlevels = 0
                                while (commonlevels < len(newparts)
                                       and commonlevels < len(pageparts)
                                       and newparts[commonlevels] == pageparts[commonlevels]):
                                        commonlevels += 1
                                if commonlevels > 0:
                                        ### Go up to the deepest common level,
                                        ### then down into the new title
                                        wnewtitle = ((u'../' * (len(pageparts) - commonlevels))
                                                     + u'/'.join(newparts[commonlevels:]))
                                ### If no path segment is shared, no relative
                                ### link is possible; keep the absolute title
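                # Illustrative example: on page u'A/B/C', newtitle u'A/B/D'
                # becomes the relative link u'../D', while newtitle
                # u'A/B/C/D' becomes the subpage link u'/D'.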
                if pipetext is None:
                        if not dontpipe:
                                pipetext = u'|' + title
                        else:
                                pipetext = u''
                newlink = u'[[%s%s%s%s]]' % ( colon, wnewtitle, sectionlink, pipetext )
                result.append((oldlink, newlink))
        return list(set(result))

# Start operation

site = wikipedia.getSite()

try:
	oldpages = [ wikipedia.Page(site, oldtitle) for oldtitle in oldtitles ]
	newpages = [ wikipedia.Page(site, newtitle) for newtitle in newtitles ]
except wikipedia.Error:
	wikipedia.output(u'(FFF) invalid titles')
	sys.exit(1)

# Load page list

pagelist = []

try:
	pagefile = open(pagefilename, 'r')
except IOError:
	wikipedia.output(u'(FFF) Unable to load page file "%s"' % pagefilename)
	sys.exit(1)

try:
	for line in pagefile:
		line = line.rstrip()
		try:
			page = wikipedia.Page(site, unicode(line, 'UTF-8'))
			if page.exists():
				pagelist.append(page)
			else:
				wikipedia.output(u'(EEE) Page [[%s]] does not exist' % line)
		except wikipedia.Error:
			wikipedia.output(u'(EEE) Error with page [[%s]], ignoring this page' % line)
except IOError:
	wikipedia.output(u'(EEE) IO-Error reading from file "%s", ignoring remaining lines' % pagefilename)

try:
	pagefile.close()
except IOError:
	wikipedia.output(u'(EEE) IO-error closing file')

# Generate new text

oldtextdict = {}
changedict = {}
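
# oldtextdict maps each page title to the text it had when first fetched;
# changedict maps it to that text with all replacements applied.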

for page in pagelist:
	wikipedia.output(u'(III) Calculating updated links in [[%s]]' % page.title())
	try:
		try:
			text = changedict[page.title()]
		except KeyError:
			text = page.get()
			oldtextdict[page.title()] = text
		srlist = []
		for i in range(len(oldpages)):
			srlist += make_search_replace_list(text, page.title(), oldpages[i].title(), newpages[i].title(), True)
		for sr in srlist:
			text = wikipedia.replaceExcept(text, re.escape(sr[0]), sr[1], ['comment', 'math', 'nowiki', 'pre'])
		changedict[page.title()] = text
	except wikipedia.Error:
		wikipedia.output(u'(EEE) Unable to process %s' % page.title())

# Now update all links

for title in changedict:
	wikipedia.output(u'(III) Updating links in page %s' % title)
	# Get the current version of the page
	page = wikipedia.Page(site, title)
	# Update only if the page wasn't edited in the meantime
	try:
		# Check whether the text has changed.
		# Comparison of permalinks would be more efficient, but
		# unfortunately, pywikipedia's permalink feature is somewhat
		# broken.
		if page.get() == oldtextdict[title]:
			page.put(changedict[title], summ)
		else:
			wikipedia.output(u'(EEE) Not updating [[%s]]: Page was edited since' % title)
	except wikipedia.Error:
		wikipedia.output(u'(EEE) Unable to edit [[%s]]' % title)