User:TalBot/rm-soft-redir.py
From Wikisource
#! /usr/bin/env python
# _*_ coding: utf8 _*_
#
# Remove soft redirects for specified months after fixing the pages linking to it
#
# Copyright © 2006—2010, GrafZahl (en.wikisource.org user)
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#
# Uses some ideas from wikipedia.py by Rob W.W. Hooft, Andre Engels, which is
# distributed under the terms of the MIT licence.
#
# run with standard args "-log -putthrottle:xx"
#
# Further arguments:
#
#       -cat:xxx
#               Specifies the category for which soft redirects should be
#               removed, for example: -cat:'Soft redirects/August 2006'
#               (replace the single quotes with whatever is appropriate for
#               your shell)
#
#       -dumplinks
#               Write all pages linking to a soft redirect page for the given
#               month to a file
#
#       -delete
#               Actually try to delete the pages (assumes sysop privileges!).
#               Otherwise the to-be-deleted page will be logged with
#               [to-be-deleted] prefix.
#
#       -xlink:xxx
#               Specifies a set of pages to be excluded from link correction
#               as a regular expression. For example, to exclude all
#               discussion archives, specify -xlink:'.*/Archive.*' (replace
#               the single quotes with whatever is appropriate for your
#               shell).
#
#       -nopipe:xxx
#               Specifies a set of soft redirects as a regular expression.
#               These redirects will not be added to corrected links as
#               pipes.
#               Pipes that already exist will not be altered.
#
#

import codecs
import re
import sys

import catlib
import wikipedia

wikipedia.get_throttle.setDelay(5)

# Handle args

args = wikipedia.handleArgs()

month = False
delete = False
dumplinks = False
xlinks = []
nopipe = []


def _compile_pattern_arg(pattern, target):
    """Compile *pattern* and append it to the list *target*.

    An invalid regular expression is reported and skipped instead of
    aborting the run.
    """
    try:
        target.append(re.compile(pattern))
    except re.error:
        wikipedia.output(u'(WWW) Ignoring invalid regular expression %s' % pattern)


for arg in args:
    if arg[:5] == u'-cat:':
        month = arg[5:]
    elif arg == u'-delete':
        delete = True
    elif arg == u'-dumplinks':
        dumplinks = True
    elif arg[:7] == u'-xlink:':
        _compile_pattern_arg(arg[7:], xlinks)
    elif arg[:8] == u'-nopipe:':
        _compile_pattern_arg(arg[8:], nopipe)
    else:
        wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

if not month:
    wikipedia.output(u'(FFF) No category given (-cat:xxx)')
    sys.exit(1)

# basic text tokens, etc.

cattitle = u'Category:%s' % month
base_redirover = u'#REDIRECT[[%s]]'
summ = u'[bot] shortcutting redirect(s)'
base_delsumm = u'[bot] deleting old soft redirect to [[%s]]'
comment_re = re.compile(r'(?ms)<!--.*?-->')
inconly_re = re.compile(r'(?ms)<includeonly>.*?</includeonly>')
nowiki_re = re.compile(r'(?ms)<nowiki>.*?</nowiki>')
link_re = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<sectionlink>#[^\]\|]*)?(?P<pipe>\|[^\]]*)?\]\]')


def count_overlap(string, substring):
    """Count occurrences of *substring* in *string*, overlaps included.

    The search restarts one character after each hit, so u'/../../'
    contains u'/../' twice.
    """
    count = 0
    # find() returns -1 on failure, so "+ 1" yields 0 and ends the loop.
    start = string.find(substring) + 1
    while start:
        count += 1
        start = string.find(substring, start) + 1
    return count


def get_page(pagecache, site, title):
    """Return the cached Page for *title*, creating and caching it if needed.

    The cache key is the title as reported by the Page object itself, so
    different spellings of the same title share one entry.
    """
    page = wikipedia.Page(site, title)
    return pagecache.setdefault(page.title(), page)


def _fully_matches(pattern, text):
    """Return True if *pattern* matches at the start of *text* and the
    matched portion is the whole of *text*."""
    match = pattern.match(text)
    return match is not None and match.group() == text


def _relative_title(pagetitle, newtitle):
    """Express *newtitle* as a link relative to the page *pagetitle*.

    Titles are compared by '/'-separated path component, not by character,
    so u'Boo/X' and u'Book/Y' share nothing. The result is
    - u'/rest' if newtitle is a subpage of pagetitle,
    - u'../' repeated as needed plus the remainder if they share a parent,
    - newtitle unchanged if no relative form exists (no shared leading
      component, or both titles are identical).
    """
    page_parts = pagetitle.split(u'/')
    new_parts = newtitle.split(u'/')
    common = 0
    for page_part, new_part in zip(page_parts, new_parts):
        if page_part != new_part:
            break
        common += 1
    if common == 0 or page_parts == new_parts:
        # Climbing above the root page is not a valid relative link.
        return newtitle
    remainder = u'/'.join(new_parts[common:])
    if common == len(page_parts):
        return u'/' + remainder
    return (u'../' * (len(page_parts) - common)) + remainder


# Function to extract all links to a given non-redirect page

def make_search_replace_list(pagetext, pagetitle, oldtitle, newtitle, dontpipe = False):
    """Build the link replacements needed to retarget links in one page.

    pagetext: Text to be searched
    pagetitle: Title of page to be searched (must not be a redirect page)
    oldtitle: title to be found, generated by wikipedia.Page.title()
    newtitle: New title to link to
    dontpipe: do not add pipe text if pipe is missing

    pagetitle, oldtitle and newtitle should be mutually different

    Returns list of (search, replace) tuples without duplicates, where
    replace is, if possible, a relative link, if search is a relative link.
    The order of the list is unspecified.

    Piping:
    - Existing pipes are not altered
    - When no pipe exists, the old link will be used as pipes by default
    - When no pipe exists and dontpipe is true, no pipe will be inserted

    Note: this function reads the module-level name ``site``, which the
    script assigns before the first call.
    """
    result = []
    # The following code is similar to wikipedia.Page.linkedPages
    ### Kill all comments, nowiki and includeonly
    text = comment_re.sub(u'', pagetext)
    text = nowiki_re.sub(u'', text)
    text = inconly_re.sub(u'', text)
    ### Extract all links
    for match in link_re.finditer(text):
        # Extract title and calculate replacement if it is equivalent to oldtitle
        oldlink = match.group(0)
        title = match.group('title')
        sectionlink = match.group('sectionlink')
        if sectionlink is None:
            sectionlink = u''
        pipetext = match.group('pipe')
        wtitle = title.strip()
        if not wtitle:
            # Internal anchor
            continue
        # Check if the link begins with a colon
        if wtitle[0] == u':':
            colon = u':'
        else:
            colon = u''
        ### Ignore links to another wiki
        if site.isInterwikiLink(wtitle):
            continue
        ### Handle relative links
        relative = False
        nestlevel = count_overlap(wtitle, u'/../')
        if wtitle.startswith(u'../'):
            relative = True
            nestlevel += 1
        if (not wtitle.startswith(u'../' * nestlevel)) or (pagetitle.count(u'/') < nestlevel):
            # not a valid link
            continue
        wpagetitle = pagetitle
        ##### Calculate absolute link
        for _ in range(nestlevel):
            wpagetitle = wpagetitle[:wpagetitle.rfind(u'/')]
            wtitle = wtitle[3:]
        if relative:
            wtitle = wpagetitle + u'/' + wtitle
            # If the calculated title ends with /, it is stripped.
            # Bug in MediaWiki?
            if wtitle.endswith(u'/'):
                wtitle = wtitle[:-1]
        if wtitle.startswith(u'/'):
            wtitle = wpagetitle + wtitle
            # Also a form of a relative link
            relative = True
        ### Normalise title
        try:
            wtitle = wikipedia.Page(site, wtitle).title()
        except wikipedia.Error:
            # Something wrong with the title
            wikipedia.output(u'(DDD) Title %s caused exception (pagetitle=%s, oldtitle=%s, newtitle=%s, oldlink=%s, extracted title=%s)' % (wtitle, pagetitle, oldtitle, newtitle, oldlink, title))
            continue
        if wtitle != oldtitle:
            # It's some other link
            continue
        ### Replace link with new link
        wnewtitle = newtitle
        if relative:
            # Keep relative links relative where a relative form exists
            wnewtitle = _relative_title(pagetitle, newtitle)
        if colon and wnewtitle.startswith(u':'):
            # The caller may already have prefixed the new title with a
            # colon; do not emit "[[::...]]".
            colon = u''
        if pipetext is None:
            if not dontpipe:
                pipetext = u'|' + title
            else:
                pipetext = u''
        newlink = u'[[%s%s%s%s]]' % (colon, wnewtitle, sectionlink, pipetext)
        result.append((oldlink, newlink))
    return list(set(result))


# Start operation

site = wikipedia.getSite()
cat = catlib.Category(site, cattitle)
articles = list(cat.articles())

# Generate dictionary of texts linking to each soft redirect

pagecache = {}
linksdict = {}
included = set()
excluded = set()

for page in articles:
    refs = page.getReferences()
    linksdict[page.title()] = []
    for ref in refs:
        reftitle = ref.title()
        if reftitle in excluded:
            continue
        if any(_fully_matches(xlink, reftitle) for xlink in xlinks):
            excluded.add(reftitle)
        else:
            linksdict[page.title()].append(reftitle)
            included.add(reftitle)

included = sorted(included)
excluded = sorted(excluded)

wikipedia.output(u'(III) The following pages will be link-corrected:')
for title in included:
    wikipedia.output(u'* [[%s]]' % title)

wikipedia.output(u'(III) The following pages will be EXCLUDED from link correction:')
for title in excluded:
    wikipedia.output(u'* [[%s]]' % title)

# Now check which links are deemed unpipeable

dontpipe = set()

for page in articles:
    if any(_fully_matches(pattern, page.title()) for pattern in nopipe):
        dontpipe.add(page.title())

dontpipe = sorted(dontpipe)

wikipedia.output(u'(III) The following old links will not have pipes added:')
for title in dontpipe:
    wikipedia.output(u'[[%s]]' % title)

# Dump links

if dumplinks:
    while True:
        fname = wikipedia.input(u'File name for list of pages linking to soft redirects?')
        try:
            # Titles may contain non-ASCII characters, so write UTF-8
            # explicitly instead of relying on the default codec.
            f = codecs.open(fname, 'a', 'utf-8')
            # Append mode creates a missing file; a non-zero end position
            # means the file already had content. Seek explicitly because
            # the initial position in append mode is platform dependent.
            f.seek(0, 2)
            if f.tell() != 0:
                wikipedia.output(u'(EEE) File %s already exists. Please choose another file name.' % fname)
                f.close()
            else:
                break
        except IOError:
            wikipedia.output(u'(EEE) IO Error during operation with %s. Please try again or choose another file name.' % fname)
    # Write links in Wiki markup. Exceptions terminate process.
    try:
        for title, links in linksdict.items():
            for link in links:
                f.write(u'* [[%s]] links to [[%s]]\n' % (link, title))
    finally:
        f.close()
    wikipedia.output(u'Links written to file %s' % fname)

wikipedia.input(u'Press RET to commence link correction, or C-C to abort.')

# Correct links without putting the corrected version at first and delete old pages

oldtextdict = {}    # page title -> text as fetched before any replacement
changedict = {}     # page title -> text with all replacements applied
backrefdict = {}    # page title -> titles of soft redirects it links to
deleteset = set()   # titles of soft redirects scheduled for deletion

for title, links in linksdict.items():
    wikipedia.output(u'(III) Calculating updated links to %s' % title)
    softredir = get_page(pagecache, site, title)
    # Check if someone confused soft and hard redirs
    if softredir.isRedirectPage():
        wikipedia.output(u'(EEE) %s is a hard redirect, not a soft one' % title)
        continue
    # Extract new target
    newlist = softredir.linkedPages()
    ### There should be only one normal link
    if len(newlist) != 1:
        wikipedia.output(u'(EEE) No unambiguous target for soft redirect %s' % title)
        continue
    new = newlist[0]
    newtitle = new.title()
    # HACK!
    # Namespaces 6 and 14 get a leading colon in the generated link
    # (presumably the File and Category namespaces -- confirm).
    if new.namespace() in (6, 14):
        newtitle = u':' + newtitle
    # End HACK
    redirover = base_redirover % newtitle
    # Correct links for each page individually
    for pagetitle in links:
        page = get_page(pagecache, site, pagetitle)
        # Back link
        backrefdict.setdefault(pagetitle, []).append(title)
        # Special treatment for redirect pages
        # These can be fixed immediately because they have exactly one link
        # FIXME: Section links aren't handled properly. Attempt to subsume under general case.
        #if page.isRedirectPage():
        #    try:
        #        page.put(redirover, summ)
        #    except wikipedia.Error:
        #        wikipedia.output(u'(EEE) Unable to edit redirect %s' % pagetitle)
        #    continue
        # get text tokens to be replaced with new link
        try:
            # Start from text already modified for an earlier soft
            # redirect, if any, so that replacements accumulate.
            try:
                text = changedict[pagetitle]
            except KeyError:
                text = page.get()
                oldtextdict[pagetitle] = text
            override_pipe = title in dontpipe
            srlist = make_search_replace_list(text, pagetitle, title, newtitle, override_pipe)
            for search, replacement in srlist:
                text = wikipedia.replaceExcept(text, re.escape(search), replacement, ['comment', 'math', 'nowiki', 'pre'])
            changedict[pagetitle] = text
        except wikipedia.Error:
            wikipedia.output(u'(EEE) Unable to process %s' % pagetitle)
    # Add soft redirect page to the set of to-be-deleted pages
    if delete:
        deleteset.add(title)
    else:
        wikipedia.output(u'(III) [to-be-deleted] %s' % title)

# Now update all links

for title, change in changedict.items():
    wikipedia.output('(III) Updating links in page %s' % title)
    # Get current version of page
    page = wikipedia.Page(site, title)
    # Update only if page wasn't edited since
    updated = False
    try:
        # Check if text has changed
        # Comparison of permalinks would be more efficient, but
        # unfortunately, pywikipedia's permalink feature is somewhat
        # broken
        if page.get() == oldtextdict[title]:
            page.put(change, summ)
            updated = True
        else:
            wikipedia.output(u'(EEE) Not updating [[%s]]: Page was edited since' % title)
    except wikipedia.Error:
        wikipedia.output('(EEE) Unable to edit [[%s]]' % title)
    if not updated:
        # Don't delete soft redirects that still have issues: this page
        # was not saved, so it still links to them. backrefdict holds
        # plain title strings, which are the keys used in deleteset.
        deleteset.difference_update(backrefdict[title])

# Lastly, delete the soft redirects

for title in deleteset:
    try:
        page = get_page(pagecache, site, title)
        delsumm = base_delsumm % page.linkedPages()[0].title()
        page.delete(delsumm, False)
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to delete %s' % title)
    except IndexError:
        wikipedia.output(u'(EEE) Not deleting %s: Unable to find redirect target' % title)