User:Inductiveload/Scripts/WP backlink finder

From Wikisource
Jump to: navigation, search

This program finds author pages which link to Wikipedia using the "wikipedia" field in the {{author}} template, but which do not have a return link from Wikipedia to the author page.

It takes no parameters; the input and output files are specified in the script:



#!/usr/bin/env python
# -*- coding: utf-8 -*-
#       This script finds authors which have a link to Wikipedia, Wikiquote
#       or Commons but no return link from that site.
#       It uses User:Phe's script to find a list of pages in the
#       relevant categories
#       A list of authors and corresponding WP articles without backlinks
#       is returned as the output file.

import pw_script_header #adds the pywikipedia directory to the python PATH
import wikipedia
import query_ext

import codecs
import re

class WS_External_Backlink_Checker():
    """Report sister-project articles that lack a return link to Wikisource.

    For every Wikisource author page in the relevant "Author pages linking
    to ..." category, the article named in the author header is fetched from
    the sister project and searched for a ``{{wikisource author}}``-style
    template.  Articles without one, and articles (or sections) that do not
    exist, are written to the output file as wiki list items.
    """

    # Per-wiki settings, keyed by the name accepted by
    # check_for_missing_backlinks():
    #   wikiname     - display name used in log messages
    #   wikiprefix   - interwiki prefix used in the output links
    #   category     - Wikisource category listing the author pages to check
    #   site         - (code, family) arguments passed to wikipedia.getSite()
    #   header_regex - captures the article name from the author header field
    #   search_regex - matches the return-link template on the external page
    _WIKI_SETTINGS = {
        'wikipedia': {
            'wikiname': 'Wikipedia',
            'wikiprefix': 'w',
            'category': 'Category:Author pages linking to Wikipedia',
            'site': ("en", "wikipedia"),
            'header_regex': r"\| *[Ww]ikipedia *= *(.*)",
            'search_regex': r"{{ *[Ww]ikisource[ -]?author",
        },
        'wikiquote': {
            'wikiname': 'Wikiquote',
            'wikiprefix': 'q',
            'category': 'Category:Author pages linking to Wikiquote',
            'site': ("en", "wikiquote"),
            'header_regex': r"\| *[Ww]ikiquote *= *(.*)",
            'search_regex': r"{{ *[Ww]ikisource[ -]?author",
        },
    }

    def __init__(self, out_filelistname='/home/john/src/pw/zz_filelist1.txt'):
        """Open the output file and set up the site objects.

        out_filelistname -- path of the UTF-8 report file; it is truncated
                            on open.  Change the default as needed.
        """
        self.out_filelist = codecs.open(out_filelistname, 'w', 'utf-8')

        # set up the sites we will be looking at
        self.ws_site = wikipedia.getSite("en", "wikisource")
        self.cm_site = wikipedia.getSite("commons", "commons")

    @staticmethod
    def _find_linked_title(header_regex, wikitext):
        """Return the article title captured by header_regex, or None.

        The capture runs to the end of the line, so surrounding whitespace
        is stripped; an empty field counts as "not found".
        """
        m = re.search(header_regex, wikitext)
        if not m:
            return None
        return m.group(1).strip() or None

    @staticmethod
    def _has_backlink(search_regex, wikitext):
        """Return True if wikitext contains a match for search_regex."""
        return re.search(search_regex, wikitext) is not None

    def _record(self, ext_page_title, wikiprefix, note=''):
        """Write one list item for ext_page_title to the output file.

        note -- optional trailing remark, including its leading space.
        """
        # NOTE(review): both links use the external article title; the first
        # one was possibly meant to be the Wikisource author page -- confirm
        # before changing the report format.
        self.out_filelist.write('* [[%s]] ------> [[%s:%s]]%s\n'
                                % (ext_page_title, wikiprefix, ext_page_title, note))

    def check_for_missing_backlinks(self, wiki):
        """Check every author page linking to `wiki` for a return link.

        wiki -- 'wikipedia' or 'wikiquote'.  Any other value prints an error
                and returns without doing anything.

        Progress is printed to stdout; problem pages are appended to
        self.out_filelist.  Returns None.
        """
        settings = self._WIKI_SETTINGS.get(wiki)
        if settings is None:
            print('(ERR) Unknown wiki: %s' % wiki)
            return

        wikiname = settings['wikiname']
        wikiprefix = settings['wikiprefix']
        header_regex = settings['header_regex']
        search_regex = settings['search_regex']
        ext_site = wikipedia.getSite(*settings['site'])

        ws_pages_with_links = query_ext.PreloadingCategory(
            settings['category'], recurse=False, filtered_cat=[], site=self.ws_site)

        for ws_page in ws_pages_with_links:  # for every author page with a link
            ws_page_title = ws_page['title']

            print('INF: Processing page: %s' % ws_page_title)

            ws_page = wikipedia.Page(self.ws_site, ws_page_title)  # get the page
            ws_page_text = ws_page.get()  # extract wikitext

            # look for the name of the external article in the author header
            ext_page_title = self._find_linked_title(header_regex, ws_page_text)

            if ext_page_title is None:  # failed to find, skip this one
                print("\t(INF) %s article not found" % wikiname)
                continue

            print("\tINF: Found %s page: %s" % (wikiname, ext_page_title))

            ext_page = wikipedia.Page(ext_site, ext_page_title)  # get the external page
            try:
                ext_page_text = ext_page.get(get_redirect=True)  # extract wikitext
            except wikipedia.NoPage:
                print("INF: Linked %s page doesn't exist" % wikiname)
                self._record(ext_page_title, wikiprefix, ' (Non-existent page)')
                continue  # nothing to search in
            except wikipedia.SectionError:
                print("\tINF: Linked %s section doesn't exist" % wikiname)
                self._record(ext_page_title, wikiprefix, ' (Non-existent section)')
                continue  # nothing to search in

            # look for a {{wikisource author}} template on the external page
            if self._has_backlink(search_regex, ext_page_text):
                print("\tINF: %s --> Wikisource link found." % wikiname)
            else:  # didn't find it, record the page
                print("\tINF: %s --> Wikisource link not found." % wikiname)
                self._record(ext_page_title, wikiprefix)

        # Push the results to disk even if the caller never closes the file.
        self.out_filelist.flush()



if __name__ == '__main__':
    # Constructing the checker opens the output file and the site objects.
    blc = WS_External_Backlink_Checker()
    try:
        # Run the Wikipedia check; without this call the script only
        # creates an empty output file.
        blc.check_for_missing_backlinks('wikipedia')
    finally:
        # Close the report so buffered entries reach disk even on error.
        blc.out_filelist.close()