User:Inductiveload/Scripts/WP backlink finder

From Wikisource
Jump to navigation Jump to search

This program finds author pages which link to Wikipedia using the "wikipedia" field in the {{author}} template, but which do not have a return link from Wikipedia to the author page.

It takes no parameters; the input and output files are specified in the script:

python find_wp_author_without_backlinks.py

Source[edit]

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#       find_wp_author_without_backlinks.py
#
#       This script finds authors which have a link to Wikipedia or Wikiquote
#       but no return link from that site. (Commons is not checked yet.)
#
#       It uses User:Phe's query_ext.py script to find a list of pages in the
#       relevant categories
#
#       A list of authors and corresponding WP articles without backlinks
#       is returned as the output file.
#

import pw_script_header #adds the pywikipedia directory to the python PATH
import wikipedia
import query_ext

import codecs
import re

class WS_External_Backlink_Checker():
    """Report Wikisource author pages whose sister-project article lacks a backlink.

    Constructing an instance runs the whole job: it opens the output file,
    checks the Wikipedia and Wikiquote author categories in turn, and closes
    the output file again (even if a check raises).

    For every author page in the relevant category, the target article is
    read from the author template field, fetched from the external site and
    searched for a ``{{wikisource author}}``-style template.  Author pages
    whose target has no such template (or does not exist) are written to the
    output file as wikitext list items.
    """

    # wiki key -> (display name, interwiki prefix, site family,
    #              Wikisource category, regex capturing the template field)
    _WIKI_SETTINGS = {
        'wikipedia': (
            'Wikipedia',
            'w',
            'wikipedia',
            'Category:Author pages linking to Wikipedia',
            r"\| *[Ww]ikipedia *= *(.*)",
        ),
        'wikiquote': (
            'Wikiquote',
            'q',
            'wikiquote',
            'Category:Author pages linking to Wikiquote',
            r"\| *[Ww]ikiquote *= *(.*)",
        ),
    }

    # Template on the external page that links back to the author page.
    _BACKLINK_REGEX = r"{{ *[Ww]ikisource[ -]?author"

    @staticmethod
    def _extract_field_value(field_regex, page_text):
        """Return the cleaned value captured by *field_regex* in *page_text*.

        Returns None when the field is absent, and an empty string when the
        field is present but has no usable value.  The value is cut at the
        first ``|``, ``{`` or ``}`` (none of which can occur in a page title)
        so that a template written on a single line does not leak following
        parameters or the closing braces into the title, then stripped of
        surrounding whitespace.
        """
        match = re.search(field_regex, page_text)
        if not match:
            return None
        raw_value = match.group(1)
        return re.split(r"[|{}]", raw_value, maxsplit=1)[0].strip()

    def _record(self, ws_page_title, wikiprefix, ext_page_title, note=''):
        """Write one report line linking the author page to its external target.

        *note* is appended verbatim after the interwiki link (it should carry
        its own leading space), e.g. ``' (Non-existent page)'``.
        """
        self.out_filelist.write(
            '* [[%s]] ------> [[%s:%s]]%s\n'
            % (ws_page_title, wikiprefix, ext_page_title, note)
        )

    def check_for_missing_backlinks(self, wiki):
        """Scan one sister project for author pages that are not linked back.

        wiki -- 'wikipedia' or 'wikiquote'.  Any other value prints an error
                and returns without doing anything.

        Reads ``self.ws_site`` and writes results to ``self.out_filelist``.
        """
        settings = self._WIKI_SETTINGS.get(wiki)
        if settings is None:
            print('(ERR) Unknown wiki: %s' % wiki)
            # Nothing is configured for this wiki, so there is nothing to scan.
            return

        wikiname, wikiprefix, family, category, header_regex = settings
        ext_site = wikipedia.getSite("en", family)

        ws_pages_with_links = query_ext.PreloadingCategory(category, recurse = False, filtered_cat = [], site = self.ws_site)

        for ws_page in ws_pages_with_links: # every author page with an external link
            ws_page_title = ws_page['title']

            print('INF: Processing page: %s' % ws_page_title)

            ws_page_text = wikipedia.Page(self.ws_site, ws_page_title).get() # wikitext

            # Name of the external article, taken from the author template.
            ext_page_title = self._extract_field_value(header_regex, ws_page_text)

            if not ext_page_title: # field missing or empty, skip this one
                print("\t(INF) %s article not found" % wikiname)
                continue

            print("\tINF: Found %s page: %s" % (wikiname, ext_page_title))

            ext_page = wikipedia.Page(ext_site, ext_page_title)
            try:
                # get_redirect=True: read a redirect page's own text instead
                # of raising on it.
                ext_page_text = ext_page.get(get_redirect=True)
            except wikipedia.NoPage:
                print("INF: Linked %s page doesn't exist" % wikiname)
                self._record(ws_page_title, wikiprefix, ext_page_title, ' (Non-existent page)')
                continue
            except wikipedia.SectionError:
                print("\tINF: Linked %s section doesn't exist" % wikiname)
                self._record(ws_page_title, wikiprefix, ext_page_title, ' (Non-existent section)')
                continue

            if re.search(self._BACKLINK_REGEX, ext_page_text):
                print("\tINF: %s --> Wikisource link found." % wikiname)
            else: # no {{wikisource author}} template, record the page
                print("\tINF: %s --> Wikisource link not found." % wikiname)
                self._record(ws_page_title, wikiprefix, ext_page_title)

    def __init__(self):
        """Open the output file, run both checks, then close the file."""

        # deal with output files, change filenames as needed
        out_filelistname    = '/home/john/src/pw/zz_filelist1.txt'
        self.out_filelist = codecs.open(out_filelistname, 'w', 'utf-8')

        try:
            # set up the sites we will be looking at
            self.ws_site  = wikipedia.getSite("en", "wikisource")
            self.cm_site  = wikipedia.getSite("commons", "commons")

            self.check_for_missing_backlinks('wikipedia')
            self.check_for_missing_backlinks('wikiquote')
        finally:
            # Always release the handle so partial results reach the disk
            # even when a check fails half-way through.
            self.out_filelist.close()

if __name__ == '__main__':
    # Instantiating the checker performs the complete scan as a side effect
    # of its constructor, so nothing else needs to be called here.
    checker = WS_External_Backlink_Checker()