# User:Phe/Scripts/missing author.py
#
# From Wikisource
# Jump to navigation Jump to search
# -*- coding: utf-8 -*-

import botpywi
import sys
import query_ext
import wikipedia
import re
import create_page

# psyco is an optional dependency: when it can be imported, psyco.full() is
# called (presumably to speed the script up); when it is not installed the
# ImportError is ignored and the script runs without it.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

class Options:
    """Plain attribute holder for settings parsed from the command line."""

# Title prefix shared by the report pages this script writes
# (the u'Missing/...' subpages are appended to it).
base_pagename = u'User:Phe/Author/'

def get_author_list(site):
    """Return the titles of the existing top-level Author: pages.

    Pages are listed with query_ext.PreloadingPagesStartswith() using the
    u'Author:' prefix and 'gapfilterredir' set to 'all' (presumably so that
    redirects are listed too -- confirm against query_ext).  Titles that
    contain a u'/' are left out.

    site -- wiki site object handed to query_ext.

    Returns a set of page titles.
    """
    extraParams = {'gapfilterredir': 'all'}
    gen = query_ext.PreloadingPagesStartswith(u'Author:', site=site,
                                              extraParams=extraParams)

    pages = [x[u'title'] for x in gen if u'/' not in x[u'title']]
    # A single pre-formatted argument prints the same text whether print is
    # a statement (Python 2) or a function (Python 3).
    print('nr pages %d' % len(pages))
    return set(pages)

def filter_link_from_nms(data):
    """Tell whether a linking page counts as a real reference.

    data -- page-info dict; the key u'missing' is present when the page no
            longer exists, otherwise 'ns' holds its namespace number.

    Returns False for missing pages and for pages in namespace 2 or 3
    (user and user talk), True otherwise.
    """
    # The missing check must come first: it avoids an exception if the page
    # has been deleted between PreloadingAllLinksTo() and
    # PreloadingPageinfoFromIds().
    return not (u'missing' in data or data['ns'] in (2, 3))

# Get the links to the Author: namespace, ignoring links that come from
# user or user talk pages.
def get_link_to_author(site):
    """Return the sorted titles of Author: pages that are linked to.

    All links into namespace 102 are fetched with
    query_ext.PreloadingAllLinksTo(); targets whose title contains a u'/'
    are skipped.  The linking pages are then looked up by page id and only
    those accepted by filter_link_from_nms() are kept as valid sources.

    site -- wiki site object handed to query_ext.

    Returns a sorted list of distinct target titles that have at least one
    valid source page.
    """
    gen = query_ext.PreloadingAllLinksTo(102, site)
    temp = [(x['title'], x['fromid'])
            for x in gen if u'/' not in x[u'title']]
    pageids = set(fromid for _, fromid in temp)
    # A single pre-formatted argument prints the same text whether print is
    # a statement (Python 2) or a function (Python 3).
    print('pageids %d' % len(pageids))

    gen = query_ext.PreloadingPageinfoFromIds(site=site, generator=pageids)

    validids = set(x['pageid'] for x in gen if filter_link_from_nms(x))
    print('validids %d' % len(validids))

    # The set removes targets linked from several valid pages.
    return sorted(set(title for title, fromid in temp if fromid in validids))

# Called only if the last word already matches.
def filter_candidate(title, lst):
    """Pick the entries of lst whose leading words look like those of title.

    title -- page title to match, e.g. u'Author:J. Smith'.
    lst   -- titles of existing pages that share the same last word.

    The u'Author:' text is removed and the last word is ignored.  A pair of
    words (one from title, one from a candidate) counts as a hit when both
    start with the same character and at least one of them is at most two
    characters long (an initial such as u'J.').  A candidate is kept when it
    scores two or more hits, or a single hit when title has only one
    leading word.

    Returns the kept candidates, prefixed with u'Author:', sorted.
    """
    def leading_words(page_title):
        # Name without the namespace text, plus every word but the last.
        # Empty words (produced by doubled spaces) are dropped so that
        # indexing word[0] below can never raise IndexError.
        name = page_title.replace(u'Author:', u'')
        return name, [w for w in name.split(u' ')[:-1] if w]

    words = leading_words(title)[1]
    result = []
    for candidate in lst:
        name, new_words = leading_words(candidate)
        found = 0
        for w in words:
            for n in new_words:
                if w[0] == n[0] and (len(w) <= 2 or len(n) <= 2):
                    found += 1
        # Parentheses only spell out the precedence the condition always had.
        if found >= 2 or (found == 1 and len(words) == 1):
            result.append(u'Author:' + name)
    result.sort()
    return result

def get_last_word(words):
    """Return the last word of a split title, looking past a u'Jr.' suffix.

    words -- list of the words of a title.

    When the final word is u'Jr.' the word before it is returned instead,
    with one trailing comma removed if present.
    """
    final = words[-1]
    if final != u'Jr.':
        return final
    # The index is derived from the length on purpose: for a one-word list
    # it evaluates to -1 and wraps around to that only word.
    before = words[len(words) - 2]
    if before.endswith(u','):
        return before[:-1]
    return before

def filter_missing(missing_author, site):
    """Keep the entries whose page is linked from outside user space.

    missing_author -- iterable of (title, candidates) pairs.
    site           -- wiki site object handed to query_ext.

    The backlinks of each title are fetched with
    query_ext.PreloadingBackLinks(); a pair is kept when at least one
    backlink lies outside namespaces 2 and 3.

    Returns a list of (title, candidates) tuples.
    """
    results = []
    for p, lst in missing_author:
        # Progress trace; print(x) with one argument is valid on Python 2
        # and Python 3 alike.
        print(p)
        # NOTE: an extraParams = {u'blnamespace': '0|4|102|104'} restriction
        # was tried in the original and left disabled.
        gen = query_ext.PreloadingBackLinks(p, depth=10, site=site)
        refs = [x for x in gen if x['ns'] not in (2, 3)]
        if refs:
            results.append((p, lst))
    return results

def filter_missing_author(site):
    """Find linked-to Author: pages that do not exist, with look-alikes.

    site -- wiki site object handed to the query helpers.

    Returns a tuple (possible_match, missing_author):
    missing_author -- titles returned by get_link_to_author() that are not
                      in get_author_list(), in the order received.
    possible_match -- sorted list of [title, candidates] for the missing
                      titles that share their last word (see
                      get_last_word()) with existing pages accepted by
                      filter_candidate().
    """
    pages = get_author_list(site)
    all_author = get_link_to_author(site)
    missing_author = [x for x in all_author if x not in pages]

    # Index the existing pages by last word so that each missing title is
    # only compared against pages sharing that word.
    existing_pages = {}
    for p in pages:
        last_word = get_last_word(p.split(u' '))
        existing_pages.setdefault(last_word, []).append(p)

    possible_match = []
    for p in missing_author:
        last_word = get_last_word(p.split(u' '))
        if last_word in existing_pages:
            lst = filter_candidate(p, existing_pages[last_word])
            if lst:
                possible_match.append([p, lst])

    # Disabled in the original: filter_missing() issues one backlink query
    # per entry.  Kept so the extra filtering can be switched back on.
    if False:
        possible_match = filter_missing(possible_match, site)

    possible_match.sort()
    return (possible_match, missing_author)

def split_list_by_letter(lst):
    """Group titles by the character that follows the u'Author:' prefix.

    lst -- iterable of titles starting with u'Author:'.

    Returns a dict mapping each capital letter A-Z that occurs to the list
    of its titles (input order kept); every other first character is
    grouped under the key u'Other'.
    """
    prefix_len = len(u'Author:')
    capitals = u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    groups = {}
    for title in lst:
        initial = title[prefix_len]
        key = u'Other'
        if initial in capitals:
            key = initial
        if key not in groups:
            groups[key] = []
        groups[key].append(title)
    return groups

def paginate_page_list(lst, add_section = True, what_links_here = False):
    """Build the wikitext of a numbered list of {{Search author}} entries.

    lst             -- titles starting with u'Author:'; the prefix is
                       stripped before the name is put in the template.
    add_section     -- when true, a u'== N-M==' heading is emitted before
                       every block of 50 entries.
    what_links_here -- accepted for compatibility with existing callers;
                       it is not used.

    Returns the page text, which always starts with u'__NOTOC__'.
    """
    prefix_len = len(u'Author:')
    # Collect the pieces and join once instead of growing a string with +=
    # on every entry.
    parts = [u'__NOTOC__\n\n']
    for index, f in enumerate(lst):
        if add_section and index % 50 == 0:
            first = index + 1
            parts.append(u'== %d-%d==\n' % (first, first + 49))
        parts.append(u'# {{Search author|' + f[prefix_len:] + u'}}\n')
    return u''.join(parts)

def write_page(pagename, lst, site):
    """Sort lst in place and save its paginated listing as page pagename.

    pagename -- title of the wiki page to write.
    lst      -- list of u'Author:' titles; it is sorted in place.
    site     -- wiki site object handed to create_page.write_page().
    """
    lst.sort()
    wikitext = paginate_page_list(lst, add_section=True,
                                  what_links_here=True)
    create_page.write_page(pagename, wikitext, u'update', True, site=site)

def write_pages(basename, lst, site):
    """Write one page per key of lst, titled basename + key.

    basename -- title prefix of the pages.
    lst      -- dict mapping a key to the list of titles for that page.
    site     -- wiki site object passed on to write_page().
    """
    for key, titles in lst.items():
        write_page(basename + key, titles, site)

def blank_page(pagename, site=None):
    """Replace the content of pagename with u'-'.

    pagename -- title of the page to blank.
    site     -- wiki site object for create_page.write_page().  When it is
                omitted, the module-level `site` is used if one exists
                (it is only defined when the file runs as a script), which
                is what this function relied on before it took the
                parameter.
    """
    if site is None:
        # Fallback for callers that still pass only the page name; avoids
        # the NameError the bare global lookup raised outside script mode.
        site = globals().get('site')
    create_page.write_page(pagename, u'-', u'blanked before update',
                           True, site=site)

# This is required to ensure links to non-existing authors don't come
# from the /Missing/* pages. It's an optimization to avoid doing a
# "what links here" query for each missing author.
def blank_subpage_before_update(site):
    """Blank the Missing/A ... Missing/Z and Missing/Other report pages.

    site -- wiki site object, passed through to blank_page().
    """
    for key in u'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        blank_page(base_pagename + u'Missing/' + key, site)
    blank_page(base_pagename + u'Missing/' + u'Other', site)

def main(site, opt):
    """Compute the missing-author report and publish it.

    site -- wiki site object used for every query and page write.
    opt  -- parsed command-line options; not used here.

    Side effects:
    - writes the local file 'Possible_match' (UTF-8), one line per entry of
      possible_match: u'#[[title]] -- [[candidate]] -- [[candidate]]';
    - writes the number of missing authors to the Missing/Count page;
    - writes one Missing/<key> page per group returned by
      split_list_by_letter().
    """
    # Local import: codecs is only needed for the dump file below.
    import codecs

    # blank_subpage_before_update(site) was left disabled in the original.
    possible_match, missing_author = filter_missing_author(site)

    # The dump must iterate possible_match ([title, candidates] entries);
    # iterating missing_author (plain titles) would index single characters.
    fd = codecs.open('Possible_match', 'w', 'utf-8')
    try:
        for title, candidates in possible_match:
            parts = [u'#[[' + title + u']]']
            parts.extend(u'-- [[' + t + u']]' for t in candidates)
            fd.write(u' '.join(parts) + u'\n')
    finally:
        # Close the file even if a write fails.
        fd.close()

    nr_missing = len(missing_author)
    create_page.write_page(base_pagename + u'Missing/Count',
                           u'%d' % nr_missing,
                           u'update', True, site=site)

    missing_author = split_list_by_letter(missing_author)
    write_pages(base_pagename + u'Missing/', missing_author, site)

# Script entry point: parse -lang:code, run main() on the matching
# wikisource site, and always call wikipedia.stopme() on the way out.
if __name__ == "__main__":
    options = Options()
    # Give the attribute a value up front so that a missing -lang: argument
    # leads to the usage message instead of an AttributeError.
    options.lang = None
    try:
        for arg in sys.argv[1:]:
            if arg.startswith('-lang:'):
                options.lang = arg[len('-lang:'):]
            elif arg == '-help':
                print('%s -lang:code' % sys.argv[0])
                sys.exit(1)

        if not options.lang:
            print('%s -lang:code' % sys.argv[0])
            sys.exit(1)

        # `site` stays a module-level name; other code in this file may
        # look it up as a global.
        site = wikipedia.getSite(options.lang, 'wikisource')

        main(site, options)
    finally:
        wikipedia.stopme()