# Source: wiki page "User:Phe/Scripts/missing author.py"
# (wiki navigation chrome from the page export removed)
# -*- coding: utf-8 -*-
import botpywi
import sys
import query_ext
import wikipedia
import re
import create_page
# Optional speed-up: enable the psyco JIT compiler when it is installed
# (Python 2 only); silently skipped when unavailable.
try:
    import psyco
    psyco.full()
except ImportError:
    pass
class Options:
    # Plain attribute bag for command-line options (e.g. .lang).
    pass

# All generated report pages live under this user subpage.
base_pagename = u'User:Phe/Author/'
def get_author_list(site):
extraParams = { 'gapfilterredir' : 'all' }
gen = query_ext.PreloadingPagesStartswith(u'Author:', site = site, extraParams = extraParams)
pages = [ x[u'title'] for x in gen if not u'/' in x[u'title'] ]
print 'nr pages', len(pages)
return set(pages)
def filter_link_from_nms(data):
    """Return True when *data* (an API page-info dict) refers to a live page
    outside the User (2) and User talk (3) namespaces.

    The u'missing' check avoids an exception if the page has been deleted
    between PreloadingAllLinksTo() and PreloadingPageinfoFromIds().
    """
    # dict.has_key() is deprecated (and gone in Python 3); use 'in' instead.
    if u'missing' in data or data['ns'] in [ 2, 3 ]:
        return False
    return True
# get the links to Author: namespace but not if the links comes from user or
# user talk
def get_link_to_author(site):
gen = query_ext.PreloadingAllLinksTo(102, site)
temp = [ (x['title'], x['fromid']) for x in gen if not u'/' in x[u'title'] ]
pageids = set([ x[1] for x in temp ])
print 'pageids', len(pageids)
gen = query_ext.PreloadingPageinfoFromIds(site = site, generator = pageids)
validids = set([ x['pageid'] for x in gen if filter_link_from_nms(x) ])
print 'validids', len(validids)
results = set([x[0] for x in temp if x[1] in validids])
results = [x for x in results]
results.sort()
return results
# called only if the last word already match
def filter_candidate(title, lst):
result = []
title = title.replace(u'Author:', u'')
words = title.split(u' ')[:-1]
for l in lst:
l = l.replace(u'Author:', u'')
new_words = l.split(u' ')[:-1]
found = 0
for w in words:
for n in new_words:
if w[0] == n[0] and (len(w) <= 2 or len(n) <= 2):
#print title, lst, w, n
found += 1
if found >= 2 or found == 1 and len(words) == 1:
result.append(u'Author:' + l)
result.sort()
return result
def get_last_word(words):
    # Return the surname-like final word: skip a trailing u'Jr.' suffix
    # and strip a trailing comma if present.
    last = words[-1]
    if last == u'Jr.':
        last = words[-2]
    if last.endswith(u','):
        last = last[:-1]
    return last
def filter_missing(missing_author, site):
results = []
for p, lst in missing_author:
print p
#extraParams = { u'blnamespace' : '0|4|102|104' }
gen = query_ext.PreloadingBackLinks(p, depth = 10,
#extraParams=extraParams,
site = site)
refs = [ x for x in gen if not x['ns'] in [ 2, 3 ] ]
if len(refs) != 0:
results.append((p, lst))
return results
def filter_missing_author(site):
    """Return (possible_match, missing_author).

    missing_author: sorted titles linked as Author: pages but not existing.
    possible_match: [title, candidates] pairs where candidates are existing
    Author: pages sharing the same last word and plausibly the same person.
    """
    pages = get_author_list(site)
    all_author = get_link_to_author(site)
    missing_author = [x for x in all_author if x not in pages]
    # Index the existing author pages by their last (surname-like) word.
    existing_pages = {}
    for p in pages:
        last_word = get_last_word(p.split(u' '))
        existing_pages.setdefault(last_word, []).append(p)
    possible_match = []
    for p in missing_author:
        last_word = get_last_word(p.split(u' '))
        # dict.has_key() is deprecated (and gone in Python 3); use 'in'.
        if last_word in existing_pages:
            lst = filter_candidate(p, existing_pages[last_word])
            if len(lst):
                possible_match.append( [ p, lst ] )
    if False:
        # Disabled: expensive per-title backlink filtering.
        possible_match = filter_missing(possible_match, site)
    possible_match.sort()
    return (possible_match, missing_author)
def split_list_by_letter(lst):
    # Bucket Author: titles by the first letter after the u'Author:'
    # prefix; anything outside A-Z goes into the u'Other' bucket.
    buckets = {}
    uppercase = u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    for title in lst:
        initial = title[len(u'Author:')]
        key = initial if initial in uppercase else u'Other'
        buckets.setdefault(key, []).append(title)
    return buckets
def paginate_page_list(lst, add_section = True, what_links_here = False):
    # Render a wiki listing of author names (Author: prefix stripped),
    # with a "== N-M ==" section header every 50 entries when add_section
    # is set. what_links_here is currently unused.
    pieces = [u'__NOTOC__\n\n']
    count = 0
    for title in lst:
        name = title[len(u'Author:'):]
        if add_section:
            count += 1
            if (count - 1) % 50 == 0:
                pieces.append(u'== ' + str(count) + u'-' + str(count + 49) + u'==\n')
        pieces.append(u'# {{Search author|' + name + '}}\n')
    return u''.join(pieces)
def write_page(pagename, lst, site):
    # Sort the titles in place, render the paginated listing and save it.
    lst.sort()
    rendered = paginate_page_list(lst, True, True)
    create_page.write_page(pagename, rendered, u'update', True, site = site)
def write_pages(basename, lst, site):
    # One wiki page per bucket key (letter or u'Other').
    for key, titles in lst.items():
        write_page(basename + key, titles, site)
def blank_page(pagename, site = None):
    """Overwrite *pagename* with a placeholder before regenerating it.

    Bug fix: this previously read a module-global `site` that only exists
    when the script runs as __main__, raising NameError otherwise; the site
    is now passed explicitly (default None keeps the old call signature).
    """
    create_page.write_page(pagename, u'-', u'blanked before update',
                           True, site = site)

# This is required to ensure link to non-existing author doesn't come
# from the /Missing/* page. It's an optimization to avoid to do a
# whatslinkehere for each missing author.
def blank_subpage_before_update(site):
    for key in u'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        blank_page(base_pagename + u'Missing/' + key, site)
    blank_page(base_pagename + u'Missing/' + 'Other', site)
def main(site, opt):
    """Build the missing-author report pages under base_pagename."""
    #blank_subpage_before_update(site)
    possible_match, missing_author = filter_missing_author(site)
    # Dump the fuzzy-match report locally; one line per missing author,
    # followed by its candidate existing pages.
    # Bug fix: this loop previously iterated missing_author (plain title
    # strings, so l[0]/l[1] were single characters) instead of the
    # [title, candidates] pairs in possible_match.
    fd = open('Possible_match', 'w')
    for l in possible_match:
        print >> fd, '#[[' + l[0].encode('utf-8') + ']]',
        for t in l[1]:
            print >> fd, '-- [[' + t.encode('utf-8') + ']]',
        print >> fd
    fd.close()
    # Publish the count, then the per-letter listings, of missing authors.
    nr_missing = len(missing_author)
    create_page.write_page(base_pagename + u'Missing/Count',
                           unicode(nr_missing),
                           u'update', True, site = site)
    missing_author = split_list_by_letter(missing_author)
    write_pages(base_pagename + u'Missing/', missing_author, site)
# Script entry point: parse -lang:xx from the command line, build the
# wikisource site object and run the report.
if __name__ == "__main__":
    options = Options()
    try:
        for arg in sys.argv:
            if arg.startswith('-lang:'):
                options.lang = arg[len('-lang:'):]
            elif arg == '-help':
                print sys.argv[0], '-lang:code'
                sys.exit(1)
        # NOTE(review): if no -lang: argument is given, options.lang is never
        # set and the getSite() call raises AttributeError — confirm intended.
        site = wikipedia.getSite(options.lang, 'wikisource')
        main(site, options)
    finally:
        # Always release the pywikipedia throttle/lock on exit.
        wikipedia.stopme()