# User:Inductiveload/Scripts/Page namespace editor
#
# From Wikisource
# Jump to: navigation, search
import pw_script_header
import wikipedia
import codecs
import re
 
# Name of the fix to apply. NOTE(review): not referenced by any code visible
# in this file -- confirm whether it is still needed.
FIX = 'newline'
# UTF-8 text file that main() reads, one page title per line.
FILE= r'/home/john/src/pw/zz_filelist0.txt'
# Edit summary that main() passes to page.put() for every saved page.
SUMMARY =  "[bot] Tidying formatting."
 
def decomposePage(wikiText):
    """Split Page: namespace wikitext into header, body and footer.

    The expected layout is::

        <noinclude>HEADER</noinclude>BODY<noinclude>FOOTER</noinclude>

    The header is the *first* ``<noinclude>`` section and the footer is the
    *last* one, so any ``<noinclude>`` sections that occur inside the body
    stay part of the body.  Whitespace before the header or after the footer
    is tolerated and is not part of any returned piece.

    Args:
        wikiText: the complete wikitext of the page.

    Returns:
        A ``(header, body, footer)`` tuple holding the text between the
        markers, or ``None`` (after printing a diagnostic) when the text
        does not have the expected layout.
    """
    # \A / \Z anchor to the whole text rather than to any line, so nothing
    # outside the match can be silently discarded when the page is rebuilt.
    # Header and footer are lazy and the body greedy: the header stops at the
    # first </noinclude> and the footer starts at the last <noinclude>.
    # The pattern is pure ASCII, so a plain raw string is used; a ur''
    # literal does not parse on Python 3.
    regex = re.compile(
        r'\A\s*<noinclude>(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>\s*\Z',
        re.DOTALL)
    m = regex.match(wikiText)

    if m is None:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Can't find header, body, footer")
        return None

    return m.group(1), m.group(2), m.group(3)
 
 
def composePage(header, body, footer):
    """Assemble Page: namespace wikitext from its three parts.

    The header and the footer are each wrapped in a ``<noinclude>`` section
    and the body is placed, unwrapped, between them.

    Args:
        header: text for the leading ``<noinclude>`` section.
        body: text placed between the two sections.
        footer: text for the trailing ``<noinclude>`` section.

    Returns:
        The combined wikitext as one string.
    """
    template = '<noinclude>%s</noinclude>%s<noinclude>%s</noinclude>'
    pieces = (header, body, footer)
    return template % pieces
 
def process_body(body):
    """Unwrap hard-wrapped lines in the page body.

    Every single line break that sits directly between two non-newline
    characters is replaced by one space, together with any spaces that
    immediately precede the break.  Runs of two or more line breaks
    (paragraph breaks) and line breaks at the very start or end of the text
    are left untouched.

    Args:
        body: the body text of the page.

    Returns:
        The body with single line breaks turned into spaces.
    """
    # Lookarounds test the neighbouring characters without consuming them.
    # A pattern that captures them cannot use a character as both the end of
    # one match and the start of the next, so consecutive one-character
    # lines ("a\nb\nc") would only be partly joined.
    # The pattern is pure ASCII, so a plain raw string is used; a ur''
    # literal does not parse on Python 3.
    return re.sub(r'(?<=[^\n]) *\n(?=[^\n])', ' ', body)
 
def process_header(header):
    """Return the page header unchanged.

    Args:
        header: the header text of the page.

    Returns:
        The same object that was passed in.
    """
    # Pass-through: no header transformation is applied.
    processed = header
    return processed
 
def process_footer(footer):
    """Return the page footer unchanged.

    Args:
        footer: the footer text of the page.

    Returns:
        The same object that was passed in.
    """
    # Pass-through: no footer transformation is applied.
    processed = footer
    return processed
 
def main():
    """Interactively tidy every page whose title is listed in FILE.

    For each title the page wikitext is fetched, split with decomposePage(),
    passed through process_header() / process_body() / process_footer() and
    rebuilt with composePage().  The difference and the new text are shown,
    and the page is saved with SUMMARY as a minor edit only if the operator
    answers yes at the prompt.

    Blank lines in FILE are ignored.  A page that decomposePage() cannot
    split is skipped rather than aborting the run.
    """
    # Read the whole title list up front so the file is closed before any
    # network work starts.  Lines are stripped because iterating a file
    # yields each line with its trailing newline attached.
    with codecs.open(FILE, 'r', 'utf-8') as in_file:
        page_titles = [line.strip() for line in in_file]

    ws_site = wikipedia.getSite("en", "wikisource")

    for page_title in page_titles:
        if not page_title:
            continue  # blank line in the list file

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('(INF) Processing page: %s' % page_title)

        page = wikipedia.Page(ws_site, page_title)  # get the page
        old_wikitext = page.get()  # extract wikitext

        parts = decomposePage(old_wikitext)
        if parts is None:
            # decomposePage() has already printed a diagnostic; unpacking
            # None would raise TypeError and stop the remaining pages.
            continue
        header, body, footer = parts

        body = process_body(body)
        header = process_header(header)
        footer = process_footer(footer)

        # Rebuild a well formed Page: namespace page.
        new_wikitext = composePage(header, body, footer)

        wikipedia.showDiff(old_wikitext, new_wikitext)

        print(new_wikitext)

        # raw_input: this script targets Python 2.
        cont = raw_input("Upload? [y/n]: ")
        if cont in ['y', 'Y', 'yes', 'Yes']:
            page.put(new_wikitext, SUMMARY, minorEdit=True)
 
# Run the bot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()