User:Inductiveload/Scripts/Page concatenator

From Wikisource
Jump to: navigation, search
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#       untitled.py
#
#       Copyright 2010 Inductiveload
#
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.
 
import pw_script_header
import wikipedia
import codecs
import re
 
# Path of the UTF-8 text file read by main(); each line is used as the title
# of a wiki page to fetch.
FILELIST = "/home/john/src/pw/zz_gendata.txt"
# Title of the wiki page that main() uploads the concatenated text to.
DESTINATION = 'User:Inductiveload/Sandbox7'
# Local file that main() writes the concatenated text to before uploading.
OUTFILE = '/tmp/concatfile.txt'
 
 
def find_matching_braces(text, first_brace_index ):
    """Find the end of the bracketed group opened at first_brace_index.

    The character at ``text[first_brace_index]`` selects the bracket type
    ('{', '[', '(' or '<').  Nested brackets of the same type are counted,
    and the scan stops at the bracket that closes the opening one.

    Returns the index one past the matching closing bracket, so that
    ``text[first_brace_index:result]`` is the whole bracketed group and
    ``text[result:]`` is everything after it.  Returns None (after printing
    an error) if the character at first_brace_index is not an opening
    bracket or if the brackets are unbalanced.
    """

    closers = {'{': '}', '[': ']', '(': ')', '<': '>'}

    lbrace = text[first_brace_index]
    rbrace = closers.get(lbrace)

    if rbrace is None:
        # Bail out here: there is no closing character to look for.
        print("(ERR) not a brace")
        return None

    # nesting depth of the currently open brackets
    depth = 0

    for offset, char in enumerate(text[first_brace_index:]):
        if char == lbrace:
            depth += 1
        elif char == rbrace:
            depth -= 1
            if depth == 0:
                # Stop at the first point the group is balanced, so an
                # adjacent group such as "{{a}}{{b}}" is not swallowed.
                return first_brace_index + offset + 1

    print("(ERR) unbalanced brackets")
    return None
 
def find_header(wikitext):
    """Locate the first ``{{header`` template in wikitext.

    Returns a ``(header_start, header_end)`` tuple.  header_start is the
    index of the opening ``{{``; header_end is the value reported by
    find_matching_braces() for that opening brace, which callers use as the
    slice position where the text after the header begins.

    Returns ``(0, 0)`` (after printing an error) when no header is found or
    when its braces cannot be matched, so that slicing from header_end keeps
    the whole text.
    """

    m = re.search(r'({{\s*header)', wikitext)

    if not m:
        print("(ERR) Header not found.")
        return 0, 0

    # The match already knows its own offset; no need to search again.
    header_start = m.start(1)
    print("(INF) Header found, starting at %d" % header_start)

    header_end = find_matching_braces(wikitext, header_start)

    if header_end is None:
        # find_matching_braces() has already reported why; formatting None
        # with %d would raise TypeError, so treat it like a missing header.
        print("(ERR) Header end not found.")
        return 0, 0

    print("(INF) header ends at char: %d" % header_end)

    return header_start, header_end
 
def main():
    """Concatenate the wiki pages listed in FILELIST and optionally upload.

    For every non-blank line of FILELIST the page with that title is
    fetched, everything up to the end of its header (as reported by
    find_header()) is dropped, and all "===" and "==" markers are removed.
    The combined text is written to /tmp/wikt.txt and to OUTFILE, and, if
    the user answers "y" at the prompt, saved to the DESTINATION page.

    Returns 0.
    """

    site = wikipedia.getSite()

    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = []

    # "with" guarantees the list file is closed, even if a fetch fails.
    with codecs.open(FILELIST, 'r', 'utf-8') as filelist:
        for line in filelist:

            title = line.strip()
            if not title:
                # A blank line is not a page title; skip it.
                continue

            print("(INF) processing page: %s" % title)

            page = wikipedia.Page(site, title)
            newtext = page.get()

            _, end = find_header(newtext)

            newtext = newtext[end:]

            # Remove "===" first so that it does not leave a stray "="
            # behind once "==" is removed.
            newtext = newtext.replace("===", "").replace("==", "")

            pieces.append(newtext)

    text = ''.join(pieces)

    with codecs.open('/tmp/wikt.txt', 'w', 'utf-8') as scratch:
        scratch.write(text)

    print("(INF) Saving text to %s" % OUTFILE)
    with codecs.open(OUTFILE, 'w', 'utf-8') as outfile:
        outfile.write(text)

    print("(INF) Uploading concatenated pages to %s." % DESTINATION)
    cont = raw_input("continue? [y/n]")

    # Accept "y" regardless of case or surrounding whitespace.
    if cont.strip().lower() == 'y':
        page = wikipedia.Page(site, DESTINATION)
        page.put(text, 'Uploading concatenated pages to %s.'% DESTINATION, minorEdit=False)

    return 0
 
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()