User:Inductiveload/Scripts/Page concatenator

From Wikisource
Jump to: navigation, search
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#       untitled.py
#
#       Copyright 2010 Inductiveload
#
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.

import pw_script_header
import wikipedia
import codecs
import re

# Text file read by main(); every line is stripped and used as a wiki page title.
FILELIST = "/home/john/src/pw/zz_gendata.txt"
# Title of the wiki page that main() uploads the concatenated text to
# (only after the user confirms at the prompt).
DESTINATION = 'User:Inductiveload/Sandbox7'
# Local file that main() writes the concatenated text to before prompting.
OUTFILE = '/tmp/concatfile.txt'


def find_matching_braces(text, first_brace_index):
    """Find where the bracketed span opened at first_brace_index ends.

    The character at ``text[first_brace_index]`` must be one of ``{``, ``[``,
    ``(`` or ``<``.  Nested pairs of the same bracket type are tracked, so
    for ``{{a|{{b}}}}`` the scan stops only after the outermost pair closes.

    Returns the index one past the closing bracket that balances the
    opening one (i.e. an exclusive end suitable for slicing
    ``text[first_brace_index:result]``), or None if the character is not a
    supported bracket or the brackets never balance.  Problems are also
    reported on stdout.
    """
    closers = {'{': '}', '[': ']', '(': ')', '<': '>'}

    lbrace = text[first_brace_index]
    rbrace = closers.get(lbrace)
    if rbrace is None:
        print("(ERR) not a brace")
        return None

    # Nesting depth of lbrace/rbrace pairs seen so far.
    depth = 0
    for offset, char in enumerate(text[first_brace_index:]):
        if char == lbrace:
            depth += 1
        elif char == rbrace:
            depth -= 1
            if depth == 0:
                # Stop at the first point of balance so that an adjacent
                # bracketed span (e.g. "{{a}}{{b}}") is not swallowed too.
                return first_brace_index + offset + 1

    print("(ERR) unbalanced brackets")
    return None

def find_header(wikitext):
    """Locate the first ``{{header ...}}`` template in wikitext.

    The template is recognised by the pattern ``{{`` + optional whitespace +
    ``header`` (case-sensitive); its end is found by balancing braces with
    find_matching_braces().

    Returns a ``(start, end)`` tuple where ``wikitext[start:end]`` is the
    header template.  Returns ``(0, 0)`` if no header is present or if its
    braces never balance, so that ``wikitext[end:]`` is then the whole text.
    Progress and errors are reported on stdout.
    """
    m = re.search(r'({{\s*header)', wikitext)
    if not m:
        print("(ERR) Header not found.")
        return 0, 0

    header_start = m.start(1)
    print("(INF) Header found, starting at %d" % header_start)

    header_end = find_matching_braces(wikitext, header_start)
    if header_end is None:
        # The brace matcher signals failure with None; formatting that
        # with %d would raise, so treat it like a missing header instead.
        print("(ERR) Header has no matching closing braces.")
        return 0, 0

    print("(INF) header ends at char: %d" % header_end)
    return header_start, header_end

def main():
    """Concatenate the wiki pages listed in FILELIST and optionally upload them.

    For every non-blank line of FILELIST the page of that title is fetched
    through the imported ``wikipedia`` module, everything up to the end of
    its ``{{header}}`` template is dropped (see find_header), and all
    ``===`` / ``==`` markers are removed.  The remaining texts are joined
    in list order, written to ``/tmp/wikt.txt`` and to OUTFILE, and - if the
    user answers ``y`` at the prompt - saved to the DESTINATION page.

    Returns 0.
    """
    parts = []
    # Context managers make sure the list file is closed even if a page
    # fetch raises part-way through.
    with codecs.open(FILELIST, 'r', 'utf-8') as filelist:
        site = wikipedia.getSite()

        for line in filelist:
            title = line.strip()
            if not title:
                # A blank line is not a page title; skip it.
                continue

            print("(INF) processing page: %s" % title)

            page = wikipedia.Page(site, title)
            newtext = page.get()

            # Only the end offset matters: keep what follows the header.
            _, end = find_header(newtext)
            newtext = newtext[end:]

            # Remove "===" before "==" so no stray "=" is left behind.
            newtext = newtext.replace("===", "").replace("==", "")

            parts.append(newtext)

    text = ''.join(parts)

    with codecs.open('/tmp/wikt.txt', 'w', 'utf-8') as scratch:
        scratch.write(text)

    print("(INF) Saving text to %s" % OUTFILE)
    with codecs.open(OUTFILE, 'w', 'utf-8') as outfile:
        outfile.write(text)

    print("(INF) Uploading concatenated pages to %s." % DESTINATION)
    cont = raw_input("continue? [y/n]")

    if cont == 'y':
        page = wikipedia.Page(site, DESTINATION)
        page.put(text, 'Uploading concatenated pages to %s.'% DESTINATION, minorEdit=False)

    return 0

# Run the concatenation only when executed as a script (not on import);
# the value returned by main() is not used.
if __name__ == '__main__':
    main()