User:Inductiveload/Scripts/Pagewise DJVU OCR extractor

From Wikisource
Jump to: navigation, search

This script creates a text file for each page of a DJVU.

Parameters:

-i is the input DJVU file
-o is the output directory
-p is the number of pages (the script will do the first "p" pages)
-d is an optional debug flag

Example [edit]

pagewise-ocr.py -i "C:\Documents and Settings\Me\My Documents\input.djvu" -p 100 -o "C:\Documents and Settings\John\My Documents\inputOCR"

Source code [edit]

#!/usr/bin/python
 
import os
import optparse
import subprocess
 
def main():
    """Parse the command line and run the page-wise OCR extraction.

    Recognised options (read from sys.argv):
      -i  input DJVU file (required)
      -p  number of pages to process, counted from page 1 (required)
      -o  output directory (required)
      -d  optional debug flag

    If any required option is missing, a message and the help text are
    printed and SystemExit(-1) is raised.  Otherwise the parsed options are
    handed to PagewiseOCR, which performs the extraction.
    """
    parser = optparse.OptionParser(
        usage='Usage: %prog -i <input DJVU> -p <number of pages> '
              '-o <output directory> [-d]')
    parser.add_option('-i', dest='input', action='store',
                      help='input DJVU (required)')
    # type='int' makes optparse reject a non-numeric page count with a clear
    # error instead of failing later with a ValueError traceback.
    parser.add_option('-p', dest='pages', action='store', type='int',
                      help='number of pages (required)')
    parser.add_option('-o', dest='output', action='store',
                      help='output directory (required)')
    parser.add_option('-d', dest='debug', action='store_true', default=False,
                      help='debug flag')

    (opts, args) = parser.parse_args()

    # Check mandatory options; each entry pairs the parsed value with the
    # message shown when that option was not supplied.
    required = (
        (opts.input, "The input file '-i' must be given\n"),
        (opts.pages, "The number of pages (-p) must be given\n"),
        (opts.output, "The output directory '-o' must be given\n"),
    )
    for value, message in required:
        if value is None:
            print(message)
            parser.print_help()
            # Same effect as exit(-1), without relying on the site module.
            raise SystemExit(-1)

    PagewiseOCR(opts)
 
class PagewiseOCR():
    """Dump the text layer of a DJVU file into one text file per page.

    Constructing an instance performs the whole job: for every page from 1
    to ``opts.pages`` the DjVuLibre ``djvutxt`` tool is run and its output is
    written to ``OCRoutput_NNNN.txt`` inside ``opts.output``.

    ``opts`` must provide ``input`` (DJVU path), ``pages`` (page count,
    anything accepted by ``int()``), ``output`` (directory path) and
    ``debug`` (truthy to print progress).  ``opts.djvuDir`` is set on the
    object if it is not already present.
    """

    # Directory of the DjVuLibre executables <--CHANGE ME
    DEFAULT_DJVU_DIR = r"c:\program files\djvuzone\djvulibre"

    def __init__(self, opts):

        self.opts = opts

        # Define the djvu directory; a value already present on opts wins so
        # callers can point at another DjVuLibre install without editing
        # this file.
        if getattr(self.opts, 'djvuDir', None) is None:
            self.opts.djvuDir = self.DEFAULT_DJVU_DIR

        # Make sure the output directory exists before any file is opened.
        if not os.path.isdir(self.opts.output):
            os.makedirs(self.opts.output)

        djvutxt = os.path.join(self.opts.djvuDir, 'djvutxt')

        # For every page in the requested range (1-based, inclusive).
        for page in range(1, int(self.opts.pages) + 1):

            filename = os.path.join(self.opts.output,
                                    'OCRoutput_%04d.txt' % page)

            if self.opts.debug:
                print('\tProcessing page %d' % page)

            # Create (or truncate) the output file up front, as the original
            # script did, so a file exists for the page even if djvutxt
            # fails to write one.
            with open(filename, 'w'):
                pass

            cmd = [djvutxt, '-page=' + str(page), self.opts.input, filename]
            status = subprocess.call(cmd)

            # Report a failed extraction instead of ignoring it; processing
            # continues with the remaining pages.
            if status != 0:
                print('djvutxt exited with status %d for page %d'
                      % (status, page))
 
 
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script,
    # not when this file is imported as a module.
    main()