User:Inductiveload/Scripts/DJVU OCR

From Wikisource
Jump to: navigation, search

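This is a script to perform Tesseract-based OCR on a DJVU file. It requires that you have Tesseract and DjVuLibre installed. If you want to convert pages to bitonal images (-c), you also need ImageMagick and the textcleaner script, which is run as ./textcleaner from the current directory.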

Inspired by this Perl script. This script is designed for Linux, but it can be modified to run on Windows by changing the file paths as needed.

To OCR a DJVU file and update its text layer with the new text, use -u. Without -u, the DJVU file is left unaltered.
 python ocr-djvu-tesseract.py -i ~/INFILE.djvu -u
Enable debugging mode (reports progress), use -d
 python ocr-djvu-tesseract.py -i ~/INFILE.djvu -d
Enable Tesseract output, use -t
 python ocr-djvu-tesseract.py -i ~/INFILE.djvu -t
To clean each page with textcleaner and convert it to bitonal with ImageMagick (the threshold is fixed at 50%), use -c
 python ocr-djvu-tesseract.py -i ~/INFILE.djvu -c
Output a human-readable transcript of the DJVU file to a given location
 python ocr-djvu-tesseract.py -i ~/INFILE.djvu -o ~/TRANSCRIPT.txt
Use a Tesseract language other than English
 python ocr-djvu-tesseract.py -i ~/INFILE.djvu -l LANG

The code[edit]

#!/usr/bin/env python
 
import glob
import io
import optparse
import os
import re
import subprocess
import sys
 
 
def main(argv=None):
    """Parse the command line and run the OCR job.

    argv -- optional list of command-line arguments (without the program
            name).  When None, optparse falls back to sys.argv[1:], so
            calling main() with no arguments behaves as a normal script.

    If the mandatory -i option is missing, an error and the option help
    are written to stderr and the process exits with status -1.
    """

    parser = optparse.OptionParser(
        usage='Usage: %prog -i <source djvu file> [options] [-o <output file>]')
    parser.add_option('-i', dest='djvu', action='store',
                      help='the source djvu file to perform OCR on')
    parser.add_option('-l', dest='lang', action='store', default='eng',
                      help="OCR language (default: 'eng')")
    parser.add_option('-d', dest='debug', action='store_true', default=False,
                      help='enable debugging information')
    parser.add_option('-t', dest='tess_out', action='store_true', default=False,
                      help='enable tesseract output')
    parser.add_option('-c', dest='clean', action='store_true', default=False,
                      help='use textcleaner and imagemagick to convert the image to bitonal black and white')
    parser.add_option('-o', dest='output', action='store',
                      help='output a human readable text file to a given file path')
    parser.add_option('-u', dest='update', action='store_true', default=False,
                      help='update the djvu file text layer')

    # positional arguments are not used by this script
    opts, _args = parser.parse_args(argv)

    # check mandatory options
    if opts.djvu is None:
        sys.stderr.write("The input file '-i' must be given\n\n")
        parser.print_help(sys.stderr)
        # sys.exit rather than the site-provided exit(), which is only
        # meant for interactive sessions and is not always available
        sys.exit(-1)

    # constructing the object runs the whole job
    DjvuTesseract(opts)
 
 
class DjvuTesseract(object):
    """Run Tesseract OCR over every page of a DjVu file.

    Creating an instance runs the whole job immediately: the page count is
    read with djvused, then each page is extracted with ddjvu, optionally
    cleaned up, OCR'd with tesseract, appended to a human-readable
    transcript and, if requested, written back into the DjVu text layer.

    The ``opts`` object must provide these attributes (as produced by the
    option parser in main()): djvu, lang, debug, tess_out, clean, output
    and update.

    The code is written to run on both Python 2.6+ and Python 3.
    """

    def __init__(self, opts):
        """Set up the scratch file paths, open the transcript and run the job."""
        self.opts = opts

        self.temp_img = "/tmp/TESSERACT-OCR-TEMP.tif"
        self.temp_ocr = "/tmp/TESSERACT-OCR-TEMP"  # tesseract adds .txt

        self.ocr_text = self.temp_ocr + '.txt'

        # file to dump page-wise formatted OCR'd text into
        self.djvu_text = "/tmp/TESSERACT-OCR-TEMP.djvu.txt"

        # file to dump human readable output into for the whole file
        if self.opts.output:
            output_filename = self.opts.output
        else:  # dump in /tmp/
            output_filename = "/tmp/TESSERACT-OCR-TEMP.output.txt"

        # io.open gives the same explicit-encoding text file on Python 2 and 3
        self.out_text = io.open(output_filename, 'w', encoding='utf-8')

        try:
            self.process_djvu()
        finally:
            # make sure the transcript is flushed and closed even on failure
            self.out_text.close()

    def command(self, command, out=False, err=False):
        """Run an external command, wait for it, and return (stdout, stderr).

        subprocess.Popen is used directly because subprocess.check_output
        does not exist on Python 2.6.

        command -- the program and its arguments, as a list
        out     -- if true, capture the command's stdout and return it;
                   otherwise stdout is inherited and the returned value
                   is None
        err     -- if true, let the command's stderr through to the
                   terminal (returned value is None); otherwise stderr is
                   captured, which keeps e.g. tesseract's chatter off the
                   screen

        The exit status of the command is not checked.
        """

        std_out = subprocess.PIPE if out else None
        std_err = None if err else subprocess.PIPE

        proc = subprocess.Popen(command, stdout=std_out, stderr=std_err)
        return proc.communicate()

    def _progress(self, message):
        """Print a progress message without a newline if debugging is on.

        The output is flushed so the message is visible while the (slow)
        step it announces is still running; complete() finishes the line.
        """

        if self.opts.debug:
            sys.stdout.write(message + " ")
            sys.stdout.flush()

    def _remove_if_exists(self, path):
        """Delete a scratch file left over from an earlier page or run."""

        if os.path.exists(path):
            os.remove(path)

    def complete(self):
        """Prints a "complete" message if debugging is on"""

        if self.opts.debug:
            print("complete")

    def calculate_djvu_length(self):
        """Ask djvused for the number of pages and store it in self.num_pages.

        Raises ValueError if djvused does not print a page count (for
        example because the input file is missing or is not a DjVu file).
        """

        cmd = ['djvused', self.opts.djvu, '-e', 'n']
        out, err = self.command(cmd, out=True)

        try:
            self.num_pages = int(out)
        except (TypeError, ValueError):
            raise ValueError("could not read the page count of %r with djvused"
                             % self.opts.djvu)

        if self.opts.debug:
            print("(INF) number of pages: %d" % self.num_pages)

    def format_ocr_text(self, page):
        """Format a page's OCR'd text into a DJVU friendly form

        Reads the text file tesseract wrote (self.ocr_text), appends it
        under a page heading to the transcript (self.out_text), and writes
        the djvused text-layer description to self.djvu_text: one "page"
        element containing one "line" element per line of OCR text.

        page -- the 1-based page number, used for the transcript heading
        """

        pieces = [u"(page 0 0 1 1\n"]

        self.out_text.write(u'\n## Page %d ###\n\n' % page)

        # read out of the text file that tesseract made
        with io.open(self.ocr_text, 'r', encoding='utf-8') as ocr_text:
            for line in ocr_text:

                # write to the human readable file
                self.out_text.write(line)

                # add each line of text, escaping \ to \\ and " to \" as we
                # go so the text stays inside its quoted string
                escaped = line.replace('\\', '\\\\').replace('"', '\\"').strip()
                pieces.append(u'(line 0 0 1 1 "%s")\n' % escaped)

        pieces.append(u")\n")

        with io.open(self.djvu_text, 'w', encoding='utf-8') as djvu_text:
            djvu_text.write(u"".join(pieces))

    def process_pages(self):
        """OCR each page in turn and optionally update the DjVu text layer.

        A page for which no OCR output file appears is reported on stderr
        and skipped.
        """

        for page in range(1, self.num_pages + 1):  # djvu pages are 1-indexed

            if self.opts.debug:
                print("\n\t(INF) Processing page %d" % page)

            # Clear the previous page's scratch files first: the external
            # tools' exit statuses are not checked, so a failed step would
            # otherwise silently reuse the previous page's image or text.
            self._remove_if_exists(self.temp_img)
            self._remove_if_exists(self.ocr_text)

            # Extract the page to an image
            self._progress("\t(INF) Extracting DjVu page to image . . .")
            cmd = ['ddjvu', '-format=tiff', '-page=%d' % page, self.opts.djvu, self.temp_img]
            self.command(cmd)
            self.complete()

            # Cleanup image
            if self.opts.clean:

                # apply text cleaner (script expected in the current directory)
                self._progress("\t(INF) Applying textcleaner . . .")
                cmd = ['./textcleaner', self.temp_img, self.temp_img]
                self.command(cmd)
                self.complete()

                # convert to bitonal with ImageMagick
                self._progress("\t(INF) Applying bitonal conversion . . .")
                cmd = ['convert', self.temp_img, '-threshold', '50%', self.temp_img]
                self.command(cmd)
                self.complete()

            # Perform OCR on the image
            self._progress("\t(INF) Beginning OCR. . .")
            cmd = ['tesseract', self.temp_img, self.temp_ocr, '-l', self.opts.lang]
            self.command(cmd, err=self.opts.tess_out)
            self.complete()

            if not os.path.exists(self.ocr_text):
                sys.stderr.write("(WRN) no OCR output for page %d, skipping\n" % page)
                continue

            # convert the OCR'd text to a DJVU friendly format and a human-friendly format
            self.format_ocr_text(page)

            # update the DJVU text layer
            if self.opts.update:

                self._progress("\t(INF) Updating DJVU page . . .")

                # replace the text in the DJVU file
                cmd = ['djvused', self.opts.djvu, '-e', 'select %d; remove-txt' % page, "-s"]
                self.command(cmd)

                cmd = ['djvused', self.opts.djvu, '-e', 'select %d; set-txt %s' % (page, self.djvu_text), "-s"]
                self.command(cmd)

                self.complete()

    def process_djvu(self):
        """Process the whole DjVu file: count its pages, then OCR them."""

        if self.opts.debug:
            print("(INF) Processing %s" % self.opts.djvu)

        # calculate DJVU length
        self.calculate_djvu_length()

        self.process_pages()
 
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script,
    # not when the module is imported.
    main()
 
"""
# Note: a text-layer structure that is known to work
# (written here as print statements to a TXTDJVU file handle):
# print TXTDJVU "(page 0 0 1 1\n" ;
#   print TXTDJVU "     (line 0 0 1 1 \"toto\")\n" ;
#   print TXTDJVU "     (line 0 0 1 1 \"toto la la\")\n";
#   print TXTDJVU ")\n" ;
"""