User:Inductiveload/make djvu.py

From Wikisource
Jump to navigation Jump to search
#! /usr/bin/env python3
# A hacky script to convert a directory of images into an OCR'd DJVU file

import argparse
import logging


import os
import multiprocessing
import concurrent.futures
import subprocess

import sexpdata
import xml.etree.ElementTree as ET
import tempfile


def get_dir_list_with_exts(d, want_exts):
    """Return the sorted paths of the regular files in *d* with a wanted extension.

    Every entry of directory *d* is joined onto *d*. An entry is kept when it
    is a regular file and its lower-cased extension (including the leading
    dot) is contained in *want_exts*.
    """
    matches = []

    for entry in os.listdir(d):
        path = os.path.join(d, entry)
        suffix = os.path.splitext(path)[1]

        if os.path.isfile(path) and suffix.lower() in want_exts:
            matches.append(path)

    return sorted(matches)


def im_generic_convert(src, dst):
    """Convert image *src* to *dst* by running the external ``convert`` command.

    The target format is left to ``convert`` (only the two paths are passed).
    The exit status of the command is not inspected.
    """
    message = "IM Conversion {} -> {}".format(src, dst)
    logging.debug(message)

    subprocess.call(["convert", src, dst])


def convert_img(src, dst):
    """Convert image *src* to *dst* when their formats differ.

    Extensions are compared case-insensitively, matching how the input
    directory is filtered. When both paths have the same extension nothing is
    done. JP2 and PNG sources can be converted to JPG/JPEG/PNM; any other
    combination is skipped with a warning so the missing output is not a
    silent surprise later in the pipeline.
    """
    src_ext = os.path.splitext(src)[1].lower()
    dst_ext = os.path.splitext(dst)[1].lower()

    if dst_ext == src_ext:
        # nothing to convert
        return

    if src_ext in (".jp2", ".png") and dst_ext in (".jpg", ".jpeg", ".pnm"):
        im_generic_convert(src, dst)
    else:
        logging.warning("No conversion available for {} -> {}".format(src, dst))


def make_djvu_page(src, djvu, max_size):
    """Encode image *src* as the single-page DjVu file *djvu* using ``c44``.

    *src* must be a JPG/JPEG/PNM file; the extension is checked
    case-insensitively so that e.g. ``scan.JPG`` is accepted. When *max_size*
    is not None it is passed to ``c44`` through its ``-size`` option.

    Raises RuntimeError when the source extension is not supported. The exit
    status of ``c44`` is not inspected.
    """
    logging.debug("Making DJVU page {} -> {}".format(src, djvu))

    src_ext = os.path.splitext(src)[1].lower()

    if src_ext not in (".jpg", ".jpeg", ".pnm"):
        raise RuntimeError("Can't convert {} -> {}".format(src, djvu))

    cmd = ["c44"]

    if max_size is not None:
        cmd += ["-size", str(max_size)]

    cmd += [src, djvu]

    subprocess.call(cmd)


def do_ocr(src, dest):
    """Run ``tesseract`` on image *src*, requesting hOCR output for *dest*.

    ``tesseract`` is given an output base name rather than a file name, so a
    trailing ``.hocr`` is stripped from *dest*. When *dest* has no such
    suffix it is used as the base name unchanged (previously this case failed
    with an unbound local variable).

    The exit status of ``tesseract`` is not inspected.
    """
    logging.debug("OCR {} -> {}".format(src, dest))

    langs = ["eng"]

    suffix = ".hocr"
    if dest.endswith(suffix):
        output_ne = dest[:-len(suffix)]
    else:
        output_ne = dest

    cmd = ["tesseract", src, output_ne, "-l", "+".join(langs), "hocr"]

    subprocess.call(cmd)


def insert_hocr_to_djvu(hocr, djvu):
    """Build the text-layer s-expression from *hocr* and store it in *djvu*.

    Nothing is written when no s-expression could be produced from the hOCR
    file (``make_djvu_ocr_sexp`` returned None).
    """
    logging.debug("Merging OCR to DJVUs")

    text_layer = make_djvu_ocr_sexp(hocr)
    if text_layer is None:
        return

    set_page_djvu_ocr(djvu, text_layer)


def get_hocr_bbox(e, pg_bbox=None):
    """Extract the ``bbox`` property from the ``title`` of hOCR element *e*.

    The title is a ``;``-separated property list such as
    ``"bbox 10 20 110 220; x_wconf 95"``. The first property starting with
    ``bbox`` is parsed into a list of four integers.

    When *pg_bbox* (the page's own box) is given, the two vertical
    coordinates are mirrored against ``pg_bbox[3]`` and re-ordered so that
    ``box[1] <= box[3]``.

    Returns None when the element has no title, no bbox property, a bbox
    that is not four integers, or a box with zero width or height.
    """
    title = e.attrib.get('title', '')

    parts = [x.split()[1:] for x in title.split(";") if x.strip().startswith("bbox")]

    try:
        box = [int(x) for x in parts[0]]
    except (IndexError, ValueError):
        # no bbox property at all, or non-integer coordinates
        return None

    if len(box) != 4:
        return None

    if pg_bbox is not None:
        n3 = pg_bbox[3] - box[1]
        n1 = pg_bbox[3] - box[3]

        box[1] = min(n1, n3)
        box[3] = max(n1, n3)

    # zero-sized bbox
    if box[1] == box[3] or box[0] == box[2]:
        return None

    return box


def bbox_to_str(bbox):
    """Format the first four entries of *bbox* as a space-separated string."""
    first, second, third, fourth = bbox[0], bbox[1], bbox[2], bbox[3]
    return "{} {} {} {}".format(first, second, third, fourth)


def innertext(tag):
    """Return the text of *tag*, its descendants' text, and *tag*'s own tail.

    Missing ``text``/``tail`` values count as empty strings. Note that the
    tail of *tag* itself (the text following its closing tag) is included.
    """
    pieces = [tag.text or '']

    for child in tag:
        pieces.append(innertext(child))

    pieces.append(tag.tail or '')

    return ''.join(pieces)


def make_djvu_ocr_sexp(hocrf):
    """Convert the hOCR file *hocrf* into a DjVu text-layer s-expression.

    The XHTML hOCR tree is walked page -> ``ocr_carea`` -> ``ocr_par`` ->
    ``ocr_line`` -> ``ocrx_word`` and mirrored as nested
    ``page``/``column``/``para``/``line``/``word`` lists, each starting with
    its symbol and four bbox coordinates. Boxes below the page level are
    obtained via ``get_hocr_bbox`` with the page box, which mirrors the
    vertical axis. Elements without a usable bbox, words without text and
    containers that end up empty are dropped.

    Returns the dumped s-expression string, or None when the file has no
    ``ocr_page`` element, the page has no usable bbox, or no text was found.
    """
    root = ET.parse(hocrf)

    namespaces = {'html': 'http://www.w3.org/1999/xhtml'}
    page = root.find('.//html:div[@class="ocr_page"]', namespaces=namespaces)

    if page is None:
        logging.warning("No ocr_page element in {}".format(hocrf))
        return None

    pg_bbox = get_hocr_bbox(page)

    if pg_bbox is None:
        logging.warning("No usable page bbox in {}".format(hocrf))
        return None

    data = [sexpdata.Symbol("page")] + pg_bbox

    for col in page.findall('./html:div[@class="ocr_carea"]', namespaces=namespaces):

        bbox = get_hocr_bbox(col, pg_bbox)

        if bbox is None:
            continue

        col_d = [sexpdata.Symbol("column")] + bbox

        for par in col.findall('./html:p[@class="ocr_par"]', namespaces=namespaces):

            # use the paragraph's own box (this used to read the column's box)
            bbox = get_hocr_bbox(par, pg_bbox)
            if bbox is None:
                continue
            para_d = [sexpdata.Symbol("para")] + bbox

            for line in par.findall('./html:span[@class="ocr_line"]', namespaces=namespaces):
                bbox = get_hocr_bbox(line, pg_bbox)
                if bbox is None:
                    continue
                line_data = [sexpdata.Symbol("line")] + bbox

                for word in line.findall('./html:span[@class="ocrx_word"]', namespaces=namespaces):

                    bbox = get_hocr_bbox(word, pg_bbox)
                    if bbox is None:
                        continue

                    text = innertext(word).strip()
                    if text != "":
                        line_data.append([sexpdata.Symbol("word")] + bbox + [text])

                # symbol + 4 coordinates == 5 entries; more means it has children
                if len(line_data) > 5:
                    para_d.append(line_data)

            if len(para_d) > 5:
                col_d.append(para_d)

        if len(col_d) > 5:
            data.append(col_d)

    if len(data) <= 5:
        return None

    return sexpdata.dumps(data)


def set_page_djvu_ocr(djvu_page, sexpr_text):
    """Replace the hidden text of page 1 of *djvu_page* with *sexpr_text*.

    Two ``djvused`` runs are made, each saving the file (``-s``): the first
    removes any existing text from page 1, the second loads the new text
    from a temporary file holding *sexpr_text*.

    The temporary file is written as UTF-8 explicitly so that non-ASCII OCR
    text does not depend on the process locale. The exit status of
    ``djvused`` is not inspected.
    """
    logging.debug("Inserting OCR into " + djvu_page)

    cmd = ["djvused", djvu_page, "-e", "select 1; remove-txt", "-s"]
    subprocess.call(cmd)

    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as sexp_f:

        sexp_f.write(sexpr_text)
        # make sure djvused sees the full content when it opens the file by name
        sexp_f.flush()

        cmd = ["djvused", djvu_page, "-e", "select 1; set-txt {}".format(sexp_f.name), "-s"]
        logging.debug(cmd)
        subprocess.call(cmd)


def process_page(img, tempdir, params):
    """Turn one source image into an OCR'd single-page DjVu inside *tempdir*.

    Steps: pick the files used as DjVu source and OCR source (converting the
    image when needed), encode the DjVu page, run OCR, then merge the hOCR
    result into the DjVu page.

    *params* must contain ``"max_page_size"`` (may be None), which is handed
    to ``make_djvu_page``.

    Returns a dict with the keys ``origimg``, ``djvu``, ``hocr``, ``djvusrc``
    and ``ocrsrc`` holding the paths involved.

    Raises RuntimeError for an unsupported image extension. Extensions are
    matched case-insensitively.
    """
    ext = os.path.splitext(img)[1].lower()

    # outputs are named after the full input file name, e.g. "p1.png.djvu"
    dest_root = os.path.join(tempdir, os.path.basename(img))

    logging.debug("Processing {}".format(img))

    conv_info = {
        "origimg": img,
        "djvu": dest_root + ".djvu",
        "hocr": dest_root + ".hocr"
    }

    if ext in ['.jpg', '.jpeg']:
        # usable as-is for both steps
        conv_info['djvusrc'] = img
        conv_info['ocrsrc'] = img
    elif ext in [".jp2"]:
        # convert to JPG and use that for both steps
        conv_info['djvusrc'] = dest_root + ".jpg"
        conv_info['ocrsrc'] = dest_root + ".jpg"
    elif ext in [".png"]:
        # DjVu encoding needs a PNM, OCR reads the PNG directly
        conv_info['djvusrc'] = dest_root + ".pnm"
        conv_info['ocrsrc'] = img
    else:
        raise RuntimeError("Unsupported file " + img)

    # first, make the djvu src file
    if img != conv_info['djvusrc']:
        convert_img(img, conv_info['djvusrc'])

    # only convert again when the OCR source is yet another file
    if conv_info['ocrsrc'] not in [conv_info['djvusrc'], img]:
        convert_img(img, conv_info["ocrsrc"])

    # then make the djvu
    make_djvu_page(conv_info["djvusrc"], conv_info["djvu"], params["max_page_size"])

    # then do the OCR on the OCR source file
    do_ocr(conv_info['ocrsrc'], conv_info['hocr'])

    insert_hocr_to_djvu(conv_info["hocr"], conv_info["djvu"])

    return conv_info


def convert_pages(files, tempdir, params):
    """Run ``process_page`` for every file in *files* on a thread pool.

    One worker per CPU is used. Results are collected in the order of
    *files* (not in completion order), so the returned list keeps the page
    order of the input. A page whose processing raises an exception is
    logged and left out of the result.

    Returns the list of per-page dicts returned by ``process_page``.
    """
    threads = multiprocessing.cpu_count()

    page_data = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:

        futures = [(f, executor.submit(process_page, f, tempdir, params))
                   for f in files]

        logging.debug("Image Conversions submitted")

        # Waiting in submission order keeps the pages sorted; all jobs are
        # already running, so this does not serialise the work.
        for f, future in futures:
            try:
                page_data.append(future.result())
            except Exception as exc:
                logging.error('%r generated an exception: %s', f, exc)

        logging.debug("Image Conversions complete")

    return page_data


def create_djvu_from_pages(djvu_files, out_djvu, delete_when_done):
    """Bundle the single-page files *djvu_files* into *out_djvu* with ``djvm``.

    An existing *out_djvu* is removed first. With no page files nothing is
    created and None is returned. After a successful ``djvm -c`` run the page
    files are deleted if *delete_when_done* is true.

    Returns the base names of the page files.
    Raises ValueError when ``djvm`` exits with a non-zero status.
    """
    logging.debug("Concatenating djvu files to {}".format(out_djvu))

    # always start without a stale output file
    if os.path.exists(out_djvu):
        os.remove(out_djvu)

    page_count = len(djvu_files)
    if page_count == 0:
        return None

    status = subprocess.call(["djvm", "-c", out_djvu, *djvu_files])

    if status != 0:
        raise ValueError("djvm returned {}".format(status))

    if delete_when_done:
        for page_file in djvu_files:
            os.remove(page_file)

    # the files may be gone, but the names are still useful
    # for the XML
    names = []
    for page_file in djvu_files:
        names.append(os.path.basename(page_file))
    return names


def main():
    """Command-line entry point: convert a directory of images to one DjVu.

    Parses the arguments, prepares the working directory, processes every
    JPG/JPEG/JP2/PNG file of the input directory into an OCR'd DjVu page and
    bundles the pages into the output file.

    NOTE(review): --jp2-only, --skip-convert and --skip-ocr are parsed but
    not acted upon anywhere in this function.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='show debugging information')
    parser.add_argument('-i', '--in-dir', required=True,
                        help='Image directory')
    parser.add_argument('-2', '--jp2-only', action="store_true",
                        help='Exit after JP2 conversion (if any)')
    parser.add_argument('-C', '--skip-convert', action="store_true",
                        help='Skip image conversion')
    parser.add_argument('-O', '--skip-ocr', action="store_true",
                        help='Skip OCR conversion')
    parser.add_argument('-t', '--tempdir',
                        help='The working directory, if not given a temp dir is created')
    parser.add_argument('-o', '--out-djvu',
                        help='The output file, if not given, it goes in the tempdir')
    parser.add_argument('-s', '--djvu-size', type=int,
                        help='The output file max size in MB (not guaranteed)')

    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level)

    if args.tempdir is None:
        tempdir = tempfile.mkdtemp()
    else:
        os.makedirs(args.tempdir, exist_ok=True)
        tempdir = args.tempdir

    # honour the documented default: without -o the result goes in the tempdir
    out_djvu = args.out_djvu
    if out_djvu is None:
        out_djvu = os.path.join(tempdir, "out.djvu")

    files = get_dir_list_with_exts(args.in_dir, [".jpg", ".jpeg", ".jp2", ".png"])

    if not files:
        logging.warning("No images found in {}".format(args.in_dir))

    if args.djvu_size and files:
        # per-page budget in bytes; guarded so an empty directory cannot
        # cause a division by zero
        max_page_size = (args.djvu_size * 1024 * 1024) // len(files)
    else:
        max_page_size = None

    params = {
        "max_page_size": max_page_size,
    }

    page_data = convert_pages(files, tempdir, params)

    djvu_files = [x['djvu'] for x in page_data]
    create_djvu_from_pages(djvu_files, out_djvu, False)

    if djvu_files:
        logging.info("Output written to {}".format(out_djvu))


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()