User:Inductiveload/make djvu.py
Jump to navigation
Jump to search
#! /usr/bin/env python3
# A hacky script to convert a directory of images into an OCR'd DJVU file
import argparse
import logging
import os
import multiprocessing
import concurrent.futures
import subprocess
import sexpdata
import xml.etree.ElementTree as ET
import tempfile
def get_dir_list_with_exts(d, want_exts):
    """Return the sorted paths of regular files in *d* with a wanted extension.

    Args:
        d: Directory to scan (not recursive).
        want_exts: Container of lower-case extensions including the leading
            dot, e.g. ``[".jpg", ".png"]``.

    Returns:
        A sorted list of paths, each formed by joining *d* with the entry
        name. The extension comparison is done on the lower-cased extension.
    """
    matches = []
    for entry in os.listdir(d):
        path = os.path.join(d, entry)
        # Directories and other non-regular entries are never candidates.
        if not os.path.isfile(path):
            continue
        extension = os.path.splitext(path)[1].lower()
        if extension in want_exts:
            matches.append(path)
    return sorted(matches)
def im_generic_convert(src, dst):
    """Convert image *src* to *dst* by running the external ``convert`` tool.

    The output format is left to ``convert`` (presumably ImageMagick, going
    by the function name and log message), which is given only the two paths.

    Args:
        src: Path of the existing source image.
        dst: Path of the image to create.

    A non-zero exit status is logged as a warning rather than raised, so a
    single bad page does not abort the whole run.
    """
    logging.debug("IM Conversion {} -> {}".format(src, dst))
    cmd = ["convert", src, dst]
    rc = subprocess.call(cmd)
    if rc != 0:
        # Previously the status was dropped and the failure only showed up
        # later as a missing file in some other tool.
        logging.warning(
            "convert exited with status {} for {} -> {}".format(rc, src, dst))
def convert_img(src, dst):
    """Convert image *src* into the format implied by the extension of *dst*.

    Extensions are compared case-insensitively. When both files have the
    same extension nothing is done (no copy is made).

    Args:
        src: Path of the source image.
        dst: Path of the image to create.

    Raises:
        RuntimeError: If there is no known conversion for the extension
            pair. Previously such a request was silently ignored, leaving
            *dst* missing for later steps.
    """
    src_ext = os.path.splitext(src)[1].lower()
    dst_ext = os.path.splitext(dst)[1].lower()

    if dst_ext == src_ext:
        # Same format: nothing to convert.
        return

    if src_ext in (".jp2", ".png") and dst_ext in (".jpg", ".jpeg", ".pnm"):
        im_generic_convert(src, dst)
        return

    raise RuntimeError("Can't convert {} -> {}".format(src, dst))
def make_djvu_page(src, djvu, max_size):
    """Encode image *src* as a single-page DjVu file using ``c44``.

    Args:
        src: Path of a ``.jpg``, ``.jpeg`` or ``.pnm`` image. The extension
            is matched case-insensitively, in line with the directory scan
            that also accepts upper-case extensions.
        djvu: Path of the DjVu file to write.
        max_size: Value passed to ``c44 -size``, or ``None`` to omit the
            option.

    Raises:
        RuntimeError: If *src* does not have a supported extension.

    A non-zero exit status from ``c44`` is logged as a warning.
    """
    logging.debug("Making DJVU page {} -> {}".format(src, djvu))

    src_ext = os.path.splitext(src)[1].lower()
    if src_ext not in (".jpg", ".jpeg", ".pnm"):
        raise RuntimeError("Can't convert {} -> {}".format(src, djvu))

    cmd = ["c44"]
    if max_size is not None:
        cmd += ["-size", str(max_size)]
    cmd += [src, djvu]

    rc = subprocess.call(cmd)
    if rc != 0:
        logging.warning("c44 exited with status {} for {}".format(rc, src))
def do_ocr(src, dest, langs=None):
    """Run ``tesseract`` on *src* and write hOCR output to *dest*.

    ``tesseract`` is given an output base name and the ``hocr`` config, so
    *dest* must end in ``.hocr``; the base name is *dest* without that
    suffix.

    Args:
        src: Path of the image to recognise.
        dest: Path of the hOCR file to produce; must end in ``.hocr``.
        langs: Language code or list of codes passed to ``-l`` joined with
            ``+``. Defaults to ``["eng"]``.

    Raises:
        ValueError: If *dest* does not end in ``.hocr``. Previously this
            case failed with an ``UnboundLocalError``.

    A non-zero exit status from ``tesseract`` is logged as a warning.
    """
    logging.debug("OCR {} -> {}".format(src, dest))

    suffix = ".hocr"
    if not dest.endswith(suffix):
        raise ValueError("OCR output must end in .hocr: {}".format(dest))

    if langs is None:
        langs = ["eng"]
    elif isinstance(langs, str):
        # A bare string would otherwise be joined character by character.
        langs = [langs]

    output_ne = dest[:-len(suffix)]
    cmd = ["tesseract", src, output_ne, "-l", "+".join(langs), "hocr"]
    rc = subprocess.call(cmd)
    if rc != 0:
        logging.warning("tesseract exited with status {} for {}".format(rc, src))
def insert_hocr_to_djvu(hocr, djvu):
    """Convert the hOCR file *hocr* to a text layer and store it in *djvu*.

    Nothing is written to the DjVu file when the hOCR conversion yields
    ``None``.

    Args:
        hocr: Path of the hOCR file.
        djvu: Path of the single-page DjVu file to update.
    """
    logging.debug("Merging OCR to DJVUs")
    text_layer = make_djvu_ocr_sexp(hocr)
    if text_layer is None:
        return
    set_page_djvu_ocr(djvu, text_layer)
def get_hocr_bbox(e, pg_bbox=None):
    """Extract the ``bbox`` of an hOCR element as a list of four ints.

    The box is read from the ``bbox x0 y0 x1 y1`` property inside the
    element's ``title`` attribute (properties are separated by ``;``).

    Args:
        e: Element exposing an ``attrib`` mapping.
        pg_bbox: Optional page box. When given, the vertical coordinates
            are flipped relative to ``pg_bbox[3]`` so that the result is
            measured from the opposite page edge, with ``box[1] <= box[3]``.

    Returns:
        ``[x0, y0, x1, y1]``, or ``None`` when the element has no ``title``,
        no well-formed ``bbox`` property, or a box of zero width or height.
        Previously a missing title or bbox escaped as an uncaught
        ``KeyError`` / ``IndexError``.
    """
    title = e.attrib.get("title")
    if title is None:
        return None

    box = None
    for prop in title.split(";"):
        fields = prop.split()
        if fields and fields[0] == "bbox":
            try:
                box = [int(v) for v in fields[1:5]]
            except ValueError:
                return None
            break

    if box is None or len(box) != 4:
        return None

    if pg_bbox is not None:
        flipped_top = pg_bbox[3] - box[1]
        flipped_bottom = pg_bbox[3] - box[3]
        box[1] = min(flipped_top, flipped_bottom)
        box[3] = max(flipped_top, flipped_bottom)

    # zero-sized bbox
    if box[1] == box[3] or box[0] == box[2]:
        return None

    return box
def bbox_to_str(bbox):
    """Return the first four entries of *bbox* separated by single spaces."""
    coords = (bbox[i] for i in range(4))
    return "{} {} {} {}".format(*coords)
def innertext(tag):
    """Return all text inside *tag*, recursively, followed by its own tail.

    Missing ``text`` / ``tail`` values count as empty strings. Note that the
    tail of *tag* itself (text after its closing tag) is included.
    """
    pieces = [tag.text or ""]
    for child in tag:
        pieces.append(innertext(child))
    pieces.append(tag.tail or "")
    return "".join(pieces)
def make_djvu_ocr_sexp(hocrf):
    """Build a DjVu text-layer S-expression from an hOCR file.

    The hOCR hierarchy ``ocr_page > ocr_carea > ocr_par > ocr_line >
    ocrx_word`` is mapped to nested ``page > column > para > line > word``
    lists. Each list starts with its symbol and four bounding-box values;
    word lists additionally end with the word's stripped text. Boxes below
    the page level are obtained from ``get_hocr_bbox`` with the page box.

    Elements without a usable bounding box, words without text, and
    containers that end up with no children are dropped.

    Args:
        hocrf: Path (or file object) of the XHTML hOCR document.

    Returns:
        The S-expression as a string produced by ``sexpdata.dumps``, or
        ``None`` when the document has no page element, the page has no
        bounding box, or no text was found.
    """
    namespaces = {'html': 'http://www.w3.org/1999/xhtml'}
    # Symbol + four coordinates; anything longer has at least one child.
    header_len = 5

    def children(parent, tag, css_class):
        """Return the direct *tag* children of *parent* with *css_class*."""
        query = './html:{}[@class="{}"]'.format(tag, css_class)
        return parent.findall(query, namespaces=namespaces)

    root = ET.parse(hocrf)
    page = root.find('.//html:div[@class="ocr_page"]', namespaces=namespaces)
    if page is None:
        logging.warning("No ocr_page element found in {}".format(hocrf))
        return None

    pg_bbox = get_hocr_bbox(page)
    if pg_bbox is None:
        logging.warning("No page bounding box found in {}".format(hocrf))
        return None

    data = [sexpdata.Symbol("page")] + pg_bbox

    for col in children(page, "div", "ocr_carea"):
        col_bbox = get_hocr_bbox(col, pg_bbox)
        if col_bbox is None:
            continue
        col_d = [sexpdata.Symbol("column")] + col_bbox

        for par in children(col, "p", "ocr_par"):
            # Use the paragraph's own box (the column's box was used here
            # before, giving every paragraph its column's extent).
            par_bbox = get_hocr_bbox(par, pg_bbox)
            if par_bbox is None:
                continue
            para_d = [sexpdata.Symbol("para")] + par_bbox

            for line in children(par, "span", "ocr_line"):
                line_bbox = get_hocr_bbox(line, pg_bbox)
                if line_bbox is None:
                    continue
                line_data = [sexpdata.Symbol("line")] + line_bbox

                for word in children(line, "span", "ocrx_word"):
                    word_bbox = get_hocr_bbox(word, pg_bbox)
                    if word_bbox is None:
                        continue
                    text = innertext(word).strip()
                    if text != "":
                        line_data.append(
                            [sexpdata.Symbol("word")] + word_bbox + [text])

                if len(line_data) > header_len:
                    para_d.append(line_data)

            if len(para_d) > header_len:
                col_d.append(para_d)

        if len(col_d) > header_len:
            data.append(col_d)

    if len(data) <= header_len:
        return None

    return sexpdata.dumps(data)
def set_page_djvu_ocr(djvu_page, sexpr_text):
    """Replace the text layer of page 1 of *djvu_page* using ``djvused``.

    Any existing text is removed first, then *sexpr_text* is written to a
    temporary file and loaded with ``set-txt``. Both steps save the file
    in place (``-s``).

    Args:
        djvu_page: Path of the DjVu file to modify.
        sexpr_text: Text-layer S-expression as a string.

    Non-zero exit statuses from ``djvused`` are logged as warnings.
    """
    logging.debug("Inserting OCR into " + djvu_page)

    cmd = ["djvused", djvu_page, "-e", "select 1; remove-txt", "-s"]
    rc = subprocess.call(cmd)
    if rc != 0:
        logging.warning(
            "djvused remove-txt exited with status {} for {}".format(rc, djvu_page))

    # Write UTF-8 explicitly so non-ASCII OCR text does not depend on the
    # locale's default encoding.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as sexp_f:
        sexp_f.write(sexpr_text)
        # djvused opens the file by name, so the data must be on disk first.
        sexp_f.flush()

        cmd = ["djvused", djvu_page, "-e",
               "select 1; set-txt {}".format(sexp_f.name), "-s"]
        logging.debug(cmd)
        rc = subprocess.call(cmd)
        if rc != 0:
            logging.warning(
                "djvused set-txt exited with status {} for {}".format(rc, djvu_page))
def process_page(img, tempdir, params):
    """Turn one source image into an OCR'd single-page DjVu in *tempdir*.

    Steps: pick the intermediate files for the image type, convert the
    image where needed, encode the DjVu page, run OCR, and merge the OCR
    result into the page.

    Args:
        img: Path of the source image (``.jpg``, ``.jpeg``, ``.jp2`` or
            ``.png``; the extension is matched case-insensitively).
        tempdir: Directory that receives all generated files. Their names
            are the image's base name (including its extension) plus a
            further suffix.
        params: Dict providing ``"max_page_size"`` for the DjVu encoder.

    Returns:
        A dict with the keys ``origimg``, ``djvu``, ``hocr``, ``djvusrc``
        and ``ocrsrc`` holding the paths involved.

    Raises:
        RuntimeError: If the image type is not supported. Previously the
            exception was created but never raised, so the failure showed
            up as a ``KeyError`` further down.
    """
    ext = os.path.splitext(img)[1].lower()
    dest_root = os.path.join(tempdir, os.path.basename(img))

    logging.debug("Processing {}".format(img))

    conv_info = {
        "origimg": img,
        "djvu": dest_root + ".djvu",
        "hocr": dest_root + ".hocr",
    }

    if ext in (".jpg", ".jpeg"):
        # Usable as-is for both the DjVu encoder and OCR.
        conv_info["djvusrc"] = img
        conv_info["ocrsrc"] = img
    elif ext == ".jp2":
        # Convert to JPG and use that for both steps.
        conv_info["djvusrc"] = dest_root + ".jpg"
        conv_info["ocrsrc"] = dest_root + ".jpg"
    elif ext == ".png":
        # The encoder gets a PNM copy; OCR runs on the original PNG.
        conv_info["djvusrc"] = dest_root + ".pnm"
        conv_info["ocrsrc"] = img
    else:
        raise RuntimeError("Unsupported file " + img)

    # First, make the DjVu source file.
    if img != conv_info["djvusrc"]:
        convert_img(img, conv_info["djvusrc"])

    # Make a separate OCR source only if it is neither of the files we
    # already have.
    if conv_info["ocrsrc"] not in [conv_info["djvusrc"], img]:
        convert_img(img, conv_info["ocrsrc"])

    # Then make the DjVu page.
    make_djvu_page(conv_info["djvusrc"], conv_info["djvu"], params["max_page_size"])

    # Then do the OCR on the OCR source file and merge it in.
    do_ocr(conv_info["ocrsrc"], conv_info["hocr"])
    insert_hocr_to_djvu(conv_info["hocr"], conv_info["djvu"])

    return conv_info
def convert_pages(files, tempdir, params):
    """Process every image in *files* concurrently and return the results.

    One ``process_page`` task is submitted per file to a thread pool sized
    to the CPU count.

    Args:
        files: Image paths, in page order.
        tempdir: Working directory handed to ``process_page``.
        params: Parameter dict handed to ``process_page``.

    Returns:
        The ``process_page`` results in the same order as *files*. Pages
        whose processing raised are reported on stdout and left out.
    """
    threads = multiprocessing.cpu_count()
    page_data = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(process_page, f, tempdir, params)
                   for f in files]
        logging.debug("Image Conversions submitted")

        # Collect in submission order, not completion order: the caller
        # builds the document from this list, so it must stay in page order.
        for future in futures:
            try:
                page_data.append(future.result())
            except Exception as exc:
                # Best effort: one bad page must not abort the others.
                # KeyboardInterrupt is not an Exception and still propagates.
                print('%r generated an exception: %s' % (future, exc))

    logging.debug("Image Conversions complete")
    return page_data
def create_djvu_from_pages(djvu_files, out_djvu, delete_when_done):
    """Bundle single-page DjVu files into *out_djvu* with ``djvm -c``.

    An existing *out_djvu* is removed first, even when there is nothing to
    bundle.

    Args:
        djvu_files: Page files, in document order.
        out_djvu: Path of the document to create.
        delete_when_done: If true, remove the page files after a
            successful run.

    Returns:
        The base names of *djvu_files*, or ``None`` when the list is empty.

    Raises:
        ValueError: If ``djvm`` exits with a non-zero status.
    """
    logging.debug("Concatenating djvu files to {}".format(out_djvu))

    if os.path.exists(out_djvu):
        os.remove(out_djvu)

    if len(djvu_files) < 1:
        return None

    status = subprocess.call(["djvm", "-c", out_djvu, *djvu_files])
    if status != 0:
        raise ValueError("djvm returned {}".format(status))

    if delete_when_done:
        for page_file in djvu_files:
            os.remove(page_file)

    # The files may be gone, but their names are still useful to callers.
    names = []
    for page_file in djvu_files:
        names.append(os.path.basename(page_file))
    return names
def main():
    """Command-line entry point: images in a directory -> one OCR'd DjVu.

    Parses the arguments, sets up logging and the working directory,
    converts every supported image in ``--in-dir`` and bundles the pages.

    When ``--out-djvu`` is omitted the document is written to ``out.djvu``
    inside the working directory, as the option's help text promises
    (previously this case crashed on a ``None`` path). When no supported
    images are found, an error is logged and nothing is produced
    (previously ``--djvu-size`` then divided by zero).

    The ``--jp2-only``, ``--skip-convert`` and ``--skip-ocr`` flags are
    accepted but not acted upon by this function.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='show debugging information')
    parser.add_argument('-i', '--in-dir', required=True,
                        help='Image directory')
    parser.add_argument('-2', '--jp2-only', action="store_true",
                        help='Exit after JP2 conversion (if any)')
    parser.add_argument('-C', '--skip-convert', action="store_true",
                        help='Skip image conversion')
    parser.add_argument('-O', '--skip-ocr', action="store_true",
                        help='Skip OCR conversion')
    parser.add_argument('-t', '--tempdir',
                        help='The working directory, if not given a temp dir is created')
    parser.add_argument('-o', '--out-djvu',
                        help='The output file, if not given, it goes in the tempdir')
    parser.add_argument('-s', '--djvu-size', type=int,
                        help='The output file max size in MB (not guaranteed)')

    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level)

    files = get_dir_list_with_exts(args.in_dir, [".jpg", ".jpeg", ".jp2", ".png"])
    if not files:
        logging.error("No supported images found in {}".format(args.in_dir))
        return

    if args.tempdir is None:
        tempdir = tempfile.mkdtemp(suffix=None, prefix=None, dir=None)
    else:
        os.makedirs(args.tempdir, exist_ok=True)
        tempdir = args.tempdir

    if args.djvu_size:
        # Spread the requested total (MB) evenly over the pages, in bytes.
        max_page_size = (args.djvu_size * 1024 * 1024) // len(files)
    else:
        max_page_size = None

    out_djvu = args.out_djvu
    if out_djvu is None:
        out_djvu = os.path.join(tempdir, "out.djvu")

    params = {
        "max_page_size": max_page_size,
    }

    page_data = convert_pages(files, tempdir, params)
    djvu_files = [x['djvu'] for x in page_data]

    if create_djvu_from_pages(djvu_files, out_djvu, False) is not None:
        logging.info("Wrote {}".format(out_djvu))
# Run the converter only when this file is executed as a script, so that
# importing it does not trigger argument parsing.
if __name__ == "__main__":
    main()