import logging
import os
from emop.lib.processes.processes_base import ProcessesBase
from emop.lib.utilities import mkdirs_exists_ok, exec_cmd
# Module-level logger; messages from this module are sent to the 'emop' logger.
logger = logging.getLogger('emop')
class Tesseract(ProcessesBase):
    """Run the ``tesseract`` command line tool against a job's page image.

    The command is given the job's image path, an output base name derived
    from ``job.xml_file`` (extension removed), the job's font name as the
    ``-l`` argument and the ``tess_cfg.txt`` config file located in
    ``job.settings.emop_home``.
    """

    def __init__(self, job):
        """Store the job and derive the paths used by :meth:`run`.

        Side effect: sets the ``TESSDATA_PREFIX`` environment variable of the
        current process to the parent directory of
        ``job.settings.tesseract_tessdata_dir``.

        Args:
            job: Job object handed to ``ProcessesBase.__init__``. This class
                reads ``settings``, ``xml_file``, ``hocr_file``, ``txt_file``,
                ``image_path``, ``font`` and ``page_result`` from it.
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed, because
        # self.__class__ is then the subclass rather than Tesseract.
        super(Tesseract, self).__init__(job)

        settings = self.job.settings
        self.tessdata_prefix = os.path.dirname(settings.tesseract_tessdata_dir)
        os.environ["TESSDATA_PREFIX"] = self.tessdata_prefix
        self.cfg = os.path.join(settings.emop_home, "tess_cfg.txt")

        # Strip file extension, tesseract auto-appends it
        self.output_filename = os.path.splitext(self.job.xml_file)[0]
        self.output_parent_dir = os.path.dirname(self.job.xml_file)

    def should_run(self):
        """Tell whether OCR output is still missing for this job's page.

        Returns:
            bool: ``False`` when both ``ocr_text_path_exists`` and
            ``ocr_xml_path_exists`` of the job's page result are truthy,
            ``True`` otherwise.
        """
        page_result = self.job.page_result
        return not (page_result.ocr_text_path_exists
                    and page_result.ocr_xml_path_exists)

    def run(self):
        """Execute tesseract and record the output paths on the page result.

        Returns:
            The value of ``self.results(...)`` (defined outside this class;
            presumably by ``ProcessesBase`` -- verify there). It is called
            with ``exitcode=1`` and an explanatory ``stderr`` when the image
            path is unset or is not an existing file, with the command's own
            stdout/stderr/exitcode when tesseract exits non-zero, and with
            ``exitcode=0`` on success.
        """
        image_path = self.job.image_path
        if not image_path:
            stderr = "No image path could be determined"
            return self.results(stdout=None, stderr=stderr, exitcode=1)
        if not os.path.isfile(image_path):
            stderr = "Could not find page image %s" % image_path
            return self.results(stdout=None, stderr=stderr, exitcode=1)

        # Create output parent directory if it doesn't exist
        if not os.path.isdir(self.output_parent_dir):
            mkdirs_exists_ok(self.output_parent_dir)

        cmd = [
            "tesseract",
            image_path,
            self.output_filename,
            "-l",
            self.job.font.name,
            self.cfg,
        ]
        proc = exec_cmd(cmd)
        if proc.exitcode != 0:
            return self.results(stdout=proc.stdout, stderr=proc.stderr,
                                exitcode=proc.exitcode)

        # Rename hOCR file to XML; an already existing XML file is left
        # untouched.
        hocr_file = self.job.hocr_file
        xml_file = self.job.xml_file
        if os.path.isfile(hocr_file) and not os.path.isfile(xml_file):
            logger.debug("Renaming %s to %s", hocr_file, xml_file)
            os.rename(hocr_file, xml_file)

        self.job.page_result.ocr_text_path = self.job.txt_file
        self.job.page_result.ocr_xml_path = xml_file
        return self.results(stdout=None, stderr=None, exitcode=0)