Source code for emop.lib.processes.page_corrector

import glob
import json
import os
from emop.lib.utilities import exec_cmd
from emop.lib.processes.processes_base import ProcessesBase


class PageCorrector(ProcessesBase):
    """Process wrapper that runs ``PageCorrector.jar`` on a job's XML file.

    All paths are derived from the job settings' ``seasr_home`` directory and
    the job's temporary directory; tunables are read from the
    ``page-corrector`` settings section.
    """

    def __init__(self, job):
        """Collect paths and settings needed to invoke PageCorrector.

        Args:
            job: Job object handed to ``ProcessesBase``; it must expose
                ``settings`` and ``temp_dir`` (read below).
        """
        # Name the class explicitly: ``super(self.__class__, self)`` resolves
        # to the subclass when this class is inherited from, which makes
        # __init__ call itself forever.
        super(PageCorrector, self).__init__(job)

        settings = self.job.settings
        section = 'page-corrector'

        # Filesystem locations rooted at the SEASR home directory.
        self.home = settings.seasr_home
        self.executable = os.path.join(self.home, "PageCorrector.jar")
        self.cfg = os.path.join(self.job.temp_dir, "emop.properties")
        self.dicts_dir = os.path.join(self.home, "dictionaries")
        self.rules_file = os.path.join(self.home, "rules", "transformations.json")

        # ``java_args`` is stored as a JSON document in the settings.
        self.java_args = json.loads(settings.get_value(section, 'java_args'))
        self.alt_arg = settings.get_value(section, 'alt_arg')
        self.max_transforms = settings.get_value(section, 'max_transforms')
        self.noise_cutoff = settings.get_value(section, 'noise_cutoff')
        self.ctx_min_match = settings.get_value(section, 'ctx_min_match')
        self.ctx_min_vol = settings.get_value(section, 'ctx_min_vol')

        # Optional boolean switches; both default to off.
        self.dump = settings.get_bool_value(section, 'dump', default=False)
        self.save = settings.get_bool_value(section, 'save', default=False)

        self.timeout = settings.page_corrector_timeout

    def should_run(self):
        """Return False only when every PageCorrector output already exists.

        The process is skipped when the post-process health value and both the
        corrected OCR text and XML paths are reported as existing.
        """
        already_done = (
            self.job.postproc_result.pp_health_exists
            and self.job.page_result.corr_ocr_text_path_exists
            and self.job.page_result.corr_ocr_xml_path_exists
        )
        return not already_done

    def _build_command(self):
        """Assemble the java command line used to invoke PageCorrector."""
        dict_files = glob.glob("%s/*.dict" % self.dicts_dir)

        # NOTE(review): ``self.java_args`` (parsed JSON) and ``dict_files``
        # (a list) are inserted as single elements, exactly as before. This
        # presumably relies on exec_cmd flattening nested lists -- confirm.
        cmd = [
            "java", self.java_args,
            "-jar", self.executable,
            "--dbconf", self.cfg,
            "-t", self.rules_file,
            "-o", self.job.output_dir,
            "--stats",
            "--alt", self.alt_arg,
            "--max-transforms", self.max_transforms,
            "--noiseCutoff", self.noise_cutoff,
            "--dict", dict_files,
        ]

        # Context options are only passed when configured with a truthy value.
        if self.ctx_min_match:
            cmd.extend(["--ctx-min-match", self.ctx_min_match])
        if self.ctx_min_vol:
            cmd.extend(["--ctx-min-vol", self.ctx_min_vol])
        if self.dump:
            cmd.append("--dump")
        if self.save:
            cmd.append("--save")

        # "--" separates the options from the input file argument.
        cmd.extend(["--", self.job.xml_file])
        return cmd

    def run(self):
        """Run PageCorrector and record its output on the job.

        Returns:
            The value of ``self.results(...)``: exit code 1 when the XML input
            is missing or the tool's output is not valid JSON, the tool's own
            exit code when it fails, and exit code 0 on success.
        """
        xml_file = self.job.xml_file
        if not xml_file or not os.path.isfile(xml_file):
            stderr = "Could not find XML file: %s" % xml_file
            return self.results(stdout=None, stderr=stderr, exitcode=1)

        proc = exec_cmd(self._build_command(), timeout=self.timeout)

        if proc.exitcode != 0:
            # TODO: PageCorrector errors are going to stdout not stderr
            # Report stderr only when stdout is empty and stderr is not.
            if not proc.stdout and proc.stderr:
                stderr = proc.stderr
            else:
                stderr = proc.stdout
            return self.results(stdout=proc.stdout, stderr=stderr, exitcode=proc.exitcode)

        out = proc.stdout.strip()

        # Check that output is valid JSON before storing it.
        try:
            json.loads(out)
        except ValueError:
            stderr = "PageCorrector Error: output is not valid JSON: %s" % out
            return self.results(stdout=None, stderr=stderr, exitcode=1)

        # The raw JSON text (not the parsed object) is stored as pp_health.
        self.job.postproc_result.pp_health = out
        self.job.page_result.corr_ocr_text_path = self.job.alto_txt_file
        self.job.page_result.corr_ocr_xml_path = self.job.alto_xml_file
        return self.results(stdout=None, stderr=None, exitcode=0)