import ConfigParser
import getpass
import json
import os
# TODO: Need sane defaults for all settings
defaults = {
"controller": {
"scheduler": "slurm",
"skip_existing": True,
"operate_on": "pages"
},
"globus": {
"username": None,
"min_activation_time": None,
},
"scheduler": {
"mem_per_cpu": "4000",
"cpus_per_task": "1",
"set_walltime": False,
"extra_args": '[]',
},
"ocular": {
"ocr_text_suffix": "_transcription.txt",
},
"denoise": {
"enabled": True,
},
"multi-column-skew": {
"enabled": True,
},
"page-corrector": {
"enabled": True,
"java_args": '["-Xms128M", "-Xmx512M"]',
"alt_arg": "2",
"max_transforms": "20",
"noise_cutoff": "0.5",
"ctx_min_match": None,
"ctx_min_vol": None,
"dump": False,
"save": False,
"timeout": -1,
},
"page-evaluator": {
"enabled": True,
"java_args": '["-Xms128M", "-Xmx128M"]',
},
}
[docs]class EmopSettings(object):
def __init__(self, config_path):
self.config_path = os.path.abspath(config_path)
self.config = ConfigParser.ConfigParser()
self.config.read(self.config_path)
# Settings based on environment variables
if os.getenv("EMOP_HOME"):
self.emop_home = os.getenv("EMOP_HOME")
else:
self.emop_home = os.path.dirname(self.config_path)
if os.getenv("DENOISE_HOME"):
self.denoise_home = os.getenv("DENOISE_HOME")
else:
raise RuntimeError("DENOISE_HOME environment variable not set")
if os.getenv("SEASR_HOME"):
self.seasr_home = os.getenv("SEASR_HOME")
else:
raise RuntimeError("SEASR_HOME environment variable not set")
if os.getenv("JUXTA_HOME"):
self.juxta_home = os.getenv("JUXTA_HOME")
else:
raise RuntimeError("JUXTA_HOME environment variable not set")
if os.getenv("RETAS_HOME"):
self.retas_home = os.getenv("RETAS_HOME")
else:
raise RuntimeError("RETAS_HOME environment variable not set")
# Settings for communicating with dashboard
self.api_version = self.get_value('dashboard', 'api_version')
self.url_base = self.get_value('dashboard', 'url_base')
self.auth_token = self.get_value('dashboard', 'auth_token')
self.api_headers = {
'Content-Type': 'application/json',
'Accept': 'application/emop; version=%s' % self.api_version,
'Authorization': 'Token token=%s' % self.auth_token,
}
# Settings used by controller
self.payload_input_path = self.get_value('controller', 'payload_input_path')
self.payload_output_path = self.get_value('controller', 'payload_output_path')
self.payload_completed_path = os.path.join(self.payload_output_path, "completed")
self.payload_uploaded_path = os.path.join(self.payload_output_path, "uploaded")
self.ocr_root = self.get_value('controller', 'ocr_root')
self.input_path_prefix = self.get_value('controller', 'input_path_prefix')
self.output_path_prefix = self.get_value('controller', 'output_path_prefix')
self.log_level = self.get_value('controller', 'log_level')
self.scheduler = self.get_value('controller', 'scheduler')
self.controller_skip_existing = self.get_bool_value('controller', 'skip_existing')
self.operate_on = self.get_value('controller', 'operate_on')
# Settings used to interact with the cluster scheduler
self.max_jobs = int(self.get_value('scheduler', 'max_jobs'))
self.scheduler_queue = self.get_value('scheduler', 'queue')
self.scheduler_transfer_queue = self.get_value('scheduler', 'transfer_queue', default=self.scheduler_queue)
self.scheduler_job_name = self.get_value('scheduler', 'name')
self.min_job_runtime = int(eval(self.get_value('scheduler', 'min_job_runtime')))
self.max_job_runtime = int(eval(self.get_value('scheduler', 'max_job_runtime')))
self.avg_page_runtime = int(eval(self.get_value('scheduler', 'avg_page_runtime')))
self.scheduler_logdir = self.get_value('scheduler', 'logdir')
self.scheduler_mem_per_cpu = self.get_value('scheduler', 'mem_per_cpu')
self.scheduler_cpus_per_task = self.get_value('scheduler', 'cpus_per_task')
self.scheduler_set_walltime = self.get_bool_value('scheduler', 'set_walltime')
# Allow to fail if invalid type provided
self.scheduler_extra_args = json.loads(self.get_value('scheduler', 'extra_args'))
# Settings related to Globus
self.globus_auth_file = os.path.join(self.emop_home, '.globus-auth')
self.globus_cluster_endpoint = self.get_value("globus", "cluster_endpoint")
self.globus_remote_endpoint = self.get_value("globus", "remote_endpoint")
self.globus_username = self.get_value("globus", "username")
# Default min activation time is max job runtime plus 1 day
_default_globus_min_activation_time = self.max_job_runtime + 60*60*24
self.globus_min_activation_time = self.get_value("globus", "min_activation_time", default=_default_globus_min_activation_time)
# Settings used by Tesseract
self.tesseract_tessdata_dir = self.get_value("tesseract", "tessdata_dir")
# Settings used by Ocular
self.ocular_emission_engine = self.get_value("ocular", "emission_engine")
self.ocular_ocr_text_suffix = self.get_value("ocular", "ocr_text_suffix")
# Settings used by DeNoise
self.denoise_enabled = self.get_bool_value('denoise', 'enabled')
# Settings used by MultiColumnSkew
self.multi_column_skew_enabled = self.get_bool_value('multi-column-skew', 'enabled')
# Settings used by PageCorrector
self.page_corrector_enabled = self.get_bool_value('page-corrector', 'enabled')
self.page_corrector_timeout = int(self.get_value('page-corrector', 'timeout'))
# Settings used by PageEvaluator
self.page_evaluator_enabled = self.get_bool_value('page-evaluator', 'enabled')
# Settings used by Juxta-cl
self.juxta_cl_jx_algorithm = self.get_value('juxta-cl', 'jx_algorithm')
[docs] def get_value(self, section, option, default=None):
"""Get settings value
This function is a warper for ConfigParser.get() that
handles missing values and substitutes them for defaults set
in a dict within global space of EmopSettings.
Interpolation is performed on specific items found in %() within
the INI file.
``home`` - HOME environment variable
``emop_home`` - The emop_home setting value
Args:
section (str): INI file section
option (str): INI file option name
default (str): Default value if one is not found.
Defaults to None.
Returns:
str: The config value
"""
interpolation_map = {
"home": os.getenv("HOME"),
"emop_home": self.emop_home,
}
raw_value = None
try:
raw_value = self.config.get(section, option, raw=False, vars=interpolation_map)
except (ConfigParser.NoOptionError, ConfigParser.NoSectionError) as e:
if default:
raw_value = default
elif section in defaults:
if option in defaults[section]:
raw_value = defaults[section][option]
else:
raise e
if isinstance(raw_value, basestring):
value = raw_value.strip("'")
else:
value = raw_value
return value
[docs] def get_bool_value(self, section, option, default=None):
"""Get settings bool value
This function is a warper for RawConfigParser.getboolean() that
handles missing values and substitutes them for defaults set
in a dict within global space of EmopSettings.
Args:
section (str): INI file section
option (str): INI file option name
default (str): Default value if one is not found.
Defaults to None.
Returns:
bool: The config value
"""
bool_value = None
try:
bool_value = self.config.getboolean(section, option)
except (ConfigParser.NoOptionError, ConfigParser.NoSectionError) as e:
if default:
bool_value = default
elif section in defaults:
if option in defaults[section]:
bool_value = defaults[section][option]
else:
raise e
return bool_value