Source code for psyrun.scheduler

"""Job scheduler."""

from collections import namedtuple
import getpass
import os
import os.path
import re
import subprocess
import sys


JobStatus = namedtuple('JobStatus', ['id', 'status', 'name'])
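# For example, JobStatus(id=42, status='Q', name='my-job') describes a job
# with ID 42 that is waiting in the queue (see Scheduler.get_status below
# for the status codes).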


class Scheduler(object):
    """Scheduler interface."""

    USER_DEFAULT_ARGS = {}
    def submit(
            self, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        """Submit a job.

        Parameters
        ----------
        args : sequence
            The command and arguments to execute.
        output_filename : str
            File to write process output to.
        name : str, optional
            Name of job.
        depends_on : sequence of int, optional
            IDs of jobs that need to finish first before the submitted job
            can be started.
        scheduler_args : dict, optional
            Additional arguments for the scheduler.

        Returns
        -------
        Job ID
        """
        raise NotImplementedError()
    def submit_array(
            self, n, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        """Submit a job array.

        If the scheduler does not support job arrays, this method should
        raise *NotImplementedError*.

        Parameters
        ----------
        n : int
            Number of tasks to submit.
        args : sequence
            The command and arguments to execute. The string ``'%a'`` will
            be replaced with the task number in each argument.
        output_filename : str
            File to write process output to. The string ``'%a'`` will be
            replaced by the task number.
        name : str, optional
            Name of job.
        depends_on : sequence of int, optional
            IDs of jobs that need to finish first before the submitted job
            can be started.
        scheduler_args : dict, optional
            Additional arguments for the scheduler.

        Returns
        -------
        Job ID
        """
        raise NotImplementedError()
    def kill(self, jobid):
        """Kill a job.

        Parameters
        ----------
        jobid
            Job to kill.
        """
        raise NotImplementedError()
    def get_status(self, jobid):
        """Get the status of a job.

        Parameters
        ----------
        jobid
            Job to request status of.

        Returns
        -------
        `JobStatus`
            Returns a tuple with *(id, status, name)* wherein status can be

            * ``'R'`` for a running job
            * ``'Q'`` for a queued job
            * ``'*Q'`` for a queued job waiting on another job to finish
            * ``'Z'`` for a sleeping job
            * ``'D'`` for a completed job

            If no status data is available for the job ID, None will be
            returned.
        """
        raise NotImplementedError()
    def get_jobs(self):
        """Get all queued, running, and recently finished jobs.

        Returns
        -------
        sequence
            Job IDs
        """
        raise NotImplementedError()
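# Usage sketch (illustrative, not part of psyrun): because every scheduler
# implements this interface, calling code can stay scheduler-agnostic. Here
# `sched` stands for any concrete instance, e.g. the ImmediateRun defined
# below:
#
#     sched = ImmediateRun()
#     jobid = sched.submit(['echo', 'hello'], 'hello.log', name='hello')
#     status = sched.get_status(jobid)  # JobStatus or None
#     if status is None or status.status == 'D':
#         pass  # job is done (or no longer known to the scheduler)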
class ImmediateRun(Scheduler):
    """Runs jobs immediately on the local machine."""

    _cur_id = 0
    _failed_jobs = []
    def submit(
            self, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        """Submit a job.

        Parameters
        ----------
        args : sequence
            The command and arguments to execute.
        output_filename : str
            File to write process output to.
        name
            Unused.
        depends_on
            Unused.
        scheduler_args
            Unused.

        Returns
        -------
        int
            Job ID
        """
        if depends_on is None:
            depends_on = []
        self._cur_id += 1
        if any(x in self._failed_jobs for x in depends_on):
            self._failed_jobs.append(self._cur_id)
            return self._cur_id
        with open(output_filename, 'a') as f:
            if subprocess.call(args, stdout=f, stderr=subprocess.STDOUT) != 0:
                self._failed_jobs.append(self._cur_id)
        return self._cur_id
    def submit_array(
            self, n, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        raise NotImplementedError("Job arrays not supported.")
    def kill(self, jobid):
        """Has no effect."""
        pass
    def get_status(self, jobid):
        """Always returns None; jobs complete during `submit`."""
        return None
    def get_jobs(self):
        """Returns an empty list."""
        return []
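# Usage sketch (illustrative): ImmediateRun executes each job synchronously
# during submit(), so the returned ID refers to an already finished job, and
# a dependency on a failed job causes the dependent job to be skipped:
#
#     sched = ImmediateRun()
#     ok = sched.submit(['true'], 'out.log')        # exit code 0, succeeds
#     failed = sched.submit(['false'], 'out.log')   # exit code 1, fails
#     skipped = sched.submit(['true'], 'out.log', depends_on=[failed])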
class ExternalScheduler(Scheduler):
    """Base class for schedulers that invoke an external command."""
    class Option(object):
        def __init__(self, name, conversion=str):
            self.name = name
            self.conversion = conversion
        def build(self, value):
            raise NotImplementedError()
        def __repr__(self):
            return "<{clsname} '{name}'>".format(
                clsname=self.__class__.__name__, name=self.name)
    class ShortOption(Option):
        def build(self, value):
            if value is None:
                return []
            return [self.name, self.conversion(value)]
    class LongOption(Option):
        def build(self, value):
            if value is None:
                return []
            return [self.name + '=' + self.conversion(value)]
    KNOWN_ARGS = {}
    def build_args(self, **kwargs):
        """Convert keyword arguments to command-line arguments."""
        args = []
        for k, v in kwargs.items():
            args.extend(self.KNOWN_ARGS[k].build(v))
        return args
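# Example (the values follow from the classes above): the option helpers turn
# keyword arguments into command-line fragments.
#
#     ExternalScheduler.ShortOption('-n').build(4)       # -> ['-n', '4']
#     ExternalScheduler.LongOption('--mem').build('1G')  # -> ['--mem=1G']
#     ExternalScheduler.ShortOption('-n').build(None)    # -> []
#
# So for a subclass whose KNOWN_ARGS maps 'n_cpus' to ShortOption('-n') and
# 'memory' to LongOption('--mem'), build_args(n_cpus=4, memory='1G') would
# yield ['-n', '4', '--mem=1G'] (dict ordering permitting).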
class Sqsub(ExternalScheduler):
    """sqsub (SHARCNET) scheduler."""

    USER_DEFAULT_ARGS = {
        'timelimit': '1h',
        'n_cpus': 1,
        'n_nodes': 1,
        'memory': '1G',
    }

    KNOWN_ARGS = {
        'timelimit': ExternalScheduler.ShortOption('-r'),
        'output_file': ExternalScheduler.ShortOption('-o'),
        'n_cpus': ExternalScheduler.ShortOption('-n'),
        'n_nodes': ExternalScheduler.ShortOption('-N'),
        'memory': ExternalScheduler.LongOption('--mpp'),
        'depends_on': ExternalScheduler.ShortOption(
            '-w', lambda jobids: ','.join(str(x) for x in jobids)),
        'idfile': ExternalScheduler.LongOption('--idfile'),
        'name': ExternalScheduler.ShortOption('-j'),
    }

    def __init__(self, workdir=None):
        super(Sqsub, self).__init__()
        if workdir is None:
            workdir = '/work/{user}/psyrun'.format(user=os.environ['USER'])
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        self.workdir = workdir
        self.idfile = os.path.join(workdir, 'idfile')
        self._jobs = None
    def submit(
            self, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        """Submit a job.

        Parameters
        ----------
        args : list
            The command and arguments to execute.
        output_filename : str
            File to write process output to.
        name : str, optional
            Name of job.
        depends_on : list of int, optional
            IDs of jobs that need to finish first before the submitted job
            can be started.
        scheduler_args : dict, optional
            Additional arguments for the scheduler.

        Returns
        -------
        int
            Job ID
        """
        if scheduler_args is None:
            scheduler_args = dict()
        else:
            scheduler_args = {k: v(name) if callable(v) else v
                              for k, v in scheduler_args.items()}
        # Checking whether the jobs depended on have completed before
        # submitting the new job is a race condition, but there seems to be
        # no way to avoid it.
        if depends_on is not None:
            statii = [self.get_status(x) for x in depends_on]
            depends_on = [x for x, s in zip(depends_on, statii)
                          if s is not None and s.status != 'D']
        scheduler_args.update({
            'idfile': self.idfile,
            'output_file': output_filename,
            'depends_on': depends_on,
            'name': name,
        })
        subprocess.check_call(
            ['sqsub'] + self.build_args(**scheduler_args) + args)
        with open(self.idfile, 'r') as f:
            return int(f.read())
    def submit_array(
            self, n, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        raise NotImplementedError("Job arrays not supported.")
    def kill(self, jobid):
        """Kill a job.

        Parameters
        ----------
        jobid : int
            Job to kill.
        """
        subprocess.check_call(['sqkill', str(jobid)])
    def get_status(self, jobid=None):
        """Get the status of a job.

        Parameters
        ----------
        jobid : int
            Job to request status of.

        Returns
        -------
        namedtuple or None
            Returns a tuple with *(id, status, name)* wherein status can be

            - ``'R'`` for a running job
            - ``'Q'`` for a queued job
            - ``'*Q'`` for a queued job waiting on another job to finish
            - ``'Z'`` for a sleeping job
            - ``'D'`` for a completed job

            If no status data is available for the job ID, None will be
            returned.
        """
        if self._jobs is None:
            self.refresh_job_info()
        elif jobid not in self._jobs:
            try:
                stdout = subprocess.check_output(
                    ['sqjobs', str(jobid)], stderr=subprocess.STDOUT,
                    universal_newlines=True)
                # Skip the two header lines of the sqjobs output.
                for line in stdout.split('\n')[2:]:
                    cols = line.split(None, 5)
                    if len(cols) > 3 and int(cols[0]) == jobid:
                        # sqjobs reports completed jobs as 'C'; normalize to
                        # the 'D' used by this interface.
                        if cols[2] == 'C':
                            cols[2] = 'D'
                        self._jobs[jobid] = JobStatus(
                            jobid, cols[2], cols[-1])
            except subprocess.CalledProcessError as err:
                # sqjobs exits with 1 if no jobs are pending, running, or
                # suspended.
                if err.returncode != 1:
                    raise
        return self._jobs.get(jobid, None)
    def get_jobs(self):
        """Get all queued and running jobs.

        Returns
        -------
        list of int
            Job IDs
        """
        if self._jobs is None:
            self.refresh_job_info()
        return list(self._jobs.keys())
    def refresh_job_info(self):
        self._jobs = {}
        stdout = subprocess.check_output(['sqjobs'], universal_newlines=True)
        # Skip the two header lines of the sqjobs output.
        for line in stdout.split('\n')[2:]:
            cols = line.split(None, 5)
            if len(cols) > 2 and cols[2] in ['R', 'Q', '*Q', 'Z']:
                jobid = int(cols[0])
                self._jobs[jobid] = JobStatus(jobid, cols[2], cols[-1])
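# Usage sketch (illustrative; assumes a SHARCNET-style system with sqsub,
# sqjobs, and sqkill on the PATH, and a writable /work/$USER directory;
# 'task.py' is a hypothetical script):
#
#     sched = Sqsub()
#     jobid = sched.submit(
#         ['python', 'task.py'], 'task.log', name='task',
#         scheduler_args={'timelimit': '2h', 'memory': '2G'})
#     sched.get_status(jobid)  # e.g. JobStatus(id=jobid, status='Q', ...)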
class Slurm(ExternalScheduler):
    """Slurm (sbatch) scheduler."""

    USER_DEFAULT_ARGS = {
        'timelimit': '1h',
        'memory': '1G',
    }

    KNOWN_ARGS = {
        'timelimit': ExternalScheduler.ShortOption('-t'),
        'output_file': ExternalScheduler.ShortOption('-o'),
        'cores-per-socket': ExternalScheduler.LongOption(
            '--cores-per-socket'),
        'sockets-per-node': ExternalScheduler.LongOption(
            '--sockets-per-node'),
        'n_cpus': ExternalScheduler.ShortOption('-c'),
        'n_nodes': ExternalScheduler.ShortOption('-N'),
        'memory': ExternalScheduler.LongOption('--mem'),
        'memory_per_cpu': ExternalScheduler.LongOption('--mem-per-cpu'),
        'depends_on': ExternalScheduler.ShortOption(
            '-d', lambda jobids: 'afterok:' + ':'.join(
                str(x) for x in jobids)),
        'name': ExternalScheduler.ShortOption('-J'),
        'array': ExternalScheduler.LongOption('--array'),
    }

    STATUS_MAP = {
        'PD': 'Q',
        'R': 'R',
        'CA': 'D',
        'CF': 'R',
        'CG': 'R',
        'CD': 'D',
        'F': 'D',
        'TO': 'D',
        'NF': 'D',
        'RV': 'D',
        'SE': 'D',
    }

    def __init__(self, workdir=None):
        super(Slurm, self).__init__()
        if workdir is None:
            workdir = '/project/{user}/psyrun'.format(
                user=os.environ['USER'])
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        self.workdir = workdir
        self._jobs = None
    def submit(
            self, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        """Submit a job.

        Parameters
        ----------
        args : list
            The command and arguments to execute.
        output_filename : str
            File to write process output to.
        name : str, optional
            Name of job.
        depends_on : list of int, optional
            IDs of jobs that need to finish first before the submitted job
            can be started.
        scheduler_args : dict, optional
            Additional arguments for the scheduler.

        Returns
        -------
        str
            Job ID
        """
        out = subprocess.check_output(self._prepare_submission_cmd(
            args, output_filename, name, scheduler_args, depends_on))
        out = out.decode('utf-8')
        return out.split(None)[-1]
    def submit_array(
            self, n, args, output_filename, name=None, depends_on=None,
            scheduler_args=None):
        if scheduler_args is None:
            scheduler_args = dict()
        else:
            scheduler_args = dict(scheduler_args)
        scheduler_args['array'] = '0-' + str(n - 1)
        # Wrapper script executed by each array task; it substitutes the
        # task index for '%a' in the arguments before running them.
        code = '''#!{python}
import os
import subprocess
import sys

task_id = os.environ['SLURM_ARRAY_TASK_ID']
sys.exit(subprocess.call([a.replace('%a', str(task_id)) for a in {args!r}]))
'''.format(python=sys.executable, args=args)
        cmd = self._prepare_submission_cmd(
            [], output_filename, name, scheduler_args, depends_on)
        p = subprocess.Popen(
            cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        out, _ = p.communicate(code.encode('utf-8'))
        out = out.decode('utf-8')
        if p.returncode != 0:
            raise subprocess.CalledProcessError(p.returncode, cmd, out)
        return out.split(None)[-1]
    def _prepare_submission_cmd(
            self, args, output_filename, name=None, scheduler_args=None,
            depends_on=None):
        if scheduler_args is None:
            scheduler_args = dict()
        else:
            scheduler_args = {k: v(name) if callable(v) else v
                              for k, v in scheduler_args.items()}
        # Checking whether the jobs depended on have completed before
        # submitting the new job is a race condition, but there seems to be
        # no way to avoid it.
        if depends_on is not None:
            statii = [self.get_status(x) for x in depends_on]
            depends_on = [x for x, s in zip(depends_on, statii)
                          if s is not None and s.status != 'D']
        scheduler_args.update({
            'output_file': output_filename,
            'name': name,
        })
        if depends_on is not None and len(depends_on) > 0:
            scheduler_args['depends_on'] = depends_on
        return ['sbatch'] + self.build_args(**scheduler_args) + args
    def kill(self, jobid):
        """Kill a job.

        Parameters
        ----------
        jobid : str
            Job to kill.
        """
        subprocess.check_call(['scancel', str(jobid)])
    def get_status(self, jobid=None):
        """Get the status of a job.

        Parameters
        ----------
        jobid : str
            Job to request status of.

        Returns
        -------
        namedtuple or None
            Returns a tuple with *(id, status, name)* wherein status can be

            - ``'R'`` for a running job
            - ``'Q'`` for a queued job
            - ``'*Q'`` for a queued job waiting on another job to finish
            - ``'Z'`` for a sleeping job
            - ``'D'`` for a completed job

            If no status data is available for the job ID, None will be
            returned.
        """
        if self._jobs is None:
            self.refresh_job_info()
        elif jobid not in self._jobs:
            stdout = subprocess.check_output(
                ['squeue', '-u', getpass.getuser(), '-h',
                 '-o', '%A %t %j %r', '-j', str(jobid)],
                stderr=subprocess.STDOUT, universal_newlines=True)
            self._update_jobs(stdout)
        return self._jobs.get(jobid, None)
    def get_jobs(self):
        """Get all queued and running jobs.

        Returns
        -------
        list of str
            Job IDs
        """
        if self._jobs is None:
            self.refresh_job_info()
        return list(self._jobs.keys())
    def refresh_job_info(self):
        self._jobs = {}
        stdout = subprocess.check_output(
            ['squeue', '-u', getpass.getuser(), '-h', '-o', '%i %t %j %r'],
            universal_newlines=True)
        self._update_jobs(stdout)
    def _update_jobs(self, squeue_out):
        for line in squeue_out.split('\n'):
            cols = line.split(None, 4)
            if len(cols) > 3 and cols[1] in ['PD', 'R', 'CF', 'CG']:
                jobid = cols[0]
                status = self.STATUS_MAP[cols[1]]
                if status == 'Q' and cols[3] == 'Dependency':
                    status = '*Q'
                # Expand a job-array entry like '123_[0-4]' into one status
                # per task ID.
                m = re.match(r'^(\d+)_\[(\d+)-(\d+)\]$', jobid)
                if m:
                    for i in range(int(m.group(2)), int(m.group(3)) + 1):
                        sub_id = m.group(1) + '_' + str(i)
                        self._jobs[sub_id] = JobStatus(
                            sub_id, status, cols[2] + ':' + str(i))
                else:
                    self._jobs[jobid] = JobStatus(jobid, status, cols[2])
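# Usage sketch (illustrative; assumes a cluster with sbatch, squeue, and
# scancel on the PATH; 'prepare.py' and 'task.py' are hypothetical scripts):
#
#     sched = Slurm()
#     prep = sched.submit(['python', 'prepare.py'], 'prepare.log',
#                         name='prepare')
#     # Submit 4 tasks; '%a' is replaced by the array task index (0-3) in
#     # the arguments, and by sbatch itself in the output filename.
#     array_id = sched.submit_array(
#         4, ['python', 'task.py', '--split', '%a'], 'task_%a.log',
#         name='task', depends_on=[prep])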