Source code for rna_predict.simulation

# coding: utf-8

import glob
import numpy as np
import os
from os.path import splitext, basename, abspath
try:
    import cPickle as pickle
except ImportError:
    import pickle
import pprint
import re
import shutil
import string
import struct
import subprocess
import sys
import time

from . import dcatools
from . import pdbtools
from . import utils


def check_file_existence(path, alternative_error_text=None):
    """
    Make sure a file exists and raise an exception otherwise.

    :param path: filename to check
    :param alternative_error_text: alternative error text passed to exception
    """
    if not os.path.isfile(path):
        raise SimulationException(alternative_error_text if alternative_error_text is not None else "Cannot find file: %s" % path)

def check_dir_existence(path, alternative_error_text=None):
    """
    Make sure a directory exists and raise an exception otherwise.

    :param path: directory to check
    :param alternative_error_text: alternative error text passed to exception
    """
    if not os.path.isdir(path):
        raise SimulationException(alternative_error_text if alternative_error_text is not None else "Cannot find directory: %s" % path)

def delete_glob(pattern, print_notice=True):
    """
    Helper function to delete files while expanding shell globs.

    :param pattern: pattern to delete
    :param print_notice: if True print verbose notice
    """
    for f in glob.glob(pattern):
        if print_notice:
            print "deleting %s..." % f
        try:
            if os.path.isdir(f):
                shutil.rmtree(f, ignore_errors=True)
            else:
                os.remove(f)
        except IOError:
            pass

def get_model_count_in_silent_file(silent_file):
    """
    Returns the number of models present in a Rosetta silent file.

    :param silent_file: filename
    :return: model count
    """
    n = 0
    with open(silent_file, "r") as t:
        pattern_model = re.compile("^SCORE:\s+[0-9.-]+.*")
        for line in t:
            if pattern_model.match(line):
                n += 1
    return n

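# Illustration (schematic lines, not taken from a real run): every model in a
# silent file contributes one "SCORE:" line starting with a numeric score,
# which is what the regex above counts. The header line repeats the column
# names instead of a number and therefore does not match:
#
#     SCORE:     score  ...  description    <- header, not counted
#     SCORE:   -128.42  ...  S_000001       <- counted as one model
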
def merge_silent_files(target, sources):
    """
    Merges rosetta silent files (.out). All source files are appended to target.
    Model numbers are incremented uniquely. Source files are deleted after a successful merge.

    :param target: target filename
    :param sources: source filenames
    :return: model count
    """
    # see how many structures we have already
    n = get_model_count_in_silent_file(target) if os.path.isfile(target) else 0

    # loop through all source files and append all new structures to target
    # while adjusting the numbering
    pattern_header = re.compile("^(?:SEQUENCE:|SCORE:\s+score).*")
    pattern_normal = re.compile("^(.*)S_\d+$")
    for source in glob.glob(sources):
        print "merging %s into %s" % (source, target)
        with open(target, "a+") as t:
            with open(source, "r") as s:
                for line in s:
                    m = pattern_header.match(line)
                    if m:
                        # only write header if this is the first one
                        if n == 0:
                            t.write(line)
                        continue
                    m = pattern_normal.match(line)
                    if m:
                        if line.startswith("SCORE:"):
                            n += 1
                        t.write("%sS_%06d\n" % (m.group(1), n))
                        continue
                    t.write(line)

    # delete source files
    delete_glob(sources, print_notice=True)

    # return total number of structures
    return n

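# Renumbering sketch (hypothetical tags): if target already holds 2 models,
# every line ending in a model tag is rewritten so that numbering stays
# unique across the merged file, e.g.
#
#     "SCORE: -120.1 ... S_000001"  ->  "SCORE: -120.1 ... S_000003"
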
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Helper function to be used as sort key in sorted() to naturally sort numeric parts."""
    return [int(text) if text.isdigit() else text.lower() for text in re.split(_nsre, s)]


# TODO: Correcting atom names IS MOST LIKELY WRONG! It is commented out for now and can probably be removed completely.
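# Usage sketch (hypothetical filenames): with a plain lexicographic sort,
# "motif10" would come before "motif2"; the natural key splits out the
# numeric parts so they compare as integers:
#
#     sorted(["motif10.out", "motif2.out"], key=natural_sort_key)
#     # -> ['motif2.out', 'motif10.out']
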
def fix_atom_names_in_cst(cst_info, sequence):
    """
    Switches N1 and N3 atom names in a residue.

    :param cst_info: list of constraints
    :param sequence: residue sequence in lower case
    :return: modified list of constraints
    """
    pyrimidines = ['c', 'u']
    purines = ['a', 'g']
    cst_info_new = []
    for cst in cst_info:
        atom_name1, res1, atom_name2, res2, function = cst
        if sequence[res1 - 1] in pyrimidines and atom_name1 == 'N1':
            atom_name1 = 'N3'
            print 'correcting atom name for ', res1
        if sequence[res2 - 1] in pyrimidines and atom_name2 == 'N1':
            atom_name2 = 'N3'
            print 'correcting atom name for ', res2
        if sequence[res1 - 1] in purines and atom_name1 == 'N3':
            atom_name1 = 'N1'
            print 'correcting atom name for ', res1
        if sequence[res2 - 1] in purines and atom_name2 == 'N3':
            atom_name2 = 'N1'
            print 'correcting atom name for ', res2
        cst_info_new.append([atom_name1, res1, atom_name2, res2, function])
    return cst_info_new

class SimulationException(Exception):
    """
    Custom exception class for foreseeable prediction errors.
    """
    pass

class Command(object):
    """
    Helper class to store external program calls plus additional flags.
    """
    def __init__(self, command, add_suffix=None, dry_run=False, print_commands=True, stdin=None, quiet=False):
        """
        Create a new command wrapper.

        :param command: list of external command and parameters
        :param add_suffix: type of suffix to add to the first entry in command. currently only ``None`` or ``rosetta`` are supported
        :param dry_run: don't actually execute anything
        :param print_commands: print commandline when running
        :param stdin: optional text to use as standard input
        :param quiet: hide output of external program
        """
        self.command = command
        self.add_suffix = add_suffix
        self.dry_run = dry_run
        self.print_commands = print_commands
        self.stdin = stdin
        self.quiet = quiet
    def get_full_command(self, sysconfig):
        """
        If necessary, add the suffix to the command, depending on the user / system configuration.

        :param sysconfig: instance of SysConfig
        :return: suffixed command as a list
        """
        com = self.command[:]
        if self.add_suffix == "rosetta":
            com[0] = sysconfig.rosetta_exe_path + com[0] + sysconfig.rosetta_exe_suffix
        if sysconfig.subprocess_buffsize is not None:
            com = ["stdbuf", "-o", sysconfig.subprocess_buffsize] + com
        return com

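    # Expansion sketch (hypothetical SysConfig values): with
    # rosetta_exe_path="/opt/rosetta/bin/", rosetta_exe_suffix=".linuxgccrelease"
    # and subprocess_buffsize="L", the command ["rna_helix", "-fasta", "stem1.fasta"]
    # would expand to
    #
    #     ["stdbuf", "-o", "L", "/opt/rosetta/bin/rna_helix.linuxgccrelease",
    #      "-fasta", "stem1.fasta"]
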
class EvalData(object):
    """
    Evaluation data storing model and cluster information.
    """
    def __init__(self):
        """
        Create empty EvalData object.
        """
        self.models = {}
        self.clusters = {}
        self.cluster_cutoff = 0.0
    @staticmethod
    def load_from_cst(constraints):
        """
        Load evaluation data from a prediction.

        :param constraints: constraints selection
        """
        cst_name, cst_file = RNAPrediction.parse_cst_name_and_filename(constraints)
        return EvalData.load_from_file("predictions/%s/output/evaldata.dat" % cst_name)
    @staticmethod
    def load_from_file(filename):
        """
        Load evaluation data from a file.

        :param filename: path to a file containing the evaluation data
        """
        try:
            with open(filename, "r") as f:
                eval_data = pickle.load(f)
            if "rmsd_cluster_1" not in next(eval_data.models.itervalues()):
                raise SimulationException("evaluation data file '%s' does not contain needed information" % filename)
            return eval_data
        except (IOError, pickle.PickleError, AttributeError, EOFError, IndexError, KeyError):
            # file broken or missing, force full evaluation
            raise SimulationException("evaluation data file '%s' missing or broken" % filename)
    def save_to_file(self, filename):
        """
        Dump evaldata to a file.

        :param filename: path to a file to store the data
        """
        with open(filename, "w") as f:
            pickle.dump(self, f)
    def get_model_count(self):
        """
        Returns the number of models in the evaluation data.

        :return: number of models
        """
        return len(self.models)
    def get_models(self, model_list, kind="tag"):
        """
        Returns a selection of models with specific properties.

        :param model_list: list of models to get, might be a list of numbers, or a list of model names
        :param kind: model selection mode:

            - ``tag``: str: internal name such as S_000123_5
            - ``top``: int: models ordered by score
            - ``ntop``: int: models ordered by rmsd_native
            - ``cluster``: int: nth cluster decoy
            - ``cluster_ntop``: n[/m]: nth cluster ordered by native rmsd [of first m]

        :return: list of selected models
        """
        # initialize list
        models_sorted = None
        results = []
        for model_i in model_list:
            # decide which model we want
            if kind == "cluster":
                model_i = int(model_i)
                tag = self.clusters[model_i]["primary_model"]
            elif kind == "cluster_ntop":
                if "/" in model_i:
                    i, limit = (int(x) for x in model_i.split("/"))
                    if i > limit:
                        raise SimulationException("get_models: Cluster number (%d) can't be larger than the limit (%d)" % (i, limit))
                else:
                    i = int(model_i)
                    limit = None
                model_selection = [self.models[m["primary_model"]] for m in self.clusters.values()[:limit]]
                models_sorted = sorted(model_selection, key=lambda i: i["rmsd_native"])
                tag = models_sorted[i - 1]["tag"]
            elif kind == "tag":
                tag = model_i
            elif kind == "top":
                model_i = int(model_i)
                # do we need to create a sorted list?
                if models_sorted is None:
                    models_sorted = sorted(self.models.items(), key=lambda x: x[1]["score"])
                tag = models_sorted[model_i - 1][0]
            elif kind == "ntop":
                model_i = int(model_i)
                if models_sorted is None:
                    try:
                        models_sorted = sorted(self.models.items(), key=lambda x: x[1]["rmsd_native"])
                    except KeyError:
                        raise SimulationException("get_models: No native rmsd information stored")
                tag = models_sorted[model_i - 1][0]
            else:
                raise SimulationException("get_models: Invalid 'kind' parameter: '%s'" % kind)
            if tag not in self.models:
                raise SimulationException("get_models: Invalid tag: '%s'" % tag)
            results.append(self.models[tag])
        return results

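    # Selection sketch (hypothetical data; "eval_data" is an EvalData
    # instance): the different "kind" modes address the same model pool in
    # different ways:
    #
    #     eval_data.get_models(["S_000123_5"])                # by tag
    #     eval_data.get_models([1, 2], kind="top")            # two best by score
    #     eval_data.get_models(["1/5"], kind="cluster_ntop")  # best native rmsd
    #                                                         # among first 5 clusters
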
    def print_models(self, model_list, kind="tag"):
        """
        Print a list of models.

        :param model_list: see get_models
        :param kind: see get_models
        """
        models = self.get_models(model_list, kind)
        for model in models:
            print "Model: %s" % model["tag"]
            pprint.pprint(model)
    @staticmethod
    def get_weighted_model_score(model, score_weights):
        """
        Calculate a model score based on different weights for the individual Rosetta scores.

        :param model: model to reweight
        :param score_weights: dict of rosetta score names and their weights. ``default`` to set the default weight for all scores. Example: ``{'default': 0, 'atom_pair_constraint': 1}`` to only use atom pair constraints.
        """
        score = 0
        for i in model["rosetta_scores"].keys():
            if i == "description":
                continue
            score += (score_weights[i] if i in score_weights else (score_weights["default"] if "default" in score_weights else 1)) * model["rosetta_scores"][i]
        return score

class RNAPrediction(object):
    """
    Base class used for prediction simulation.
    """

    CONFIG_FILE = ".config"
    def load_config(self):
        """Load simulation configuration of current directory."""
        if os.path.exists(RNAPrediction.CONFIG_FILE):
            c = open(RNAPrediction.CONFIG_FILE)
            self.config = pickle.load(c)
            c.close()
    def save_config(self):
        """Save simulation configuration of current directory."""
        c = open(RNAPrediction.CONFIG_FILE, "w")
        pickle.dump(self.config, c)
        c.close()
    def print_config(self):
        """Print simulation configuration."""
        print "Simulation configuration:"
        print " path: %s" % (abspath(os.getcwd()))
        if self.config:
            for key, value in sorted(self.config.iteritems()):
                if key == "motif_res_maps" or key == "motif_stem_sets" or key == "motifs" or key == "stems" or key == "evaluate":
                    print " %s: ..." % key
                else:
                    print " %s: %s" % (key, value)
        else:
            print " No configuration found."

    # TODO: support different data types?
    def modify_config(self, key, value):
        """
        Modify configuration entry.

        :param key: setting to modify
        :param value: new value ("-" to store None)
        """
        if key in self.config:
            self.config[key] = None if value == "-" else value
        else:
            raise SimulationException("No such config entry: %s" % key)

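    # Usage sketch (hypothetical RNAPrediction instance "prediction"):
    # storing "-" clears a setting, anything else is stored verbatim:
    #
    #     prediction.modify_config("native_pdb_file", "-")         # -> None
    #     prediction.modify_config("native_pdb_file", "1y26.pdb")  # -> "1y26.pdb"
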
    def check_config(self):
        """Check if current directory contains a valid configuration."""
        if not self.config:
            raise SimulationException("No config file found. Please run 'prepare' first!")
    def __init__(self, sysconfig):
        """Create a prediction simulation for the current directory and try to load an existing configuration."""
        self.config = {}
        self.sysconfig = sysconfig
        self.load_config()
    def execute_command(self, command, add_suffix=None, dry_run=False, print_commands=True, stdin=None, quiet=False):
        """
        Execute an external command.

        :param command: list of external command and parameters
        :param add_suffix: type of suffix to add to the first entry in command. currently only ``None`` or ``rosetta`` are supported
        :param dry_run: don't actually execute anything
        :param print_commands: print commandline when running
        :param stdin: optional text to use as standard input
        :param quiet: hide output of external program
        """
        self.execute_commands([Command(command, add_suffix, dry_run, print_commands, stdin, quiet)])
    def execute_command_and_capture(self, command, add_suffix=None, dry_run=False, print_commands=True, stdin=None, quiet=False):
        """
        Execute an external command while capturing output.

        :param command: list of external command and parameters
        :param add_suffix: type of suffix to add to the first entry in command. currently only ``None`` or ``rosetta`` are supported
        :param dry_run: don't actually execute anything
        :param print_commands: print commandline when running
        :param stdin: optional text to use as standard input
        :param quiet: hide output of external program
        :return: generator over lines of standard output
        """
        c = Command(command, add_suffix, dry_run, print_commands, stdin, quiet)
        command = c.get_full_command(self.sysconfig)
        if print_commands:
            print " ".join(command)
        try:
            p = subprocess.Popen(command, stdin=(subprocess.PIPE if c.stdin is not None else None), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            if c.stdin is not None:
                p.stdin.write(c.stdin)
        except OSError, e:
            raise SimulationException("Failed to execute command: %s, Reason: %s" % (" ".join(c.command), e))
        for line in p.stdout:
            if not quiet:
                sys.stdout.write(line)
            yield line
        p.wait()
        if p.returncode != 0:
            raise SimulationException("Non-zero return code from executed command: %s" % " ".join(c.command))
    def execute_commands(self, commands, threads=1):
        """
        Execute a list of commands in parallel.

        :param commands: list of Commands
        :param threads: number of parallel invocations
        """
        processes = []
        stdout = None
        try:
            while True:
                while commands and len(processes) < threads:
                    c = commands.pop(0)
                    command = c.get_full_command(self.sysconfig)
                    if c.print_commands:
                        print " ".join(command)
                    if not c.dry_run:
                        stdout = None
                        if c.quiet and stdout is None:
                            stdout = open(os.devnull, "w")
                        stderr = stdout
                        try:
                            p = subprocess.Popen(command, stdin=(subprocess.PIPE if c.stdin is not None else None), stdout=stdout, stderr=stderr)
                            # store original command in process
                            p.command = command
                            processes.append(p)
                            if c.stdin is not None:
                                p.stdin.write(c.stdin)
                        except OSError, e:
                            raise SimulationException("Failed to execute command: %s, Reason: %s" % (" ".join(c.command), e))
                for p in processes:
                    if p.poll() is not None:
                        processes.remove(p)
                        if p.returncode != 0:
                            raise SimulationException("Non-zero return code from executed command: %s" % " ".join(p.command))
                if not processes and not commands:
                    break
                else:
                    time.sleep(0.1)
        except:
            # kill all other processes
            for p in processes:
                p.kill()
            raise
        finally:
            if stdout is not None:
                stdout.close()
    @staticmethod
    def _make_tag_with_dashes(int_vector):
        """
        Transform a list of values into a list of strings while using "n-m" for adjacent values.
        Example: Turns ``[1, 2, 3, 5, 7, 8]`` into ``['1-3', '5', '7-8']``.

        :param int_vector: list of values
        :return: list of strings
        """
        tag = []
        start_res = int_vector[0]
        for i in range(1, len(int_vector) + 1):
            if i == len(int_vector) or int_vector[i] != int_vector[i - 1] + 1:
                stop_res = int_vector[i - 1]
                if stop_res > start_res:
                    tag += ["%d-%d" % (start_res, stop_res)]
                else:
                    tag += ["%d" % stop_res]
                if i < len(int_vector):
                    start_res = int_vector[i]
        return tag

    # noinspection PyMethodMayBeStatic
    def get_csts(self):
        """
        Returns a list of all constraints.

        :return: list of all constraints names
        """
        # loop over all constraint sets
        # that is either files in the "constraints" directory, or directories under "predictions", and always "none"
        cst_names = ["none"]
        cst_names += [splitext(basename(cst_file))[0] for cst_file in glob.glob("constraints/*.cst")]
        cst_names += [basename(cst_dir) for cst_dir in glob.glob("predictions/*")]
        return sorted(set(cst_names), key=natural_sort_key)
    def prepare(self, fasta_file="sequence.fasta", params_file="secstruct.txt", native_pdb_file=None, data_file=None, torsions_file=None, name=None):
        """
        Preparation step. Parse out stems and motifs from sequence and secondary structure information and create necessary base files.

        :param fasta_file: fasta file containing the sequence
        :param params_file: text file containing the secondary structure
        :param native_pdb_file: native pdb file if available
        :param data_file: additional data file if available
        :param torsions_file: additional torsions file if available
        :param name: optional name for this set of predictions
        """
        if name is None:
            name = os.path.basename(os.path.abspath(os.getcwd()))
        for f in [fasta_file, params_file, native_pdb_file, data_file, torsions_file]:
            if f is not None:
                check_file_existence(f)
        utils.mkdir_p("constraints")
        utils.mkdir_p("preparation")
        utils.mkdir_p("dca")
        utils.mkdir_p("predictions")
        self.config["fasta_file"] = fasta_file
        self.config["params_file"] = params_file
        self.config["native_pdb_file"] = native_pdb_file
        self.config["data_file"] = data_file
        self.config["torsions_file"] = torsions_file
        self.config["name"] = name
        print
        print "Preparation:"

        # Read in files
        for line in utils.read_file_line_by_line(fasta_file):
            if line[0] == ">":
                continue
            self.config["sequence"] = line.lower()
            break
        print self.config["sequence"]
        numres = len(self.config["sequence"])

        # Read in data information
        self.config["data_info"] = []
        if data_file is not None:
            self.config["backbone_burial_info"] = []
            for line in utils.read_file_line_by_line(data_file):
                if len(line) > 6 and line[:6] == 'EXPOSE':
                    cols = string.split(line)
                    for i in range(len(cols) / 3):
                        self.config["data_info"].append([int(cols[3 * i + 1]), cols[3 * i + 2], cols[3 * i + 3]])
                if len(line) > 15 and line[:15] == 'BACKBONE_BURIAL':
                    cols = string.split(line)
                    for i in range(1, len(cols)):
                        self.config["backbone_burial_info"].append(int(cols[i]))

        pair_map = {}
        all_pairs = []
        complement = {'a': ['u'], 'u': ['a', 'g'], 'c': ['g'], 'g': ['c', 'u']}

        # Parse out stems
        basepair_mismatch = []
        cutpoints_original = []
        for line in utils.read_file_line_by_line(params_file):
            if line[:4] == 'STEM':
                cols = string.split(line)
                for i in range(len(cols)):
                    if cols[i] == 'PAIR':
                        # Offset to get to python numbering (starts with zero)
                        res1 = int(cols[i + 1]) - 1
                        res2 = int(cols[i + 2]) - 1
                        pair_map[res1] = res2
                        pair_map[res2] = res1
                        all_pairs.append([res1, res2])
                        # check if the secondary structure watson-crick pair is allowed
                        if not self.config["sequence"][res1] in complement[self.config["sequence"][res2]]:
                            basepair_mismatch += [[res1, res2]]
            elif line.count('(') > 0:
                # maybe dot/bracket notation (((...)))
                print line
                self.config["secstruc"] = line
                left_brackets = []
                for i in range(len(line)):
                    if line[i] == '(':
                        left_brackets.append(i)
                    if line[i] == ')':
                        res1 = left_brackets[-1]
                        res2 = i
                        del (left_brackets[-1])
                        pair_map[res1] = res2
                        pair_map[res2] = res1
                        all_pairs.append([res1, res2])
                        # check if the secondary structure watson-crick pair is allowed
                        if not self.config["sequence"][res1] in complement[self.config["sequence"][res2]]:
                            basepair_mismatch += [[res1, res2]]
                if left_brackets:
                    raise SimulationException("unbalanced parentheses in secondary structure")
            elif line[:13] == 'CUTPOINT_OPEN':
                # handle cutpoints before the generic pair branch, which would choke on this keyword
                cutpoints_original = map(lambda x: int(x), string.split(line[14:]))
            else:
                cols = string.split(line)
                res1 = int(cols[0]) - 1
                res2 = int(cols[1]) - 1
                pair_map[res1] = res2
                pair_map[res2] = res1
                all_pairs.append([res1, res2])
                # check if the secondary structure watson-crick pair is allowed
                if not self.config["sequence"][res1] in complement[self.config["sequence"][res2]]:
                    basepair_mismatch += [[res1, res2]]

        # print all helix pairs that don't fit the allowed complements
        if basepair_mismatch:
            basepair_mismatch_str = [" "] * len(self.config["sequence"])
            for r in basepair_mismatch:
                basepair_mismatch_str[r[0]] = "^"
                basepair_mismatch_str[r[1]] = "^"
            print "".join(basepair_mismatch_str)
            # more verbose:
            for r in basepair_mismatch:
                res1 = r[0]
                res2 = r[1]
                resstr1 = str(res1 + 1) + self.config["sequence"][res1]
                resstr2 = str(res2 + 1) + self.config["sequence"][res2]
                print " " * res1 + "^" + " " * (res2 - res1 - 1) + "^"
                print " " * res1 + resstr1 + " " * (res2 - res1 - len(resstr1)) + resstr2
            raise SimulationException("some helix pairs in secondary structure cannot be realized (see above)")

        # print pair_map

        # Parse out stems
        already_in_stem = {}
        for i in range(numres):
            already_in_stem[i] = 0

        self.config["stems"] = []
        for i in range(numres):
            if i in pair_map and not already_in_stem[i]:
                # In a base pair
                k = i
                stem_res = [[k, pair_map[k]]]
                already_in_stem[k] = 1
                already_in_stem[pair_map[k]] = 1
                # Can we extend in one direction?
                while (k + 1) in pair_map and pair_map[k + 1] == pair_map[k] - 1 and not already_in_stem[k + 1]:
                    k += 1
                    stem_res.append([k, pair_map[k]])
                    already_in_stem[k] = 1
                    already_in_stem[pair_map[k]] = 1
                # Do not allow single WC base pairs.
                if len(stem_res) < 2:
                    print 'All stems must have length > 1 bp '
                    print stem_res
                    exit()
                self.config["stems"].append(stem_res)

        # Parse out motifs
        already_in_motif = {}
        for i in range(numres):
            already_in_motif[i] = 0

        motif_cutpoints = []
        self.config["motif_res_maps"] = []
        self.config["motif_stem_sets"] = []
        self.config["motifs"] = []
        for i in range(numres):
            if not already_in_motif[i] and (not already_in_stem[i] or (i > 0 and already_in_stem[i - 1] and pair_map[i] + 1 != pair_map[i - 1])):
                motif_res = []
                motif_stem_set = []
                cutpoints = []
                if i > 1:
                    # back up to beginning of stem.
                    k = i - 1
                    # first base pair.
                    motif_stem = [[k, pair_map[k]]]
                    motif_res.append(k)
                    motif_res.append(pair_map[k])
                    k -= 1
                    while k >= 0 and already_in_stem[k] and (pair_map[k] - 1 == pair_map[k + 1]):
                        motif_stem.append([k, pair_map[k]])
                        motif_res.append(k)
                        motif_res.append(pair_map[k])
                        k -= 1
                    motif_stem_set.append(motif_stem)
                    k += 1
                    cutpoints.append(pair_map[k])
                # print 'AFTER FIRST HELIX: ', motif_res
                k = i
                while k not in motif_res and k < numres:
                    # Move forward to next stem:
                    while k < numres and not already_in_stem[k]:
                        if already_in_motif[k]:
                            print 'Hey cant deal with pseudoknots!'
                            exit()
                        motif_res.append(k)
                        k += 1
                    stem_start = k
                    if k >= numres:
                        cutpoints.append(k - 1)
                        break
                    if k in motif_res:
                        break
                    motif_stem = [[k, pair_map[k]]]
                    motif_res.append(k)
                    motif_res.append(pair_map[k])
                    k += 1
                    while k < numres and already_in_stem[k] and pair_map[k - 1] == pair_map[k] + 1 and k not in motif_res:
                        motif_stem.append([k, pair_map[k]])
                        motif_res.append(k)
                        motif_res.append(pair_map[k])
                        k += 1
                    motif_stem_set.append(motif_stem)
                    cutpoints.append(k - 1)
                    # Next non-helical part..
                    k = pair_map[stem_start] + 1
                # print 'AFTER NEXT HELIX: ', motif_res

                motif_res.sort()
                motif_res_map = {}
                for k in range(len(motif_res)):
                    motif_res_map[motif_res[k]] = k
                    already_in_motif[motif_res[k]] = 1

                self.config["motifs"].append(motif_res)
                self.config["motif_stem_sets"].append(motif_stem_set)
                motif_cutpoints.append(cutpoints)
                self.config["motif_res_maps"].append(motif_res_map)
                # print 'CUTPOINTS ', cutpoints

        for i in range(len(self.config["stems"])):
            # Fasta
            tag = 'preparation/stem%d.fasta' % (i + 1)
            fid = open(tag, 'w')
            fid.write('>' + tag + '\n')
            stem_res = self.config["stems"][i]
            stem_length = len(stem_res)
            for k in range(stem_length):
                fid.write(self.config["sequence"][stem_res[k][0]])
                # print stem_res[k][0]+1,
            for k in range(stem_length):
                fid.write(self.config["sequence"][stem_res[stem_length - k - 1][1]])
                # print stem_res[stem_length-k-1][1]+1,
            # print
            fid.write('\n')
            fid.close()
            print 'Created: ', tag

            # pdb_file
            if native_pdb_file is not None:
                command = ["pdbslice.py", native_pdb_file, "-segment", "%d" % (stem_res[0][0] + 1), "%d" % (stem_res[-1][0] + 1), "%d" % (stem_res[-1][-1] + 1), "%d" % (stem_res[0][-1] + 1), "preparation/stem%d_" % (i + 1)]
                self.execute_command(command)
                native_pdb_file_subset = 'preparation/stem%d_%s' % (i + 1, native_pdb_file)
                print 'Created: ', native_pdb_file_subset

        # Output motif jobs
        for i in range(len(self.config["motifs"])):
            # Fasta
            motif_fasta_file = 'preparation/motif%d.fasta' % (i + 1)
            fid = open(motif_fasta_file, 'w')
            fid.write('>' + motif_fasta_file + '\n')
            motif_res = self.config["motifs"][i]
            motif_length = len(motif_res)
            for k in range(motif_length):
                fid.write(self.config["sequence"][motif_res[k]])
            fid.write('\n')
            fid.close()
            print 'Created: ', motif_fasta_file

            # params file
            motif_stem_set = self.config["motif_stem_sets"][i]
            motif_res_map = self.config["motif_res_maps"][i]
            motif_cutpoint = motif_cutpoints[i]
            motif_params_file = 'preparation/motif%d.params' % (i + 1)
            fid = open(motif_params_file, 'w')
            for k in range(len(motif_stem_set)):
                motif_stem = motif_stem_set[k]
                fid.write('STEM ')
                for n in range(len(motif_stem)):
                    fid.write(' PAIR %d %d W W A' % (motif_res_map[motif_stem[n][0]] + 1, motif_res_map[motif_stem[n][1]] + 1))
                fid.write('\n')
                fid.write('OBLIGATE PAIR %d %d W W A \n\n' % (motif_res_map[motif_stem[-1][0]] + 1, motif_res_map[motif_stem[-1][1]] + 1))
            motif_cutpoint.sort()
            # print motif_res
            # print motif_cutpoint
            if len(motif_cutpoint) > 1:
                fid.write('CUTPOINT_OPEN ')
                for k in range(len(motif_cutpoint)):
                    if motif_res_map[motif_cutpoint[k]] < (len(motif_res) - 1):
                        fid.write(' %d' % (motif_res_map[motif_cutpoint[k]] + 1))
                fid.write('\n')
            fid.close()
            print 'Created: ', motif_params_file

            # pdb_file
            if native_pdb_file is not None:
                command = ["pdbslice.py", native_pdb_file, "-subset"]
                for k in range(motif_length):
                    command += ["%d" % (motif_res[k] + 1)]
                command += ["preparation/motif%d_" % (i + 1)]
                self.execute_command(command)
                native_pdb_file_subset = 'preparation/motif%d_%s' % (i + 1, native_pdb_file)
                print 'Created: ', native_pdb_file_subset

            if data_file is not None:
                motif_data_file = 'preparation/motif%d.data' % (i + 1)
                fid_data = open(motif_data_file, 'w')
                fid_data.write('EXPOSE')
                for data in self.config["data_info"]:
                    if data[0] - 1 in motif_res_map.keys():
                        fid_data.write(' %d %s %s ' % (motif_res_map[data[0] - 1] + 1, data[1], data[2]))
                fid_data.write('\n')
                if len(self.config["backbone_burial_info"]) > 0:
                    fid_data.write('BACKBONE_BURIAL ')
                    for k in self.config["backbone_burial_info"]:
                        if k - 1 in motif_res_map.keys():
                            fid_data.write(' %d' % (motif_res_map[k - 1] + 1))
                    fid_data.write('\n')
                fid_data.close()
                print 'Created: ', motif_data_file

        if len(cutpoints_original) > 0:
            # noinspection PyUnboundLocalVariable
            fid.write('CUTPOINT_OPEN ')
            for cutpoint in cutpoints_original:
                fid.write(' %d' % (cutpoint + 1))
            fid.write('\n')

        cutpoints = []
        for i in range(len(self.config["motifs"])):
            motif_stem_set = self.config["motif_stem_sets"][i]
            motif_stem = motif_stem_set[0]
            possible_cutpoints = [motif_stem[0][0], motif_stem[1][1]]
            possible_cutpoints.sort()
            # print possible_cutpoints
            if possible_cutpoints[0] not in cutpoints:
                cutpoints.append(possible_cutpoints[0])

        params_file = "preparation/sequence.params"
        fid = open(params_file, 'w')
        if len(cutpoints) > 0:
            fid.write('CUTPOINT_CLOSED ')
            # cutpoints.sort()
            for cutpoint in cutpoints:
                fid.write(' %d' % (cutpoint + 1))
            fid.write('\n')
        # for cutpoint in cutpoints:
        #     fid.write( 'OBLIGATE PAIR %d %d W W A\n' % (cutpoint+1, pair_map[cutpoint]+1) )
        for i in range(len(self.config["stems"])):
            stem_res = self.config["stems"][i]
            fid.write('STEM ')
            for k in range(len(stem_res)):
                fid.write(' PAIR %d %d W W A ' % (stem_res[k][0] + 1, stem_res[k][1] + 1))
            fid.write('\n')
            fid.write('OBLIGATE PAIR %d %d W W A \n\n' % (stem_res[-1][0] + 1, stem_res[-1][1] + 1))
        fid.close()
        print 'Created: ', params_file

        # default cutpoint constraints
        assemble_cst_file = "preparation/cutpoints.cst"
        fid = open(assemble_cst_file, 'w')
        fid.write('[ atompairs ]\n')
        for cutpoint in cutpoints:
            fid.write("O3' %d P %d HARMONIC 1.619 2.0\n" % (cutpoint + 1, cutpoint + 2))
        fid.close()
        print 'Created: ', assemble_cst_file
    def create_helices(self, dry_run=False, threads=1):
        """
        Helix creation step. Create one ideal helix model for each helix.

        :param dry_run: don't actually run any external command
        :param threads: number of threads to use
        """
        self.check_config()
        commands = list()
        delete_glob("preparation/stem*.out")
        for i in range(len(self.config["stems"])):
            command = ["rna_helix", "-fasta", "preparation/stem%d.fasta" % (i + 1), "-out:file:silent", "preparation/stem%d.out" % (i + 1)]
            commands.append(Command(command, add_suffix="rosetta", dry_run=dry_run))
        self.execute_commands(commands, threads=threads)

        # rna_helix dumps <sequence>.pdb files in the working directory.
        # These can be useful for us, so move them to the preparation directory with a proper stemX.pdb filename.
        # TODO: What happens if there are multiple helices with the same sequence? In that case we need to move them
        # out of the way before running the next command.
        # Just disable multithreading here (not really needed with stem generation anyways)?
        for i in range(len(self.config["stems"])):
            sequence = [self.config["sequence"][j[0]] for j in self.config["stems"][i]]
            sequence += [self.config["sequence"][j[1]] for j in reversed(self.config["stems"][i])]
            shutil.move("%s.pdb" % ("".join(sequence)), "preparation/stem%d.pdb" % (i + 1))
    @staticmethod
    def parse_cst_name_and_filename(constraints):
        """
        Find and clean up constraints by name or filename.

        :param constraints: constraints name or filename
        :return: tuple of constraints name and filename
        """
        if constraints is None or constraints == "none":
            cst_name = "none"
            cst_file = None
        elif os.path.isfile(constraints):
            cst_file = constraints
            cst_name = splitext(basename(cst_file))[0]
        else:
            constraints = basename(constraints)
            cst_name, cst_ext = splitext(constraints)
            if cst_ext != ".cst":
                cst_name = constraints
            cst_file = "constraints/%s.cst" % cst_name
            check_file_existence(cst_file)
        return cst_name, cst_file

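    # Resolution sketch (hypothetical inputs; the bare-name case requires the
    # file to exist under constraints/):
    #
    #     parse_cst_name_and_filename(None)           # -> ("none", None)
    #     parse_cst_name_and_filename("foo/bar.cst")  # existing file ->
    #                                                 #    ("bar", "foo/bar.cst")
    #     parse_cst_name_and_filename("mycst")        # bare name ->
    #                                                 #    ("mycst", "constraints/mycst.cst")
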
    @staticmethod
    def parse_cst_file(constraints_file):
        """
        Parse a .cst file as a list of constraints.

        :param constraints_file: path to a .cst file
        :return: list of constraints
        """
        cst_info = []
        for line in utils.read_file_line_by_line(constraints_file):
            if len(line) > 6 and line[0] != '[':
                cols = string.split(line)
                atom_name1 = cols[0]
                res1 = int(cols[1])
                atom_name2 = cols[2]
                res2 = int(cols[3])
                cst_info.append([atom_name1, res1, atom_name2, res2, cols[4:]])
        return cst_info

    def prepare_cst(self, constraints=None, motifs_override=None):
        """
        Constraints preparation step. Prepare constraints files for motif generation and assembly.

        :param constraints: constraints selection
        :param motifs_override: optional name of a different set of constraints to use as motifs
        """
        self.check_config()
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        print "Constraints preparation:"
        print " constraints: %s" % cst_name
        dir_prediction = "predictions/%s" % cst_name
        dir_assembly = "predictions/%s/assembly" % cst_name
        dir_motifs = "predictions/%s/motifs" % cst_name
        if motifs_override:
            motifcst_name, motifcst_file = self.parse_cst_name_and_filename(motifs_override)
            print " motif constraints: %s" % motifcst_name
            if motifcst_name == cst_name:
                raise SimulationException("Motif override cst name can't be the same as the cst name!")
        utils.mkdir_p("predictions/%s" % cst_name)
        utils.mkdir_p(dir_assembly)
        if motifs_override:
            # TODO: just make all that stuff a cst config file
            # create MOTIF_OVERRIDE file and store the cst name of the motifs
            file_motif_override = "%s/MOTIF_OVERRIDE" % dir_prediction
            with open(file_motif_override, "w") as f:
                f.write(motifcst_name)

        # assembly constraints: start with default harmonic cutpoints
        assembly_cst = "%s/assembly.cst" % dir_assembly
        shutil.copy("preparation/cutpoints.cst", assembly_cst)
        print "Created: %s" % assembly_cst

        # if cst is "none" we are done now
        if cst_file is None:
            return

        # load constraints information
        cst_info = self.parse_cst_file(cst_file)

        # probably wrong, commented out for now!
        # cst_info = fix_atom_names_in_cst(cst_info, self.config["sequence"])

        # add tertiary constraints
        with open(assembly_cst, "a") as ft:
            for cst in cst_info:
                ft.write('%s %d %s %d %s \n' % (cst[0], cst[1], cst[2], cst[3], " ".join(map(str, cst[4]))))
        print "Added tertiary constraints: %s" % assembly_cst

        # extract relevant constraints for motif generation from global cst file if necessary
        if not motifs_override:
            utils.mkdir_p(dir_motifs)
            for i in range(len(self.config["motifs"])):
                motif_res_map = self.config["motif_res_maps"][i]
                motif_cst_file = '%s/motif%d.cst' % (dir_motifs, i + 1)
                fid_cst = open(motif_cst_file, 'w')
                fid_cst.write('[ atompairs ]\n')
                for cst in cst_info:
                    if cst[1] - 1 in motif_res_map.keys() and cst[3] - 1 in motif_res_map.keys():
                        fid_cst.write('%s %d %s %d %s\n' % (cst[0], motif_res_map[cst[1] - 1] + 1, cst[2], motif_res_map[cst[3] - 1] + 1, " ".join(map(str, cst[4]))))
                fid_cst.close()
                print 'Created: ', motif_cst_file
    def get_status(self, constraints=None, include_evaluation_data=False, include_motif_model_count=False, include_assembly_model_count=False):
        """
        Returns a dict containing status information for a cst.

        :param constraints: constraints selection
        :param include_motif_model_count: set to True to include model count (longer processing time)
        :param include_assembly_model_count: set to True to include model count (longer processing time)
        :param include_evaluation_data: set to True to include evaluation model count and native RMSD values (best of first 1, 5, 10 clusters) if available (longer processing time)
        :return: status dict
        """
        self.check_config()
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        dir_prediction = "predictions/%s" % cst_name
        dir_assembly = "predictions/%s/assembly" % cst_name
        dir_motifs = "predictions/%s/motifs" % cst_name
        dir_output = "predictions/%s/output" % cst_name
        file_motifoverride = dir_prediction + "/MOTIF_OVERRIDE"
        file_evaldata = "%s/evaldata.dat" % dir_output
        res = {"step_preparation_done": os.path.isfile(dir_assembly + "/assembly.cst"),
               "step_motifs_done": bool(glob.glob(dir_motifs + "/*.out")),
               "step_motifs_overridden": None,
               "step_assembly_done": bool(glob.glob(dir_assembly + "/*.out")),
               "step_evaluation_done": os.path.isfile(file_evaldata)}
        if os.path.isfile(file_motifoverride):
            with open(file_motifoverride, "r") as f:
                res["step_motifs_overridden"] = f.read().strip()

        # evaluation data
        if include_evaluation_data:
            res["evaluation_rmsd_native"] = None
            res["evaluation_model_count"] = None
            try:
                eval_data = EvalData.load_from_cst(cst_name)
                res["evaluation_model_count"] = eval_data.get_model_count()
                if "rmsd_native" in next(eval_data.models.itervalues()):
                    res["evaluation_rmsd_native"] = [m["rmsd_native"] for m in eval_data.get_models(["1/1", "1/5", "1/10"], kind="cluster_ntop")]
            except SimulationException:
                pass

        # motif model count
        if include_motif_model_count:
            res["model_count_motifs"] = {i: sum(map(get_model_count_in_silent_file, glob.glob("%s/motif%d*.out" % (dir_motifs, i)))) for i in xrange(1, len(self.config["motifs"]) + 1)}

        # assembly model count
        if include_assembly_model_count:
            res["model_count_assembly"] = sum(map(get_model_count_in_silent_file, (glob.glob("%s/assembly_*.out" % dir_assembly))))

        return res
    def create_motifs(self, nstruct=50000, cycles=20000, dry_run=False, seed=None, use_native_information=False, threads=1, constraints=None, motif_subset=None):
        """
        Motif generation step. Generate models for each motif.

        :param nstruct: number of models to create for each motif
        :param cycles: number of monte-carlo cycles per model
        :param dry_run: don't actually run any external command
        :param seed: optionally override random seed
        :param use_native_information: use native pdb file to calculate rmsd for each model
        :param threads: number of threads to use
        :param constraints: constraints selection
        :param motif_subset: specify a list of motifs to generate instead of all, counting starts at 1 (i.e. [1, 3, 4])
        """
        self.check_config()
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        print "Motif creation configuration:"
        print " constraints: %s" % cst_name
        print " cycles: %s" % cycles
        print " nstruct: %s" % nstruct
        print " dry_run: %s" % dry_run
        print " random_seed: %s" % seed
        print " threads: %s" % threads
        if motif_subset is not None:
            print " subset: %s" % motif_subset
        dir_motifs = "predictions/%s/motifs" % cst_name
        utils.mkdir_p("predictions/%s" % cst_name)
        utils.mkdir_p(dir_motifs)
        n_motifs = len(self.config["motifs"])

        # list of motifs that we should generate models for
        # when no subset was given create models for all motifs
        if motif_subset is None:
            motif_subset = [i + 1 for i in range(n_motifs)]
        else:
            # remove duplicates if any
            motif_subset = list(set(motif_subset))
            # make sure that all the numbers actually exist
            if max(motif_subset) > n_motifs:
                raise SimulationException("Invalid motif subset. Structure only contains %d motifs." % n_motifs)

        # check if motif constraints were created correctly
        if cst_file is not None:
            for i in motif_subset:
                check_file_existence("%s/motif%d.cst" % (dir_motifs, i), "Motif cst files not found. Please run the 'prepare-cst' step!")

        # merge all motifs and check what we have so far
        completed = {}
        for i in motif_subset:
            completed[i] = merge_silent_files("%s/motif%d.out" % (dir_motifs, i), "%s/motif%d_*.out" % (dir_motifs, i))

        commands = list()
        for i in motif_subset:
            # split the rest of the work in multiple jobs
            structs_missing = nstruct - completed[i]
            print "motif %d: completed: %d missing: %d" % (i, completed[i], structs_missing)
            if structs_missing <= 0:
                continue
            structs_threads = [0 for _ in range(threads)]
            for j in range(threads):
                structs_threads[j] = structs_missing / threads
            for j in range(structs_missing % threads):
                structs_threads[j] += 1
            command = ["rna_denovo",
                       "-fasta", "preparation/motif%d.fasta" % i,
                       "-params_file", "preparation/motif%d.params" % i,
                       "-cycles", "%d" % cycles,
                       "-mute", "all",
                       "-close_loops",
                       "-close_loops_after_each_move",
                       "-minimize_rna"]
            if cst_file is not None:
                command += ["-cst_file", '%s/motif%d.cst' % (dir_motifs, i)]
            if self.config["native_pdb_file"] is not None and use_native_information:
                command += ["-native", "preparation/motif%d_%s" % (i, self.config["native_pdb_file"])]
            if self.config["data_file"] is not None:
                command += ["-data_file", "preparation/motif%d.data" % i]
            if self.config["torsions_file"] is not None:
                command += ["-vall_torsions", "preparation/motif%d.torsions" % i]

            which_stems = []
            stem_chunk_res = []
            motif_stem_set = self.config["motif_stem_sets"][i - 1]
            motif_res_map = self.config["motif_res_maps"][i - 1]
            for k in range(len(motif_stem_set)):
                motif_stem = motif_stem_set[k]
                # need to find in stems
                for n in range(len(self.config["stems"])):
                    stem = self.config["stems"][n]
                    found_match = 0
                    for q in range(len(stem)):
                        if motif_stem[0][0] in stem[q]:
                            found_match = 1
                            break
                    if found_match:
                        which_stems.append(n)
                        break
                for q in range(len(stem)):
                    stem_chunk_res.append(motif_res_map[stem[q][0]] + 1)
                for q in range(len(stem)):
                    stem_chunk_res.append(motif_res_map[stem[len(stem) - q - 1][1]] + 1)

            command += ["-in:file:silent_struct_type", "rna", "-in:file:silent"]
            for n in which_stems:
                command += ["preparation/stem%d.out" % (n + 1)]
            command += ["-in:file:input_res"]
            command += self._make_tag_with_dashes(stem_chunk_res)

            for j in range(threads):
                if structs_threads[j] == 0:
                    continue
                command_full = command + ["-out:file:silent", "%s/motif%d_%d.out" % (dir_motifs, i, j + 1), "-nstruct", "%d" % (structs_threads[j])]
                if seed is not None:
                    command_full += ["-constant_seed", "-jran", "%d" % seed]
                    seed += 1
                commands.append(Command(command_full, add_suffix="rosetta", dry_run=dry_run))
        self.execute_commands(commands, threads=threads)

        # merge motifs
        for i in motif_subset:
            merge_silent_files("%s/motif%d.out" % (dir_motifs, i), "%s/motif%d_*.out" % (dir_motifs, i))
    def assemble(self, nstruct=50000, cycles=20000, constraints=None, dry_run=False, seed=None, use_native_information=False, threads=1):
        """
        Assembly step. Assemble helices and motifs into complete models.

        :param nstruct: number of models to create
        :param cycles: number of monte-carlo cycles per model
        :param constraints: constraints selection
        :param dry_run: don't actually run any external command
        :param seed: optionally override random seed
        :param use_native_information: use native pdb file to calculate rmsd for each model
        :param threads: number of threads to use
        """
        self.check_config()
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        print "Assembly configuration:"
        print " constraints: %s" % cst_name
        print " cycles: %s" % cycles
        print " nstruct: %s" % nstruct
        print " dry_run: %s" % dry_run
        print " random_seed: %s" % seed

        # Check if a different set of motifs should be used.
        file_motif_override = "predictions/%s/MOTIF_OVERRIDE" % cst_name
        if os.path.isfile(file_motif_override):
            with open(file_motif_override, "r") as f:
                motifcst_name = f.read().strip()
            print " using motifs: %s" % motifcst_name
        else:
            motifcst_name = cst_name

        dir_assembly = "predictions/%s/assembly" % cst_name
        dir_motifs = "predictions/%s/motifs" % motifcst_name
        file_assembly_cst = "%s/assembly.cst" % dir_assembly
        check_file_existence(file_assembly_cst, "Preparation incomplete. Please run 'prepare-cst'.")
        check_file_existence("%s/motif1.out" % dir_motifs, "Motifs not found. Please run 'create-motifs'.")

        commands = list()
        command = ["rna_denovo",
                   "-minimize_rna",
                   "-fasta", self.config["fasta_file"],
                   "-in:file:silent_struct_type", "binary_rna",
                   "-cycles", "%d" % cycles,
                   "-params_file", "preparation/sequence.params",
                   "-close_loops",
                   "-in:file:silent"]
        for i in range(len(self.config["stems"])):
            command += ["preparation/stem%d.out" % (i + 1)]
        for i in range(len(self.config["motifs"])):
            command += ["%s/motif%d.out" % (dir_motifs, i + 1)]
        if cst_file is not None:
            command += ["-cst_file", file_assembly_cst]

        chunk_res = []
        for n in range(len(self.config["stems"])):
            stem = self.config["stems"][n]
            for q in range(len(stem)):
                chunk_res.append(stem[q][0] + 1)
            for q in range(len(stem)):
                chunk_res.append(stem[len(stem) - q - 1][1] + 1)
        for n in range(len(self.config["motifs"])):
            motif_res = self.config["motifs"][n]
            for m in motif_res:
                chunk_res.append(m + 1)
        command += ["-in:file:input_res"]
        command += self._make_tag_with_dashes(chunk_res)

        if self.config["native_pdb_file"] is not None and use_native_information:
            command += ["-native", self.config["native_pdb_file"]]
        if self.config["torsions_file"] is not None:
            command += ["-vall_torsions", self.config["torsions_file"]]
        if self.config["data_file"] is not None:
            command += ["-data_file", self.config["data_file"]]

        # distribute nstruct among threads
        structs_threads = [0 for _ in range(threads)]
        for j in range(threads):
            structs_threads[j] = nstruct / threads
        for j in range(nstruct % threads):
            structs_threads[j] += 1

        for j in range(threads):
            if structs_threads[j] == 0:
                continue
            # In case no seed is specified, we do what rosetta does to get a random number, and seed it with that.
            # This way we know the number beforehand and can choose an appropriate output filename.
            # In case a seed was specified, we increment it here with the thread number.
            if seed is None:
                seed_used = struct.unpack("=i", os.urandom(4))[0]
            else:
                seed_used = seed + j
            command_full = command + ["-out:file:silent", "%s/assembly_%d.out" % (dir_assembly, seed_used), "-constant_seed", "-jran", "%d" % seed_used, "-nstruct", "%d" % structs_threads[j]]
            commands.append(Command(command_full, add_suffix="rosetta", dry_run=dry_run))
        self.execute_commands(commands, threads=threads)

    # TODO: set the cutoff value back to 4.0? It was set to 4.1 because the old bash script used integer comparison and even 4.09 was treated as 4.0
    def evaluate(self, constraints=None, cluster_limit=10, cluster_cutoff=4.1, full_evaluation=False):
        """
        Evaluation step. Extract model information, cluster models and calculate rmsd values.

        :param constraints: constraints selection
        :param cluster_limit: maximum number of clusters to produce
        :param cluster_cutoff: rmsd distance in A at which a new cluster is created
        :param full_evaluation: discard existing evaluation data, re-extract model information and re-calculate rmsd values
        """
        self.check_config()
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        print "Evaluation configuration:"
        print " constraints: %s" % cst_name
        print " cluster_limit: %s" % cluster_limit
        print " cluster_cutoff: %s" % cluster_cutoff
        dir_assembly = "predictions/%s/assembly" % cst_name
        dir_output = "predictions/%s/output" % cst_name
        dir_tmp = "predictions/%s/temp" % cst_name
        file_evaldata = "%s/evaldata.dat" % dir_output
        if not os.path.isdir("predictions/%s" % cst_name):
            raise SimulationException("No prediction directory for constraint '%s' found! Maybe you should assemble first?" % cst_name)
        files_assembly = sorted(glob.glob("%s/assembly_*.out" % dir_assembly), key=natural_sort_key)
        if len(files_assembly) == 0:
            raise SimulationException("No assembly files for constraint '%s' found! Maybe you should assemble first?" % cst_name)

        if not full_evaluation:
            if not os.path.isdir(dir_output) or not os.path.isfile(file_evaldata):
                # first time, silently do a full eval
                full_evaluation = True
            else:
                try:
                    eval_data = EvalData.load_from_file(file_evaldata)
                except SimulationException:
                    # file broken or missing, force full evaluation
                    print "Warning: %s missing or broken, running full evaluation." % file_evaldata
                    full_evaluation = True
        print " full_evaluation: %s" % full_evaluation

        # cleanup
        if full_evaluation:
            delete_glob(dir_output)
            delete_glob(dir_tmp)
            # create empty evaluation data
            eval_data = EvalData()
        else:
            # clean out old cluster information from models
            for model in eval_data.models.itervalues():
                if "cluster" in model:
                    del model["cluster"]
            eval_data.clusters = {}
            delete_glob(dir_output + os.sep + "*.pdb")
            delete_glob(dir_output + os.sep + "*.out")
        utils.mkdir_p(dir_output)
        utils.mkdir_p(dir_tmp)

        # extract info from .out files
        if full_evaluation:
            # store all the rosetta scores in a dict
            scores_headers = None
            # loop over all out files matching the constraint
            for i, f in enumerate(files_assembly):
                print "processing rosetta silent file: %s..." % f
                # read out file and store a dict of the scores and the full score line
                regex_score = re.compile("^SCORE:\s+(.*)$")
                for line in utils.read_file_line_by_line(f):
                    m = regex_score.match(line)
                    if m:
                        # split rosetta score entries into a list
                        scores = m.group(1).split()
                        # store the header names if not already known
                        if scores[-1] == "description":
                            if not scores_headers:
                                scores_headers = scores
                            continue
                        # create a score dict
                        scores_dict = {}
                        for s_index, s_name in enumerate(scores_headers):
                            # store float values as float
                            try:
                                scores_dict[scores_headers[s_index]] = float(scores[s_index])
                            except ValueError:
                                scores_dict[scores_headers[s_index]] = scores[s_index]
                        # name models exactly like rosetta would (that is, append _<num> when already present)
                        j = 1
                        tag = scores_dict["description"]
                        while tag in eval_data.models:
                            tag = "%s_%d" % (scores_dict["description"], j)
                            j += 1
                        eval_data.models[tag] = {"source_file": basename(f),
                                                 "score": scores_dict["score"],
                                                 "tag": tag,
                                                 "tag_source": scores_dict["description"],
                                                 "rosetta_scores": scores_dict}

        # clustering
        eval_data.cluster_cutoff = cluster_cutoff
        print "clustering models..."
        filename_clusters = "%s/clusters.out" % dir_output
        sys.stdout.write(" ")
        regex_clusters = re.compile("^.*RNA_PREDICT (old|new) cluster ([0-9]+) score ([-0-9.]+) model (S_[0-9_]+)$")
        for line in self.execute_command_and_capture(["rna_cluster", "-in:file:silent"] + files_assembly + ["-cluster:radius", "%s" % cluster_cutoff, "-nstruct", str(cluster_limit), "-out:file:silent", filename_clusters], add_suffix="rosetta", quiet=True):
            m = regex_clusters.match(line)
            if m:
                eval_data.models[m.group(4)]["cluster"] = int(m.group(2))
                if m.group(1) == "new":
                    eval_data.clusters[int(m.group(2))] = {"primary_model": m.group(4)}
                print " %s cluster: %02s, model: %-10s, score: %.3f" % ("new" if m.group(1) == "new" else " ", m.group(2), m.group(4), float(m.group(3)))

        # extract cluster pdbs
        print "extracting cluster decoy pdbs..."
        owd = os.getcwd()
        try:
            os.chdir(dir_output)
            self.execute_command(["rna_extract", "-in:file:silent", basename(filename_clusters), "-in:file:silent_struct_type", "rna"], add_suffix="rosetta")
        finally:
            os.chdir(owd)
        # loop over all extracted pdb files
        for i, fe in enumerate(sorted(glob.glob("%s/S_*.pdb" % dir_output), key=natural_sort_key)):
            shutil.move(fe, "%s/cluster_%d.pdb" % (dir_output, i + 1))

        # rmsdcalc helper function
        # noinspection PyShadowingNames
        def calculate_rmsd(name, filename_comparison):
            # calculate rmsd values to the given comparison structure for all models
            filename_tmp = "%s/rmsd.out" % dir_tmp
            print "calculating rmsd values to %s for all models..." % name
            sys.stdout.write(" ")
            regex_rmsd = re.compile("^All atom rmsd over moving residues: (S_[0-9_]+) ([-0-9.]+)$")
            for line in self.execute_command_and_capture(["rna_score", "-in:file:silent"] + files_assembly + ["-in:file:native", filename_comparison, "-score:just_calc_rmsd", "-out:file:silent", filename_tmp], add_suffix="rosetta", quiet=True):
                m = regex_rmsd.match(line)
                if m:
                    print m.group(1), m.group(2)
                    eval_data.models[m.group(1)]["rmsd_%s" % name] = float(m.group(2))
            delete_glob(filename_tmp, print_notice=False)

        # only calculate rmsds if full_evaluation is needed or wanted
        if full_evaluation:
            # calculate rmsd to native structure if native pdb available
            if self.config["native_pdb_file"] is not None:
                calculate_rmsd("native", self.config["native_pdb_file"])
            # calculate rmsd to best model
            calculate_rmsd("cluster_1", "%s/cluster_1.pdb" % dir_output)

        # save evaluation data
        eval_data.save_to_file(file_evaldata)
    def evaluate_custom(self, constraints=None, dca_prediction_filename="dca/dca.txt", full_evaluation=False, threshold=7.5, radius=2, number_dca_predictions=100, threads=4):
        """
        Custom scoring algorithm that inspects neighboring residues of dca contact pairs.

        :param constraints: constraints selection
        :param dca_prediction_filename: dca filename
        :param full_evaluation: discard existing evaluation data and extracted models, re-extract and re-calculate distance information
        :param threshold: threshold in A to count a contact
        :param radius: number of adjacent residues to inspect
        :param number_dca_predictions: maximum number of DCA predictions to use
        :param threads: number of threads to use for extraction
        """
        self.check_config()
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        print "Evaluation configuration:"
        print " constraints: %s" % cst_name
        print " dca_file: %s" % dca_prediction_filename
        print " dca_count: %s" % number_dca_predictions
        print " threshold: %f" % threshold
        print " radius: %s" % radius
        dir_assembly = "predictions/%s/assembly" % cst_name
        dir_output = "predictions/%s/output" % cst_name
        dir_tmp = "predictions/%s/temp" % cst_name
        file_evaldata = "%s/evaldata.dat" % dir_output
        file_matrices = "%s/matrices.dat" % dir_tmp
        files_assembly = sorted(glob.glob("%s/assembly_*.out" % dir_assembly), key=natural_sort_key)
        try:
            eval_data = EvalData.load_from_file(file_evaldata)
        except SimulationException:
            raise SimulationException("Normal --evaluate step needs to be run before --evaluate-custom.")
        slen = len(self.config["sequence"])
        utils.mkdir_p(dir_output)
        utils.mkdir_p(dir_tmp)

        extract = full_evaluation
        build_matrices = full_evaluation

        # check if we can use existing data
        if not build_matrices:
            # try to load existing distance matrix
            print "loading existing distance matrices..."
            try:
                with open(file_matrices, "r") as f:
                    distance_matrices = pickle.load(f)
            except (IOError, pickle.PickleError, AttributeError, EOFError, IndexError, KeyError):
                print "error loading existing matrices. forcing rebuild..."
                build_matrices = True
        # check if we need to extract pdb files
        if not os.path.isfile("%s/%s.pdb" % (dir_tmp, next(eval_data.models.iterkeys()))):
            print "no extracted pdb files found. forcing extraction..."
            extract = True

        # extract all pdb files to temp dir
        if extract:
            delete_glob(dir_tmp)
            print "extracting pdb files..."
            commands = []
            for f in files_assembly:
                commands.append(Command(["rna_extract", "-in:file:silent", f, "-in:file:silent_struct_type", "rna", "-out:prefix", dir_tmp + os.sep + splitext(basename(f))[0] + "_"], add_suffix="rosetta"))
            self.execute_commands(commands, threads=threads)
            for model in eval_data.models.itervalues():
                shutil.move(dir_tmp + os.sep + splitext(basename(model["source_file"]))[0] + "_" + model["tag_source"] + ".pdb", dir_tmp + os.sep + model["tag"] + ".pdb")

        # build residue distance matrix for all models
        if build_matrices:
            delete_glob(file_matrices)
            print "building matrices for %d models..." % len(eval_data.models)
            distance_matrices = {}
            x = 1
            for model in eval_data.models.itervalues():
                matrix = np.zeros([len(self.config["sequence"]), len(self.config["sequence"])])
                chain = pdbtools.parse_pdb("", dir_tmp + os.sep + model["tag"] + ".pdb")[0].child_list[0]
                for i in range(2, slen + 1):
                    for j in range(2, slen + 1):
                        if j == i:
                            continue
                        matrix[i - 1][j - 1] = chain[i]["P"] - chain[j]["P"]
                distance_matrices[model["tag"]] = matrix
                if x % 10 == 0:
                    sys.stdout.write("%d..." % x)
                    sys.stdout.flush()
                x += 1
            if x % 10 != 0:
                print "%d..." % x
            print "done."
            with open(file_matrices, "w") as f:
                pickle.dump(distance_matrices, f)

        # calculate scores
        print "calculating scores..."
        dca = dcatools.parse_dca_data(dca_prediction_filename)
        for model in eval_data.models.itervalues():
            score_custom = 0
            matrix = distance_matrices[model["tag"]]
            c_count = 0
            for contact in dca:
                c_count += 1
                if c_count > number_dca_predictions:
                    break
                found = False
                for i in range(-radius, radius + 1):
                    if contact.res1 + i < 1 or contact.res1 + i > slen:
                        continue
                    for j in range(-radius, radius + 1):
                        if contact.res2 + j < 1 or contact.res2 + j > slen:
                            continue
                        if abs(contact.res1 + i - contact.res2 - j) <= 3:
                            continue
                        if matrix[contact.res1 + i - 1][contact.res2 + j - 1] < threshold:
                            found = True
                            score_custom -= 1
                            break
                    if found:
                        break
            print "model: %-11s, score: %8.3f, score_custom: %3d" % (model["tag"], model["score"], score_custom)
            model["score_custom"] = score_custom
        eval_data.save_to_file(file_evaldata)

    # returns the path to the extracted pdb file
    def extract_pdb(self, constraints, model):
        """
        Extract PDB file of a model to the tmp directory.

        :param constraints: constraints selection
        :param model: model information dict (as returned by EvalData.get_models)
        :return: path to the extracted PDB file
        """
        cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
        dir_assembly = "predictions/%s/assembly" % cst_name
        dir_tmp = "predictions/%s/temp" % cst_name
        prefix = dir_tmp + os.path.sep + "tmp_"
        pdbfile = "%s/%s.pdb" % (dir_tmp, model["tag"])
        self.execute_command(["rna_extract", "-in:file:silent", "%s/%s" % (dir_assembly, model["source_file"]), "-out:prefix", prefix, "-tags", model["tag_source"]], add_suffix="rosetta", quiet=True)
        shutil.move("%s/tmp_%s.pdb" % (dir_tmp, model["tag_source"]), pdbfile)
        return pdbfile
    @staticmethod
    def get_models(constraints, model_list, kind="tag"):
        """
        Returns a selection of models with specific properties.

        :param constraints: constraints selection
        :param model_list: see EvalData.get_models
        :param kind: see EvalData.get_models
        """
        return EvalData.load_from_cst(constraints).get_models(model_list=model_list, kind=kind)
    @staticmethod
    def print_models(constraints, model_list, kind="tag"):
        """
        Print a set of models.

        :param constraints: constraints selection
        :param model_list: see EvalData.print_models
        :param kind: see EvalData.print_models
        """
        EvalData.load_from_cst(constraints).print_models(model_list=model_list, kind=kind)
    def extract_models(self, constraints, model_list, kind="tag"):
        """
        Extract PDB files of a set of models.

        :param constraints: constraints selection
        :param model_list: see EvalData.print_models
        :param kind: see EvalData.print_models
        """
        for model in EvalData.load_from_cst(constraints).get_models(model_list=model_list, kind=kind):
            pdbfile = self.extract_pdb(constraints, model)
            print "Extracted: %s" % pdbfile

    # TODO: include mapping_mode?
    @staticmethod
    def _create_constraints_output_filename(input_filename, output_filename, cst_function, number_dca_predictions=None, output_format="%n_%f"):
        """
        Create a reasonable constraints output filename based on input filename and a formatting string.
        The formatting string can include placeholders.

        :param input_filename: input cst filename
        :param output_filename: fixed output filename or basename or None to automatically format using output_format
        :param cst_function: rosetta function
        :param number_dca_predictions: maximum number of DCA predictions to use
        :param output_format: formatting string containing placeholders:

            - %f: rosetta function
            - %n: basename of source file
            - %d: number of dca predictions

        :return: output filename
        """
        source_basename = splitext(basename(input_filename))[0]
        cst_function_underscore = cst_function.replace(" ", "_")
        # choose a default filename, if none is given
        if output_filename is None:
            output_filename = "constraints/%s.cst" % output_format
        # check if a path was given, if not put it in the constraints dir
        elif basename(output_filename) == output_filename:
            output_filename = "constraints/%s" % output_filename
        # make sure the filename always ends in .cst
        if splitext(output_filename)[1] != ".cst":
            output_filename += ".cst"
        # replace placeholders with actual values
        output_filename = output_filename.replace("%f", cst_function_underscore).replace("%n", source_basename).replace("%d", str(number_dca_predictions))
        return output_filename

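    # Naming sketch (hypothetical call): with input "dca/dca.txt", function
    # "FADE -100 26 20 -2 2", 100 predictions and output_format "%d%n_%f"
    # (the default used by make_constraints), the generated filename would be
    #
    #     constraints/100dca_FADE_-100_26_20_-2_2.cst
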
    def parse_dca_filter_string(self, line):
        """
        Parse a text string and turn it into a list of DCA filters.

        Multiple filters are separated by commas and have the following format:
        ``<filter>:<arg>:<arg>:...``

        Threshold filter: Look up dca contacts in a PDB file, discard or keep a contact depending on whether the contact is realized:

        Format: ``threshold:<n>:<cst>:<mode>:<model>``

        - n: threshold (> 0: keep below, < 0: keep above)
        - cst: constraints selection to look up PDB file
        - mode, model: model selection mode (see EvalData.get_models for details)

        Example: ``threshold:8.0:100rnaDCA_FADE_-100_26_20_-2_2:cluster:1,threshold:-6.0:100rnaDCA_FADE_-100_26_20_-2_2:cluster:1``

        None filter: Empty filter

        Format: ``none``

        :param line: string to parse
        :return: list of DcaFilter objects
        """
        filter_chain = []
        # filters are split by ","
        for filtertext in line.split(","):
            # initialize to None to suppress warning
            dca_filter = None
            # filter fields are split by ":"
            fields = filtertext.split(":")
            if fields[0] == "none":
                continue
            elif fields[0] == "threshold":
                threshold = float(fields[1])
                cst_name = fields[2]
                model_kind = fields[3]
                model_i = fields[4]
                eval_data = EvalData.load_from_cst(cst_name)
                model = eval_data.get_models([model_i], model_kind)[0]
                pdbfile = self.extract_pdb(cst_name, model)
                filter_pdb_chain = pdbtools.parse_pdb("", pdbfile)[0].child_list[0]
                dca_filter = dcatools.DcaFilterThreshold(filter_pdb_chain, abs(threshold), keep_below=threshold > 0, mode="minimum_heavy")
            filter_chain.append(dca_filter)
        return filter_chain

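    # Parsing sketch (the filter string from the docstring example; the shown
    # result is schematic): this yields a two-stage chain that keeps DCA
    # contacts realized below 8.0 A in cluster 1, then keeps those above 6.0 A:
    #
    #     chain = prediction.parse_dca_filter_string(
    #         "threshold:8.0:100rnaDCA_FADE_-100_26_20_-2_2:cluster:1,"
    #         "threshold:-6.0:100rnaDCA_FADE_-100_26_20_-2_2:cluster:1")
    #     # -> [DcaFilterThreshold(..., keep_below=True),
    #     #     DcaFilterThreshold(..., keep_below=False)]
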
    def make_constraints(self, dca_prediction_filename="dca/dca.txt", output_filename=None, number_dca_predictions=100, cst_function="FADE -100 26 20 -2 2", filter_text=None, mapping_mode="minAtom"):
        """
        Create a set of constraints from a DCA prediction.

        :param dca_prediction_filename: DCA input file
        :param output_filename: fixed output filename or None to automatically create
        :param number_dca_predictions: maximum number of DCA predictions to use
        :param cst_function: rosetta function and parameters as text string
        :param filter_text: optional list of DCA filters (see parse_dca_filter_string for details)
        :param mapping_mode: atom-to-atom mapping mode to use, supported values: ``minAtom`` or ``pOnly``
        """
        self.check_config()
        output_filename = self._create_constraints_output_filename(dca_prediction_filename, output_filename, cst_function, number_dca_predictions, "%d%n_%f")
        print "Constraints creation:"
        print " dca_prediction_filename: %s" % dca_prediction_filename
        print " output_filename: %s" % output_filename
        print " number_dca_predictions: %d" % number_dca_predictions
        print " mode: %s" % mapping_mode
        print " function: %s" % cst_function
        print " filter: %s" % filter_text
        check_file_existence(dca_prediction_filename)

        # load dca contacts from file
        dca = dcatools.parse_dca_data(dca_prediction_filename)

        # filter dca data
        if filter_text is not None:
            print "Filtering dca data:"
            dca_filter_chain = self.parse_dca_filter_string(filter_text)
            dcatools.filter_dca_data(dca_data=dca, dca_filter_chain=dca_filter_chain)

        # create constraints
        print "Creating constraints:"
        cst_info = dcatools.build_cst_info_from_dca_contacts(dca, sequence=self.config["sequence"], mapping_mode=mapping_mode, cst_function=cst_function, number_dca_predictions=number_dca_predictions)

        # write to file
        with open(output_filename, "w") as out:
            for c in cst_info:
                out.write("%s %d %s %d %s\n" % (c[0], c[1], c[2], c[3], " ".join(map(str, c[4]))))
    def edit_constraints(self, constraints, output_filename=None, cst_function="FADE -100 26 20 -2 2"):
        """
        Edit an existing .cst file, replacing the rosetta function.

        :param constraints: constraints selection
        :param output_filename: fixed output filename or None to automatically create
        :param cst_function: rosetta function and parameters as text string
        """
        cst_name, input_filename = self.parse_cst_name_and_filename(constraints)
        output_filename = self._create_constraints_output_filename(input_filename, output_filename, cst_function)
        print "Constraints editing:"
        print " input_filename: %s" % input_filename
        print " output_filename: %s" % output_filename
        print " function: %s" % cst_function
        check_file_existence(input_filename)
        if input_filename == output_filename:
            raise SimulationException("Input and output filename cannot be the same")
        pattern = re.compile(r"^(\S+\s+\d+\s+\S+\s+\d+)\s+(\S+)\s+.+$")
        with open(output_filename, "w") as output_fd:
            for line in utils.read_file_line_by_line(input_filename):
                m = pattern.match(line)
                output_fd.write("%s %s\n" % (m.group(1), cst_function))
    def print_status(self, native_compare=False, csts=None):
        """
        Print summary of predictions and their current status.

        Output format always contains the following columns:

        - P: preparation step
        - M: motif generation
        - A: assembly
        - E: evaluation

        If a step is completed, ``X`` is shown, ``-`` otherwise.
        For motif generation a ``*`` may be shown to indicate that models from a different set of constraints are used.

        If *native_compare* is set to True another 4 columns are printed:

        - 1: native rmsd score of the first cluster
        - 5: lowest native rmsd score of the first 5 clusters
        - 10: lowest native rmsd score of the first 10 clusters
        - n: number of models

        :param native_compare: print rmsd comparison to native structure
        :param csts: list of constraints to include in output (default: all)
        """
        self.check_config()
        if native_compare and self.config["native_pdb_file"] is None:
            raise SimulationException("Cannot compare without native information.")
        if csts:
            # cst selection
            cst_names = [self.parse_cst_name_and_filename(cst)[0] for cst in csts]
        else:
            cst_names = self.get_csts()
        print "Status: %s" % self.config["name"]
        print " cst                                 P M A E" + ("     1     5    10     n" if native_compare else "")
        print " -------------------------------------------" + ("--------------------------" if native_compare else "")
        for cst_name in cst_names:
            status = self.get_status(cst_name, include_evaluation_data=native_compare)
            done_preparation = "X" if status["step_preparation_done"] else "-"
            done_motifs = "*" if status["step_motifs_overridden"] else ("X" if status["step_motifs_done"] else "-")
            done_assembly = "X" if status["step_assembly_done"] else "-"
            done_evaluation = "X" if status["step_evaluation_done"] else "-"
            line = " %-035s %s %s %s %s" % (cst_name, done_preparation, done_motifs, done_assembly, done_evaluation)
            if native_compare:
                if "evaluation_rmsd_native" in status and status["evaluation_rmsd_native"] is not None:
                    rmsds = tuple("%.2f" % x for x in status["evaluation_rmsd_native"])
                else:
                    rmsds = ("-", "-", "-")
                line += " %05s %05s %05s" % rmsds
                line += " %05s" % (status["evaluation_model_count"] if "evaluation_model_count" in status and status["evaluation_model_count"] is not None else "-")
            print line