# coding: utf-8
import glob
import numpy as np
import os
from os.path import splitext, basename, abspath
try:
import cPickle as pickle
except ImportError:
import pickle
import pprint
import re
import shutil
import string
import struct
import subprocess
import sys
import time
from . import dcatools
from . import pdbtools
from . import utils
def check_file_existence(path, alternative_error_text=None):
    """
    Make sure a file exists and raise an exception otherwise.

    :param path: filename to check
    :param alternative_error_text: alternative error text passed to the exception
    :raise SimulationException: if path is not an existing regular file
    """
    if os.path.isfile(path):
        return
    message = alternative_error_text
    if message is None:
        message = "Cannot find file: %s" % path
    raise SimulationException(message)
def check_dir_existence(path, alternative_error_text=None):
    """
    Make sure a directory exists and raise an exception otherwise.

    :param path: directory to check
    :param alternative_error_text: alternative error text passed to the exception
    :raise SimulationException: if path is not an existing directory
    """
    if os.path.isdir(path):
        return
    message = alternative_error_text
    if message is None:
        message = "Cannot find directory: %s" % path
    raise SimulationException(message)
def delete_glob(pattern, print_notice=True):
    """
    Helper function to delete files and directories while expanding shell globs.

    Deletion is best effort: directories are removed recursively and errors
    during removal are silently ignored.

    :param pattern: glob pattern of paths to delete
    :param print_notice: if True print a verbose notice for each path
    """
    for f in glob.glob(pattern):
        if print_notice:
            print("deleting %s..." % f)
        try:
            if os.path.isdir(f):
                shutil.rmtree(f, ignore_errors=True)
            else:
                os.remove(f)
        # os.remove raises OSError (not IOError) when the file vanished or
        # is inaccessible; catching OSError keeps the intended best-effort
        # semantics (the old `except IOError` let those errors escape)
        except OSError:
            pass
def get_model_count_in_silent_file(silent_file):
    """
    Returns the number of models present in a Rosetta silent file.

    Models are counted by "SCORE:" lines that are followed by a numeric
    value (the score header line "SCORE: score ..." does not match).

    :param silent_file: filename of the silent file
    :return: model count
    """
    # raw string avoids invalid escape sequences ("\s") in the pattern
    pattern_model = re.compile(r"^SCORE:\s+[0-9.-]+.*")
    with open(silent_file, "r") as t:
        return sum(1 for line in t if pattern_model.match(line))
def merge_silent_files(target, sources):
    """
    Merges rosetta silent files (.out).

    All source files matching the glob pattern are appended to target.
    Model numbers are incremented uniquely. Source files are deleted after
    a successful merge.

    :param target: target filename
    :param sources: glob pattern of source filenames
    :return: total model count in target
    """
    # see how many structures we have already, so numbering can continue
    n = get_model_count_in_silent_file(target) if os.path.isfile(target) else 0
    # loop through all source files and append all new structures to target
    # while adjusting the numbering
    # header lines: sequence line and the score column description line
    pattern_header = re.compile(r"^(?:SEQUENCE:|SCORE:\s+score).*")
    # data lines end in a model tag S_<number> that gets renumbered
    pattern_normal = re.compile(r"^(.*)S_\d+$")
    for source in glob.glob(sources):
        print("merging %s into %s" % (source, target))
        with open(target, "a+") as t:
            with open(source, "r") as s:
                for line in s:
                    m = pattern_header.match(line)
                    if m:
                        # only write header if this is the first one
                        if n == 0:
                            t.write(line)
                        continue
                    m = pattern_normal.match(line)
                    if m:
                        # a new SCORE: line marks the start of the next model
                        if line.startswith("SCORE:"):
                            n += 1
                        t.write("%sS_%06d\n" % (m.group(1), n))
                        continue
                    t.write(line)
    # delete source files
    delete_glob(sources, print_notice=True)
    # return total number of structures
    return n
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Sort key for sorted() that orders embedded digit runs numerically."""
    key = []
    for part in re.split(_nsre, s):
        key.append(int(part) if part.isdigit() else part.lower())
    return key
# TODO: Correcting atom names IS MOST LIKELY WRONG! It is commented out for now and can probably be removed completely.
def fix_atom_names_in_cst(cst_info, sequence):
    """
    Switches N1 and N3 atom names in a residue.

    Pyrimidines (c, u) get N1 replaced by N3, purines (a, g) get N3
    replaced by N1, on both atoms of each constraint.

    :param cst_info: list of constraints, each [atom_name1, res1, atom_name2, res2, function]
    :param sequence: residue sequence in lower case
    :return: modified list of constraints
    """
    pyrimidines = ['c', 'u']
    purines = ['a', 'g']
    cst_info_new = []
    for cst in cst_info:
        atom_name1, res1, atom_name2, res2, function = cst
        # residue numbers are 1-based, sequence indexing is 0-based
        if sequence[res1 - 1] in pyrimidines and atom_name1 == 'N1':
            atom_name1 = 'N3'
            print 'correcting atom name for ', res1
        if sequence[res2 - 1] in pyrimidines and atom_name2 == 'N1':
            atom_name2 = 'N3'
            print 'correcting atom name for ', res2
        if sequence[res1 - 1] in purines and atom_name1 == 'N3':
            atom_name1 = 'N1'
            print 'correcting atom name for ', res1
        if sequence[res2 - 1] in purines and atom_name2 == 'N3':
            atom_name2 = 'N1'
            print 'correcting atom name for ', res2
        cst_info_new.append([atom_name1, res1, atom_name2, res2, function])
    return cst_info_new
class SimulationException(Exception):
    """Raised for foreseeable prediction errors during simulation steps."""
    pass
class Command(object):
    """
    Helper class bundling an external program call with additional flags.
    """

    def __init__(self, command, add_suffix=None, dry_run=False, print_commands=True, stdin=None, quiet=False):
        """
        Create a new command wrapper.

        :param command: list of external command and parameters
        :param add_suffix: type of suffix for the first entry in command; only ``None`` or ``"rosetta"`` are supported
        :param dry_run: don't actually execute anything
        :param print_commands: print the command line when running
        :param stdin: optional text to use as standard input
        :param quiet: hide output of the external program
        """
        self.command = command
        self.add_suffix = add_suffix
        self.dry_run = dry_run
        self.print_commands = print_commands
        self.stdin = stdin
        self.quiet = quiet

    def get_full_command(self, sysconfig):
        """
        Return the command adjusted for the user / system configuration.

        Adds the configured Rosetta path and suffix when requested, and
        prepends an stdbuf invocation when output buffering is configured.

        :param sysconfig: instance of SysConfig
        :return: suffixed command as a list (the stored command is not modified)
        """
        full = list(self.command)
        if self.add_suffix == "rosetta":
            full[0] = "%s%s%s" % (sysconfig.rosetta_exe_path, full[0], sysconfig.rosetta_exe_suffix)
        if sysconfig.subprocess_buffsize is not None:
            full = ["stdbuf", "-o", sysconfig.subprocess_buffsize] + full
        return full
class EvalData(object):
    """
    Evaluation data storing model and cluster information.

    models maps a model tag (e.g. "S_000123_5") to a dict of model
    properties (rosetta scores, rmsd values, ...); clusters maps a cluster
    number to cluster information including its "primary_model" tag;
    cluster_cutoff is the rmsd cutoff used when clustering.
    """

    def __init__(self):
        """
        Create empty EvalData object.
        """
        self.models = {}
        self.clusters = {}
        self.cluster_cutoff = 0.0

    @staticmethod
    def load_from_cst(constraints):
        """
        Load evaluation data from a prediction.

        :param constraints: constraints selection
        :return: EvalData instance loaded from the prediction's output directory
        """
        # RNAPrediction is defined later in this module
        cst_name, cst_file = RNAPrediction.parse_cst_name_and_filename(constraints)
        return EvalData.load_from_file("predictions/%s/output/evaldata.dat" % cst_name)

    @staticmethod
    def load_from_file(filename):
        """
        Load evaluation data from a file.

        :param filename: path to a file containing the pickled evaluation data
        :return: EvalData instance
        :raise SimulationException: if the file is missing, broken, or lacks cluster information
        """
        try:
            with open(filename, "r") as f:
                eval_data = pickle.load(f)
                # sanity check: the stored models must carry cluster rmsd data
                # (itervalues: Python 2 dict iterator)
                if "rmsd_cluster_1" not in next(eval_data.models.itervalues()):
                    raise SimulationException("evaluation data file '%s' does not contain needed information" % filename)
                return eval_data
        except (IOError, pickle.PickleError, AttributeError, EOFError, IndexError, KeyError):
            # file broken or missing, force full evaluation
            raise SimulationException("evaluation data file '%s' missing or broken" % filename)

    def save_to_file(self, filename):
        """
        Dump evaldata to a file.

        :param filename: path to a file to store the pickled data
        """
        with open(filename, "w") as f:
            pickle.dump(self, f)

    def get_model_count(self):
        """
        Returns the number of models in the evaluation data.

        :return: number of models
        """
        return len(self.models)

    def get_models(self, model_list, kind="tag"):
        """Returns a selection of models with specific properties.

        :param model_list: list of models to get, might be a list of numbers, or a list of model names
        :param kind: model selection mode:

            - ``tag``: str: internal name such as S_000123_5
            - ``top``: int: models ordered by score
            - ``ntop``: int: models ordered by rmsd_native
            - ``cluster``: int: nth cluster decoy
            - ``cluster_ntop``: n[/m]: nth cluster ordered by native rmsd [of first m]

        :return: list of selected models
        :raise SimulationException: on invalid kind, tag, or selection
        """
        # initialize list
        models_sorted = None
        results = []
        for model_i in model_list:
            # decide which model we want
            if kind == "cluster":
                model_i = int(model_i)
                tag = self.clusters[model_i]["primary_model"]
            elif kind == "cluster_ntop":
                # "n/m" limits the selection to the first m clusters
                if "/" in model_i:
                    i, limit = (int(x) for x in model_i.split("/"))
                    if i > limit:
                        raise SimulationException("get_models: Cluster number (%d) can't be larger than the limit (%d)" % (i, limit))
                else:
                    i = int(model_i)
                    limit = None
                # Python 2: dict.values() returns a list, so slicing works
                model_selection = [self.models[m["primary_model"]] for m in self.clusters.values()[:limit]]
                models_sorted = sorted(model_selection, key=lambda i: i["rmsd_native"])
                tag = models_sorted[i - 1]["tag"]
            elif kind == "tag":
                tag = model_i
            elif kind == "top":
                model_i = int(model_i)
                # do we need to create a sorted list?
                if models_sorted is None:
                    models_sorted = sorted(self.models.items(), key=lambda x: x[1]["score"])
                tag = models_sorted[model_i - 1][0]
            elif kind == "ntop":
                model_i = int(model_i)
                if models_sorted is None:
                    try:
                        models_sorted = sorted(self.models.items(), key=lambda x: x[1]["rmsd_native"])
                    except KeyError:
                        raise SimulationException("get_models: No native rmsd information stored")
                tag = models_sorted[model_i - 1][0]
            else:
                raise SimulationException("get_models: Invalid 'kind' parameter: '%s'" % kind)
            if tag not in self.models:
                raise SimulationException("get_models: Invalid tag: '%s'" % tag)
            results.append(self.models[tag])
        return results

    def print_models(self, model_list, kind="tag"):
        """
        Print a list of models to stdout.

        :param model_list: see get_models
        :param kind: see get_models
        """
        models = self.get_models(model_list, kind)
        for model in models:
            print "Model: %s" % model["tag"]
            pprint.pprint(model)

    @staticmethod
    def get_weighted_model_score(model, score_weights):
        """
        Calculate a model score based on different weights for the individual Rosetta scores.

        :param model: model to reweight
        :param score_weights: dict of rosetta score names and their weights. ``default`` to set the default weight for all scores. Example: ``{'default':0, 'atom_pair_constraint': 1}`` to only use atom pair constraints.
        :return: weighted score
        """
        score = 0
        for i in model["rosetta_scores"].keys():
            # "description" is a text column, not a score
            if i == "description":
                continue
            # weight lookup: explicit weight, then "default", then 1
            score += (score_weights[i] if i in score_weights else (score_weights["default"] if "default" in score_weights else 1)) * model["rosetta_scores"][i]
        return score
[docs]class RNAPrediction(object):
"""
Base class used for prediction simulation
"""
CONFIG_FILE = ".config"
[docs] def load_config(self):
"""Load simulation configuration of current directory"""
if os.path.exists(RNAPrediction.CONFIG_FILE):
c = open(RNAPrediction.CONFIG_FILE)
self.config = pickle.load(c)
c.close()
[docs] def save_config(self):
"""Save simulation configuration of current directorx"""
c = open(RNAPrediction.CONFIG_FILE, "w")
pickle.dump(self.config, c)
c.close()
    def print_config(self):
        """Print the simulation configuration to stdout."""
        print "Simulation configuration:"
        print " path: %s" % (abspath(os.getcwd()))
        if self.config:
            # iteritems: Python 2 dict iterator; bulky entries are abbreviated
            for key, value in sorted(self.config.iteritems()):
                if key == "motif_res_maps" or key == "motif_stem_sets" or key == "motifs" or key == "stems" or key == "evaluate":
                    print " %s: ..." % key
                else:
                    print " %s: %s" % (key, value)
        else:
            print " No configuration found."
# TODO: support different data types?
[docs] def modify_config(self, key, value):
"""
Modify configuration entry.
:param key: setting to modify
:param value: new value ("-" to store None)
"""
if key in self.config:
self.config[key] = None if value == "-" else value
else:
raise SimulationException("No such config entry: %s" % key)
[docs] def check_config(self):
"""Check if current directory contains a valid configuraion."""
if not self.config:
raise SimulationException("No config file found. Please run 'prepare' first!")
[docs] def __init__(self, sysconfig):
"""Create a prediction simulation for the current directory and try to load an existing configuration."""
self.config = {}
self.sysconfig = sysconfig
self.load_config()
[docs] def execute_command(self, command, add_suffix=None, dry_run=False, print_commands=True, stdin=None, quiet=False):
"""
Execute an external command.
:param command: list of external command and parameters
:param add_suffix: type of suffix to first entry in command. currently only ``None`` or ``rosetta`` are supported
:param dry_run: don't actually execute anything
:param print_commands: print commandline when running
:param stdin: optional text to use as standard input
:param quiet: hide output of external program
"""
self.execute_commands([Command(command, add_suffix, dry_run, print_commands, stdin, quiet)])
    def execute_command_and_capture(self, command, add_suffix=None, dry_run=False, print_commands=True, stdin=None, quiet=False):
        """
        Execute an external command while capturing output.

        Note: this is a generator, so the command only makes progress while
        the caller consumes the yielded lines.

        :param command: list of external command and parameters
        :param add_suffix: type of suffix to first entry in command. currently only ``None`` or ``rosetta`` are supported
        :param dry_run: don't actually execute anything
        :param print_commands: print commandline when running
        :param stdin: optional text to use as standard input
        :param quiet: hide output of external program
        :return: generator over lines of standard output
        :raise SimulationException: if the command cannot be started or exits non-zero
        """
        # NOTE(review): dry_run is accepted but not honored in this method —
        # confirm whether that is intended.
        c = Command(command, add_suffix, dry_run, print_commands, stdin, quiet)
        command = c.get_full_command(self.sysconfig)
        if print_commands:
            print " ".join(command)
        try:
            # stderr is merged into stdout so the generator yields both streams
            p = subprocess.Popen(command, stdin=(subprocess.PIPE if c.stdin is not None else None), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            if c.stdin is not None:
                p.stdin.write(c.stdin)
        except OSError, e:
            raise SimulationException("Failed to execute command: %s, Reason: %s" % (" ".join(c.command), e))
        for line in p.stdout:
            if not quiet:
                # mirror the captured line to our own stdout
                sys.stdout.write(line)
            yield line
        p.wait()
        if p.returncode != 0:
            raise SimulationException("Non-zero return code from executed command: %s" % " ".join(c.command))
[docs] def execute_commands(self, commands, threads=1):
"""
Execute a list of commands parallelly.
:param commands: list of Commands
:param threads: number of parallel invocations
"""
processes = []
stdout = None
try:
while True:
while commands and len(processes) < threads:
c = commands.pop(0)
command = c.get_full_command(self.sysconfig)
if c.print_commands:
print " ".join(command)
if not c.dry_run:
stdout = None
if c.quiet and stdout is None:
stdout = open(os.devnull, "w")
stderr = stdout
try:
p = subprocess.Popen(command, stdin=(subprocess.PIPE if c.stdin is not None else None), stdout=stdout, stderr=stderr)
# store original command in process
p.command = command
processes.append(p)
if c.stdin is not None:
p.stdin.write(c.stdin)
except OSError, e:
raise SimulationException("Failed to execute command: %s, Reason: %s" % (" ".join(c.command), e))
for p in processes:
if p.poll() is not None:
processes.remove(p)
if p.returncode != 0:
raise SimulationException("Non-zero return code from executed command: %s" % " ".join(p.command))
if not processes and not commands:
break
else:
time.sleep(0.1)
except:
# kill all other processes
for p in processes:
p.kill()
raise
finally:
if stdout is not None:
stdout.close()
@staticmethod
def _make_tag_with_dashes(int_vector):
"""
Transform a list of values into a list of strings while using "n-m" for adjacent values.
Example: Turns ``[1, 2, 3, 5, 7, 8]`` into ``['1-3', '5', '7-8']``.
:param int_vector: list of values
:return: list of strings
"""
tag = []
start_res = int_vector[0]
for i in range(1, len(int_vector) + 1):
if i == len(int_vector) or int_vector[i] != int_vector[i - 1] + 1:
stop_res = int_vector[i - 1]
if stop_res > start_res:
tag += ["%d-%d" % (start_res, stop_res)]
else:
tag += ["%d" % stop_res]
if i < len(int_vector):
start_res = int_vector[i]
return tag
# noinspection PyMethodMayBeStatic
[docs] def get_csts(self):
"""
Returns a list of all constraints.
:return: list of all constraints names
"""
# loop over all constraint sets
# that is either files in the "constraints" directory, or directories under "predictions", and always "none"
cst_names = ["none"]
cst_names += [splitext(basename(cst_file))[0] for cst_file in glob.glob("constraints/*.cst")]
cst_names += [basename(cst_dir) for cst_dir in glob.glob("predictions/*")]
return sorted(set(cst_names), key=natural_sort_key)
    def prepare(self, fasta_file="sequence.fasta", params_file="secstruct.txt", native_pdb_file=None, data_file=None, torsions_file=None, name=None):
        """
        Preparation step. Parse out stems and motifs from sequence and secondary structure information and create necessary base files.

        Reads the sequence and secondary structure, derives base pairs,
        stems and motifs, stores them in ``self.config``, and writes the
        per-stem / per-motif fasta, params and optional pdb / data files
        into the "preparation" directory.

        :param fasta_file: fasta file containing the sequence
        :param params_file: text file containing the secondary structure
        :param native_pdb_file: native pdb file if available
        :param data_file: additional data file if available
        :param torsions_file: additional torsions file if available
        :param name: optional name for this set of predictions (defaults to the working directory name)
        :raise SimulationException: on missing input files or unrealizable secondary structure
        """
        if name is None:
            name = os.path.basename(os.path.abspath(os.getcwd()))
        # all given input files must exist
        for f in [fasta_file, params_file, native_pdb_file, data_file, torsions_file]:
            if f is not None:
                check_file_existence(f)
        utils.mkdir_p("constraints")
        utils.mkdir_p("preparation")
        utils.mkdir_p("dca")
        utils.mkdir_p("predictions")
        self.config["fasta_file"] = fasta_file
        self.config["params_file"] = params_file
        self.config["native_pdb_file"] = native_pdb_file
        self.config["data_file"] = data_file
        self.config["torsions_file"] = torsions_file
        self.config["name"] = name
        print
        print "Preparation:"
        # Read in files
        # the first non-header line of the fasta file is taken as the sequence
        for line in utils.read_file_line_by_line(fasta_file):
            if line[0] == ">":
                continue
            self.config["sequence"] = line.lower()
            break
        print self.config["sequence"]
        numres = len(self.config["sequence"])
        # Read in data information
        self.config["data_info"] = []
        if data_file is not None:
            self.config["backbone_burial_info"] = []
            for line in utils.read_file_line_by_line(data_file):
                # EXPOSE entries come in triples: residue number, atom, value
                if len(line) > 6 and line[:6] == 'EXPOSE':
                    cols = string.split(line)
                    # Python 2 integer division
                    for i in range(len(cols) / 3):
                        self.config["data_info"].append([int(cols[3 * i + 1]), cols[3 * i + 2], cols[3 * i + 3]])
                # BACKBONE_BURIAL lines list residue numbers
                if len(line) > 15 and line[:15] == 'BACKBONE_BURIAL':
                    cols = string.split(line)
                    for i in range(1, len(cols)):
                        self.config["backbone_burial_info"].append(int(cols[i]))
        # pair_map maps each paired residue (0-based) to its partner
        pair_map = {}
        all_pairs = []
        # allowed watson-crick (and wobble) partners per base
        complement = {'a': ['u'], 'u': ['a', 'g'], 'c': ['g'], 'g': ['c', 'u']}
        # Parse out stems
        basepair_mismatch = []
        cutpoints_original = []
        for line in utils.read_file_line_by_line(params_file):
            if line[:4] == 'STEM':
                cols = string.split(line)
                for i in range(len(cols)):
                    if cols[i] == 'PAIR':
                        # Offset to get to python numbering (starts with zero)
                        res1 = int(cols[i + 1]) - 1
                        res2 = int(cols[i + 2]) - 1
                        pair_map[res1] = res2
                        pair_map[res2] = res1
                        all_pairs.append([res1, res2])
                        # check if the secondary structure watson-crick pair is allowed
                        if not self.config["sequence"][res1] in complement[self.config["sequence"][res2]]:
                            basepair_mismatch += [[res1, res2]]
            elif line.count('(') > 0:  # maybe dot/bracket notation (((...)))
                print line
                self.config["secstruc"] = line
                left_brackets = []
                for i in range(len(line)):
                    if line[i] == '(':
                        left_brackets.append(i)
                    if line[i] == ')':
                        # close the most recently opened bracket
                        res1 = left_brackets[-1]
                        res2 = i
                        del (left_brackets[-1])
                        pair_map[res1] = res2
                        pair_map[res2] = res1
                        all_pairs.append([res1, res2])
                        # check if the secondary structure watson-crick pair is allowed
                        if not self.config["sequence"][res1] in complement[self.config["sequence"][res2]]:
                            basepair_mismatch += [[res1, res2]]
                if left_brackets:
                    raise SimulationException("unbalanced paranthesis in secondary structure")
            else:
                # plain "res1 res2" pair lines
                # NOTE(review): a CUTPOINT_OPEN line would also fall into this
                # branch and fail the int() conversion below — confirm the
                # expected input format before relying on CUTPOINT_OPEN here.
                cols = string.split(line)
                res1 = int(cols[0]) - 1
                res2 = int(cols[1]) - 1
                pair_map[res1] = res2
                pair_map[res2] = res1
                all_pairs.append([res1, res2])
                # check if the secondary structure watson-crick pair is allowed
                if not self.config["sequence"][res1] in complement[self.config["sequence"][res2]]:
                    basepair_mismatch += [[res1, res2]]
            if line[:13] == 'CUTPOINT_OPEN':
                cutpoints_original = map(lambda x: int(x), string.split(line[14:]))
        # print all helix pairs that don't fit the allowed complements
        if basepair_mismatch:
            # one-line overview with "^" markers under the offending residues
            basepair_mismatch_str = [" "] * len(self.config["sequence"])
            for r in basepair_mismatch:
                basepair_mismatch_str[r[0]] = "^"
                basepair_mismatch_str[r[1]] = "^"
            print "".join(basepair_mismatch_str)
            # more verbose:
            for r in basepair_mismatch:
                res1 = r[0]
                res2 = r[1]
                resstr1 = str(res1 + 1) + self.config["sequence"][res1]
                resstr2 = str(res2 + 1) + self.config["sequence"][res2]
                print " " * res1 + "^" + " " * (res2 - res1 - 1) + "^"
                print " " * res1 + resstr1 + " " * (res2 - res1 - len(resstr1)) + resstr2
            raise SimulationException("some helix pairs in secondary structure cannot be realized (see above)")
        # print pair_map
        # Parse out stems
        already_in_stem = {}
        for i in range(numres):
            already_in_stem[i] = 0
        self.config["stems"] = []
        for i in range(numres):
            if i in pair_map and not already_in_stem[i]:  # In a base pair
                k = i
                stem_res = [[k, pair_map[k]]]
                already_in_stem[k] = 1
                already_in_stem[pair_map[k]] = 1
                # Can we extend in one direction?
                while (k + 1) in pair_map and pair_map[k + 1] == pair_map[k] - 1 and not already_in_stem[k + 1]:
                    k += 1
                    stem_res.append([k, pair_map[k]])
                    already_in_stem[k] = 1
                    already_in_stem[pair_map[k]] = 1
                # Do not allow single WC base pairs.
                if len(stem_res) < 2:
                    print 'All stems must have length > 1 bp '
                    print stem_res
                    exit()
                self.config["stems"].append(stem_res)
        # Parse out motifs
        already_in_motif = {}
        for i in range(numres):
            already_in_motif[i] = 0
        motif_cutpoints = []
        self.config["motif_res_maps"] = []
        self.config["motif_stem_sets"] = []
        self.config["motifs"] = []
        for i in range(numres):
            if not already_in_motif[i] and (not already_in_stem[i] or (i > 0
                                                                      and already_in_stem[i - 1]
                                                                      and pair_map[i] + 1 != pair_map[i - 1])):
                motif_res = []
                motif_stem_set = []
                cutpoints = []
                if i > 1:
                    # back up to beginning of stem.
                    k = i - 1
                    # first base pair.
                    motif_stem = [[k, pair_map[k]]]
                    motif_res.append(k)
                    motif_res.append(pair_map[k])
                    k -= 1
                    while k >= 0 and already_in_stem[k] and \
                            (pair_map[k] - 1 == pair_map[k + 1]):
                        motif_stem.append([k, pair_map[k]])
                        motif_res.append(k)
                        motif_res.append(pair_map[k])
                        k -= 1
                    motif_stem_set.append(motif_stem)
                    k += 1
                    cutpoints.append(pair_map[k])
                # print 'AFTER FIRST HELIX: ', motif_res
                k = i
                while k not in motif_res and k < numres:
                    # Move forward to next stem:
                    while k < numres and not already_in_stem[k]:
                        if already_in_motif[k]:
                            print 'Hey cant deal with pseudoknots!'
                            exit()
                        motif_res.append(k)
                        k += 1
                    stem_start = k
                    if k >= numres:
                        cutpoints.append(k - 1)
                        break
                    if k in motif_res:
                        break
                    motif_stem = [[k, pair_map[k]]]
                    motif_res.append(k)
                    motif_res.append(pair_map[k])
                    k += 1
                    while k < numres and already_in_stem[k] and pair_map[k - 1] == pair_map[k] + 1 and k not in motif_res:
                        motif_stem.append([k, pair_map[k]])
                        motif_res.append(k)
                        motif_res.append(pair_map[k])
                        k += 1
                    motif_stem_set.append(motif_stem)
                    cutpoints.append(k - 1)
                    # Next non-helical part..
                    k = pair_map[stem_start] + 1
                    # print 'AFTER NEXT HELIX: ', motif_res
                motif_res.sort()
                motif_res_map = {}
                for k in range(len(motif_res)):
                    motif_res_map[motif_res[k]] = k
                    already_in_motif[motif_res[k]] = 1
                self.config["motifs"].append(motif_res)
                self.config["motif_stem_sets"].append(motif_stem_set)
                motif_cutpoints.append(cutpoints)
                self.config["motif_res_maps"].append(motif_res_map)
                # print 'CUTPOINTS ', cutpoints
        # Output stem files
        for i in range(len(self.config["stems"])):
            # Fasta
            tag = 'preparation/stem%d.fasta' % (i + 1)
            fid = open(tag, 'w')
            fid.write('>' + tag + '\n')
            stem_res = self.config["stems"][i]
            stem_length = len(stem_res)
            # first strand 5'->3', then the complementary strand
            for k in range(stem_length):
                fid.write(self.config["sequence"][stem_res[k][0]])
                # print stem_res[k][0]+1,
            for k in range(stem_length):
                fid.write(self.config["sequence"][stem_res[stem_length - k - 1][1]])
                # print stem_res[stem_length-k-1][1]+1,
            # print
            fid.write('\n')
            fid.close()
            print 'Created: ', tag
            # pdb_file
            if native_pdb_file is not None:
                command = ["pdbslice.py",
                           native_pdb_file,
                           "-segment",
                           "%d" % (stem_res[0][0] + 1),
                           "%d" % (stem_res[-1][0] + 1),
                           "%d" % (stem_res[-1][-1] + 1),
                           "%d" % (stem_res[0][-1] + 1),
                           "preparation/stem%d_" % (i + 1)]
                self.execute_command(command)
                native_pdb_file_subset = 'preparation/stem%d_%s' % (i + 1, native_pdb_file)
                print 'Created: ', native_pdb_file_subset
        # Output motif jobs
        for i in range(len(self.config["motifs"])):
            # Fasta
            motif_fasta_file = 'preparation/motif%d.fasta' % (i + 1)
            fid = open(motif_fasta_file, 'w')
            fid.write('>' + motif_fasta_file + '\n')
            motif_res = self.config["motifs"][i]
            motif_length = len(motif_res)
            for k in range(motif_length):
                fid.write(self.config["sequence"][motif_res[k]])
            fid.write('\n')
            fid.close()
            print 'Created: ', motif_fasta_file
            # params file
            motif_stem_set = self.config["motif_stem_sets"][i]
            motif_res_map = self.config["motif_res_maps"][i]
            motif_cutpoint = motif_cutpoints[i]
            motif_params_file = 'preparation/motif%d.params' % (i + 1)
            fid = open(motif_params_file, 'w')
            # residue numbers are remapped to the motif-local numbering
            for k in range(len(motif_stem_set)):
                motif_stem = motif_stem_set[k]
                fid.write('STEM ')
                for n in range(len(motif_stem)):
                    fid.write(' PAIR %d %d W W A' %
                              (motif_res_map[motif_stem[n][0]] + 1,
                               motif_res_map[motif_stem[n][1]] + 1))
                fid.write('\n')
                fid.write('OBLIGATE PAIR %d %d W W A \n\n' %
                          (motif_res_map[motif_stem[-1][0]] + 1,
                           motif_res_map[motif_stem[-1][1]] + 1))
            motif_cutpoint.sort()
            # print motif_res
            # print motif_cutpoint
            if len(motif_cutpoint) > 1:
                fid.write('CUTPOINT_OPEN ')
                for k in range(len(motif_cutpoint)):
                    if motif_res_map[motif_cutpoint[k]] < (len(motif_res) - 1):
                        fid.write(' %d' % (motif_res_map[motif_cutpoint[k]] + 1))
            fid.write('\n')
            fid.close()
            print 'Created: ', motif_params_file
            # pdb_file
            if native_pdb_file is not None:
                command = ["pdbslice.py",
                           native_pdb_file,
                           "-subset"]
                for k in range(motif_length):
                    command += ["%d" % (motif_res[k] + 1)]
                command += ["preparation/motif%d_" % (i + 1)]
                self.execute_command(command)
                native_pdb_file_subset = 'preparation/motif%d_%s' % (i + 1, native_pdb_file)
                print 'Created: ', native_pdb_file_subset
            if data_file is not None:
                motif_data_file = 'preparation/motif%d.data' % (i + 1)
                fid_data = open(motif_data_file, 'w')
                fid_data.write('EXPOSE')
                for data in self.config["data_info"]:
                    if data[0] - 1 in motif_res_map.keys():
                        fid_data.write(' %d %s %s ' % (motif_res_map[data[0] - 1] + 1, data[1], data[2]))
                fid_data.write('\n')
                if len(self.config["backbone_burial_info"]) > 0:
                    fid_data.write('BACKBONE_BURIAL ')
                    for k in self.config["backbone_burial_info"]:
                        if k - 1 in motif_res_map.keys():
                            fid_data.write(' %d' % (motif_res_map[k - 1] + 1))
                    fid_data.write('\n')
                fid_data.close()
                print 'Created: ', motif_data_file
        if len(cutpoints_original) > 0:
            # NOTE(review): 'fid' still refers to the last motif params file,
            # which is already closed at this point (or unbound when there are
            # no motifs) — looks like a latent bug; confirm before relying on
            # CUTPOINT_OPEN passthrough.
            # noinspection PyUnboundLocalVariable
            fid.write('CUTPOINT_OPEN ')
            for cutpoint in cutpoints_original:
                fid.write(' %d' % (cutpoint + 1))
            fid.write('\n')
        # collect one closed cutpoint per motif for the assembly params file
        cutpoints = []
        for i in range(len(self.config["motifs"])):
            motif_stem_set = self.config["motif_stem_sets"][i]
            motif_stem = motif_stem_set[0]
            possible_cutpoints = [motif_stem[0][0], motif_stem[1][1]]
            possible_cutpoints.sort()
            # print possible_cutpoints
            if possible_cutpoints[0] not in cutpoints:
                cutpoints.append(possible_cutpoints[0])
        params_file = "preparation/sequence.params"
        fid = open(params_file, 'w')
        if len(cutpoints) > 0:
            fid.write('CUTPOINT_CLOSED ')
            # cutpoints.sort()
            for cutpoint in cutpoints:
                fid.write(' %d' % (cutpoint + 1))
            fid.write('\n')
        # for cutpoint in cutpoints:
        #    fid.write( 'OBLIGATE PAIR %d %d W W A\n' % (cutpoint+1, pair_map[cutpoint]+1) )
        for i in range(len(self.config["stems"])):
            stem_res = self.config["stems"][i]
            fid.write('STEM ')
            for k in range(len(stem_res)):
                fid.write(' PAIR %d %d W W A ' %
                          (stem_res[k][0] + 1, stem_res[k][1] + 1))
            fid.write('\n')
            fid.write('OBLIGATE PAIR %d %d W W A \n\n' %
                      (stem_res[-1][0] + 1,
                       stem_res[-1][1] + 1))
        fid.close()
        print 'Created: ', params_file
        # default cutpoint constraints
        assemble_cst_file = "preparation/cutpoints.cst"
        fid = open(assemble_cst_file, 'w')
        fid.write('[ atompairs ]\n')
        for cutpoint in cutpoints:
            fid.write("O3' %d P %d HARMONIC 1.619 2.0\n" % (cutpoint + 1, cutpoint + 2))
        fid.close()
        print 'Created: ', assemble_cst_file
[docs] def create_helices(self, dry_run=False, threads=1):
"""
Helix creation step. Create one ideal helix model for each helix.
:param dry_run: don't actually run any external command
:param threads: number of threads to use
"""
self.check_config()
commands = list()
delete_glob("preparation/stem*.out")
for i in range(len(self.config["stems"])):
command = ["rna_helix",
"-fasta", "preparation/stem%d.fasta" % (i + 1),
"-out:file:silent", "preparation/stem%d.out" % (i + 1)]
commands.append(Command(command, add_suffix="rosetta", dry_run=dry_run))
self.execute_commands(commands, threads=threads)
# rna_helix dumps <sequence>.pdb files in the working directory.
# These can be useful for us, so move them to the preparation directory with a proper stemX.pdb filename.
# TODO: What happens if there are multiple helices with the same sequence? In that case we need to move them
# out of the way before running the next command.
# Just disable multithreading here (not really needed with stem generation anyways)?
for i in range(len(self.config["stems"])):
sequence = [self.config["sequence"][j[0]] for j in self.config["stems"][i]]
sequence += [self.config["sequence"][j[1]] for j in reversed(self.config["stems"][i])]
shutil.move("%s.pdb" % ("".join(sequence)), "preparation/stem%d.pdb" % (i + 1))
@staticmethod
[docs] def parse_cst_name_and_filename(constraints):
"""
Find and clean up constraints by name or filename
:param constraints: constraints name or filename
:return: tuple of constraints name and filename
"""
if constraints is None or constraints == "none":
cst_name = "none"
cst_file = None
elif os.path.isfile(constraints):
cst_file = constraints
cst_name = splitext(basename(cst_file))[0]
else:
constraints = basename(constraints)
cst_name, cst_ext = splitext(constraints)
if cst_ext != ".cst":
cst_name = constraints
cst_file = "constraints/%s.cst" % cst_name
check_file_existence(cst_file)
return cst_name, cst_file
@staticmethod
def parse_cst_file(constraints_file):
    """
    Parse .cst file as a list of constraints

    Data lines are expected to look like
    "<atom1> <res1> <atom2> <res2> <function...>".

    :param constraints_file: path to a .cst file
    :return: list of constraints, each [atom_name1, res1, atom_name2, res2, function_columns]
    """
    cst_info = []
    for line in utils.read_file_line_by_line(constraints_file):
        # skip section headers such as "[ atompairs ]" and short junk lines
        if len(line) > 6 and line[0] != '[':
            # idiom fix: str.split() instead of the legacy string.split()
            cols = line.split()
            atom_name1 = cols[0]
            res1 = int(cols[1])
            atom_name2 = cols[2]
            res2 = int(cols[3])
            cst_info.append([atom_name1, res1, atom_name2, res2, cols[4:]])
    return cst_info
[docs] def prepare_cst(self, constraints=None, motifs_override=None):
"""
Constraints preparation step. Prepare constraints files for motif generation and assembly.
:param constraints: constraints selection
:param motifs_override: optional name of a different set of constraints to use as motifs.
"""
self.check_config()
cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
print "Constraints preparation:"
print " constraints: %s" % cst_name
dir_prediction = "predictions/%s" % cst_name
dir_assembly = "predictions/%s/assembly" % cst_name
dir_motifs = "predictions/%s/motifs" % cst_name
if motifs_override:
motifcst_name, motifcst_file = self.parse_cst_name_and_filename(motifs_override)
print " motif constraints: %s" % motifcst_name
if motifcst_name == cst_name:
raise SimulationException("Motif override cst name can't be the same as the cst name!")
utils.mkdir_p("predictions/%s" % cst_name)
utils.mkdir_p(dir_assembly)
if motifs_override:
# TODO: just make all that stuff a cst config file
# create MOTIF_OVERRIDE file and store the cst name of the motifs
file_motif_override = "%s/MOTIF_OVERRIDE" % dir_prediction
with open(file_motif_override, "w") as f:
f.write(motifcst_name)
# assembly constraints: start with default harmonic cutpoints
assembly_cst = "%s/assembly.cst" % dir_assembly
shutil.copy("preparation/cutpoints.cst", assembly_cst)
print "Created: %s" % assembly_cst
# if cst is "none" we are done now
if cst_file is None:
return
# load constraints information
cst_info = self.parse_cst_file(cst_file)
# probably wrong, commented out for now!
# cst_info = fix_atom_names_in_cst(cst_info, self.config["sequence"])
# add tertiary constraints
with open(assembly_cst, "a") as ft:
for cst in cst_info:
ft.write('%s %d %s %d %s \n' % (cst[0], cst[1], cst[2], cst[3], " ".join(map(str, cst[4]))))
print "Added tertiary constraints: %s" % assembly_cst
# extract relevant constraints for motif generation from global cst file if necessary
if not motifs_override:
utils.mkdir_p(dir_motifs)
for i in range(len(self.config["motifs"])):
motif_res_map = self.config["motif_res_maps"][i]
motif_cst_file = '%s/motif%d.cst' % (dir_motifs, i + 1)
fid_cst = open(motif_cst_file, 'w')
fid_cst.write('[ atompairs ]\n')
for cst in cst_info:
if cst[1] - 1 in motif_res_map.keys() and cst[3] - 1 in motif_res_map.keys():
fid_cst.write('%s %d %s %d %s\n' % (cst[0], motif_res_map[cst[1] - 1] + 1, cst[2], motif_res_map[cst[3] - 1] + 1, " ".join(map(str, cst[4]))))
fid_cst.close()
print 'Created: ', motif_cst_file
def get_status(self, constraints=None, include_evaluation_data=False, include_motif_model_count=False, include_assembly_model_count=False):
    """
    Returns a dict containing status information for a cst.

    :param constraints: constraints selection
    :param include_motif_model_count: set to True to include model count (longer processing time)
    :param include_assembly_model_count: set to True to include model count (longer processing time)
    :param include_evaluation_data: set to True to include evaluation model count and native RMSD values (best of first 1,5,10 clusters) if available (longer processing time)
    :return: status dict
    """
    self.check_config()
    cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
    # directory layout for this constraints set
    dir_prediction = "predictions/%s" % cst_name
    dir_assembly = "predictions/%s/assembly" % cst_name
    dir_motifs = "predictions/%s/motifs" % cst_name
    dir_output = "predictions/%s/output" % cst_name
    file_motifoverride = dir_prediction + "/MOTIF_OVERRIDE"
    file_evaldata = "%s/evaldata.dat" % dir_output
    # the presence of characteristic files marks each pipeline step as done
    res = {"step_preparation_done": os.path.isfile(dir_assembly + "/assembly.cst"),
           "step_motifs_done": bool(glob.glob(dir_motifs + "/*.out")),
           "step_motifs_overridden": None,
           "step_assembly_done": bool(glob.glob(dir_assembly + "/*.out")),
           "step_evaluation_done": os.path.isfile(file_evaldata)}
    if os.path.isfile(file_motifoverride):
        # MOTIF_OVERRIDE stores the name of the cst whose motifs are reused
        with open(file_motifoverride, "r") as f:
            res["step_motifs_overridden"] = f.read().strip()
    # evaluation data
    if include_evaluation_data:
        res["evaluation_rmsd_native"] = None
        res["evaluation_model_count"] = None
        try:
            eval_data = EvalData.load_from_cst(cst_name)
            res["evaluation_model_count"] = eval_data.get_model_count()
            # native rmsd values are only present when evaluation ran with a native pdb
            if "rmsd_native" in next(eval_data.models.itervalues()):
                res["evaluation_rmsd_native"] = [m["rmsd_native"] for m in eval_data.get_models(["1/1", "1/5", "1/10"], kind="cluster_ntop")]
        except SimulationException:
            # no evaluation data available yet; keep the None defaults
            pass
    # motif model count (opens every motif silent file -> slow)
    if include_motif_model_count:
        res["model_count_motifs"] = {i: sum(map(get_model_count_in_silent_file, glob.glob("%s/motif%d*.out" % (dir_motifs, i)))) for i in xrange(1, len(self.config["motifs"]) + 1)}
    # assembly model count (opens every assembly silent file -> slow)
    if include_assembly_model_count:
        res["model_count_assembly"] = sum(map(get_model_count_in_silent_file, (glob.glob("%s/assembly_*.out" % dir_assembly))))
    return res
def create_motifs(self, nstruct=50000, cycles=20000, dry_run=False, seed=None, use_native_information=False, threads=1, constraints=None, motif_subset=None):
    """
    Motif generation step. Generate models for each motif.

    :param nstruct: number of models to create for each motif
    :param cycles: number of monte-carlo cycles per model
    :param dry_run: don't actually run any external command
    :param seed: optionally override random seed
    :param use_native_information: use native pdb file to calculate rmsd for each model
    :param threads: number of threads to use
    :param constraints: constraints selection
    :param motif_subset: specify a list of motifs to generate instead of all, counting starts at 1 (i.e. [1, 3, 4])
    """
    self.check_config()
    cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
    print "Motif creation configuration:"
    print " constraints: %s" % cst_name
    print " cycles: %s" % cycles
    print " nstruct: %s" % nstruct
    print " dry_run: %s" % dry_run
    print " random_seed: %s" % seed
    print " threads: %s" % threads
    if motif_subset is not None:
        print " subset: %s" % motif_subset
    dir_motifs = "predictions/%s/motifs" % cst_name
    utils.mkdir_p("predictions/%s" % cst_name)
    utils.mkdir_p(dir_motifs)
    n_motifs = len(self.config["motifs"])
    # list of motifs that we should generate models for
    # when no subset was given create models for all motifs
    if motif_subset is None:
        motif_subset = [i + 1 for i in range(n_motifs)]
    else:
        # remove duplicates if any
        motif_subset = list(set(motif_subset))
        # make sure that all the numbers actually exist
        if max(motif_subset) > n_motifs:
            raise SimulationException("Invalid motif subset. Structure only contains %d motifs." % n_motifs)
    # check if motif constraints were created correctly
    if cst_file is not None:
        for i in motif_subset:
            check_file_existence("%s/motif%d.cst" % (dir_motifs, i), "Motif cst files not found. Please run the 'prepare-cst' step!")
    # merge all motifs and check what we have so far
    # (the return value is used as the number of models already completed)
    completed = {}
    for i in motif_subset:
        completed[i] = merge_silent_files("%s/motif%d.out" % (dir_motifs, i), "%s/motif%d_*.out" % (dir_motifs, i))
    commands = list()
    for i in motif_subset:
        # split the rest of the work in multiple jobs
        structs_missing = nstruct - completed[i]
        print "motif %d: completed: %d missing: %d" % (i, completed[i], structs_missing)
        if structs_missing <= 0:
            continue
        # distribute the missing models evenly over the threads
        # (python 2 integer division; the remainder goes to the first threads)
        structs_threads = [0 for _ in range(threads)]
        for j in range(threads):
            structs_threads[j] = structs_missing / threads
        for j in range(structs_missing % threads):
            structs_threads[j] += 1
        # base rosetta command for this motif; per-thread options are appended below
        command = ["rna_denovo",
                   "-fasta", "preparation/motif%d.fasta" % i,
                   "-params_file", "preparation/motif%d.params" % i,
                   "-cycles", "%d" % cycles,
                   "-mute", "all",
                   "-close_loops",
                   "-close_loops_after_each_move",
                   "-minimize_rna"]
        if cst_file is not None:
            command += ["-cst_file", '%s/motif%d.cst' % (dir_motifs, i)]
        if self.config["native_pdb_file"] is not None and use_native_information:
            command += ["-native", "preparation/motif%d_%s" % (i, self.config["native_pdb_file"])]
        if self.config["data_file"] is not None:
            command += ["-data_file", "preparation/motif%d.data" % i]
        if self.config["torsions_file"] is not None:
            command += ["-vall_torsions", "preparation/motif%d.torsions" % i]
        which_stems = []
        stem_chunk_res = []
        motif_stem_set = self.config["motif_stem_sets"][i - 1]
        motif_res_map = self.config["motif_res_maps"][i - 1]
        for k in range(len(motif_stem_set)):
            motif_stem = motif_stem_set[k]
            # need to find in stems
            for n in range(len(self.config["stems"])):
                stem = self.config["stems"][n]
                found_match = 0
                for q in range(len(stem)):
                    if motif_stem[0][0] in stem[q]:
                        found_match = 1
                        break
                if found_match:
                    which_stems.append(n)
                    break
            # collect the matched stem's residues in motif-local (1-based)
            # numbering: first strand forward, then the second strand reversed
            for q in range(len(stem)):
                stem_chunk_res.append(motif_res_map[stem[q][0]] + 1)
            for q in range(len(stem)):
                stem_chunk_res.append(motif_res_map[stem[len(stem) - q - 1][1]] + 1)
        command += ["-in:file:silent_struct_type", "rna",
                    "-in:file:silent"]
        for n in which_stems:
            command += ["preparation/stem%d.out" % (n + 1)]
        command += ["-in:file:input_res"]
        command += self._make_tag_with_dashes(stem_chunk_res)
        for j in range(threads):
            if structs_threads[j] == 0:
                continue
            command_full = command + ["-out:file:silent", "%s/motif%d_%d.out" % (dir_motifs, i, j + 1),
                                      "-nstruct", "%d" % (structs_threads[j])]
            if seed is not None:
                # constant seed requested: use it and bump it for the next job
                command_full += ["-constant_seed", "-jran", "%d" % seed]
                seed += 1
            commands.append(Command(command_full, add_suffix="rosetta", dry_run=dry_run))
    self.execute_commands(commands, threads=threads)
    # merge motifs
    for i in motif_subset:
        merge_silent_files("%s/motif%d.out" % (dir_motifs, i), "%s/motif%d_*.out" % (dir_motifs, i))
[docs] def assemble(self, nstruct=50000, cycles=20000, constraints=None, dry_run=False, seed=None, use_native_information=False, threads=1):
"""
Assembly step. Assemble helices and motifs into complete models.
:param nstruct: number of models to create
:param cycles: number of monte-carlo cycles per model
:param constraints: constraints selection
:param dry_run: don't actually run any external command
:param seed: optionally override random seed
:param use_native_information: use native pdb file to calculate rmsd for each model
:param threads: number of threads to use
"""
self.check_config()
cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
print "Assembly configuration:"
print " constraints: %s" % cst_name
print " cycles: %s" % cycles
print " nstruct: %s" % nstruct
print " dry_run: %s" % dry_run
print " random_seed: %s" % seed
# Check if a different set of motifs should be used.
file_motif_override = "predictions/%s/MOTIF_OVERRIDE" % cst_name
if os.path.isfile(file_motif_override):
with open(file_motif_override, "r") as f:
motifcst_name = f.read().strip()
print " using motifs: %s" % motifcst_name
else:
motifcst_name = cst_name
dir_assembly = "predictions/%s/assembly" % cst_name
dir_motifs = "predictions/%s/motifs" % motifcst_name
file_assembly_cst = "%s/assembly.cst" % dir_assembly
check_file_existence(file_assembly_cst, "Preparation incomplete. Please run 'prepare-cst'.")
check_file_existence("%s/motif1.out" % dir_motifs, "Motifs not found. Please run 'create-motifs'.")
commands = list()
command = ["rna_denovo",
"-minimize_rna",
"-fasta", self.config["fasta_file"],
"-in:file:silent_struct_type", "binary_rna",
"-cycles", "%d" % cycles,
"-params_file", "preparation/sequence.params",
"-close_loops",
"-in:file:silent"]
for i in range(len(self.config["stems"])):
command += ["preparation/stem%d.out" % (i + 1)]
for i in range(len(self.config["motifs"])):
command += ["%s/motif%d.out" % (dir_motifs, i + 1)]
if cst_file is not None:
command += ["-cst_file", file_assembly_cst]
chunk_res = []
for n in range(len(self.config["stems"])):
stem = self.config["stems"][n]
for q in range(len(stem)):
chunk_res.append(stem[q][0] + 1)
for q in range(len(stem)):
chunk_res.append(stem[len(stem) - q - 1][1] + 1)
for n in range(len(self.config["motifs"])):
motif_res = self.config["motifs"][n]
for m in motif_res:
chunk_res.append(m + 1)
command += ["-in:file:input_res"]
command += self._make_tag_with_dashes(chunk_res)
if self.config["native_pdb_file"] is not None and use_native_information:
command += ["-native", self.config["native_pdb_file"]]
if self.config["torsions_file"] is not None:
command += ["-vall_torsions", self.config["torsions_file"]]
if self.config["data_file"] is not None:
command += ["-data_file", self.config["data_file"]]
# distribute nstruct among threads
structs_threads = [0 for _ in range(threads)]
for j in range(threads):
structs_threads[j] = nstruct / threads
for j in range(nstruct % threads):
structs_threads[j] += 1
for j in range(threads):
if structs_threads[j] == 0:
continue
# In case no seed is specified, we do what rosetta does to get a random number, and seed it with that.
# This way we know the number beforehand and can choose an appropriate output filename.
# In case a seed was specified, we increment it here with the thread number.
if seed is None:
seed_used = struct.unpack("=i", os.urandom(4))[0]
else:
seed_used = seed + j
command_full = command + ["-out:file:silent", "%s/assembly_%d.out" % (dir_assembly, seed_used),
"-constant_seed", "-jran", "%d" % seed_used, "--nstruct", "%d" % structs_threads[j]]
commands.append(Command(command_full, add_suffix="rosetta", dry_run=dry_run))
self.execute_commands(commands, threads=threads)
# TODO: set the cutoff value back to 4.0? It was set to 4.1 because the old bash script used integer comparison and even 4.09 was treated as 4.0
def evaluate(self, constraints=None, cluster_limit=10, cluster_cutoff=4.1, full_evaluation=False):
    """
    Evaluation step. Extract model information, cluster models and calculate rmsd values.

    :param constraints: constraints selection
    :param cluster_limit: maximum number of clusters to produce
    :param cluster_cutoff: rmsd distance in A at which a new cluster is created
    :param full_evaluation: discard existing evaluation data, re-extract model information and re-calculate rmsd values.
    """
    self.check_config()
    cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
    print "Evaluation configuration:"
    print " constraints: %s" % cst_name
    print " cluster_limit: %s" % cluster_limit
    print " cluster_cutoff: %s" % cluster_cutoff
    # directory layout for this constraints set
    dir_assembly = "predictions/%s/assembly" % cst_name
    dir_output = "predictions/%s/output" % cst_name
    dir_tmp = "predictions/%s/temp" % cst_name
    file_evaldata = "%s/evaldata.dat" % dir_output
    if not os.path.isdir("predictions/%s" % cst_name):
        raise SimulationException("No prediction directory for constraint '%s' found! Maybe you should assemble first?" % cst_name)
    files_assembly = sorted(glob.glob("%s/assembly_*.out" % dir_assembly), key=natural_sort_key)
    if len(files_assembly) == 0:
        raise SimulationException("No assembly files for constraint '%s' found! Maybe you should assemble first?" % cst_name)
    if not full_evaluation:
        if not os.path.isdir(dir_output) or not os.path.isfile(file_evaldata):
            # first time, silently do a full eval
            full_evaluation = True
        else:
            try:
                eval_data = EvalData.load_from_file(file_evaldata)
            except SimulationException:
                # file broken or missing, force full evaluation
                print "Warning: %s missing or broken, running full evaluation." % file_evaldata
                full_evaluation = True
    print " full_evaluation: %s" % full_evaluation
    # cleanup
    if full_evaluation:
        delete_glob(dir_output)
        delete_glob(dir_tmp)
        # create empty evaluation data
        eval_data = EvalData()
    else:
        # partial evaluation: keep the extracted scores, only redo clustering.
        # clean out old cluster information from models
        for model in eval_data.models.itervalues():
            if "cluster" in model:
                del model["cluster"]
        eval_data.clusters = {}
        delete_glob(dir_output + os.sep + "*.pdb")
        delete_glob(dir_output + os.sep + "*.out")
    utils.mkdir_p(dir_output)
    utils.mkdir_p(dir_tmp)
    # extract info from .out files
    if full_evaluation:
        # store all the rosetta scores in a dict
        scores_headers = None
        # loop over all out files matching the constraint
        for i, f in enumerate(files_assembly):
            print "processing rosetta silent file: %s..." % f
            # read out file and store a dict of the scores and the full score line
            regex_score = re.compile("^SCORE:\s+(.*)$")
            for line in utils.read_file_line_by_line(f):
                m = regex_score.match(line)
                if m:
                    # split rosetta score entries into a list
                    scores = m.group(1).split()
                    # store the header names if not already known
                    if scores[-1] == "description":
                        if not scores_headers:
                            scores_headers = scores
                        continue
                    # create a score dict
                    scores_dict = {}
                    for s_index, s_name in enumerate(scores_headers):
                        # store float values as float
                        try:
                            scores_dict[scores_headers[s_index]] = float(scores[s_index])
                        except ValueError:
                            scores_dict[scores_headers[s_index]] = scores[s_index]
                    # name models exactly like rosetta would (that is, append _<num> when already present)
                    j = 1
                    tag = scores_dict["description"]
                    while tag in eval_data.models:
                        tag = "%s_%d" % (scores_dict["description"], j)
                        j += 1
                    eval_data.models[tag] = {"source_file": basename(f),
                                             "score": scores_dict["score"],
                                             "tag": tag,
                                             "tag_source": scores_dict["description"],
                                             "rosetta_scores": scores_dict}
    # clustering (always redone, also in a partial evaluation)
    eval_data.cluster_cutoff = cluster_cutoff
    print "clustering models..."
    filename_clusters = "%s/clusters.out" % dir_output
    sys.stdout.write(" ")
    regex_clusters = re.compile("^.*RNA_PREDICT (old|new) cluster ([0-9]+) score ([-0-9.]+) model (S_[0-9_]+)$")
    for line in self.execute_command_and_capture(["rna_cluster", "-in:file:silent"] + files_assembly + ["-cluster:radius", "%s" % cluster_cutoff, "-nstruct", str(cluster_limit), "-out:file:silent", filename_clusters], add_suffix="rosetta", quiet=True):
        m = regex_clusters.match(line)
        if m:
            eval_data.models[m.group(4)]["cluster"] = int(m.group(2))
            if m.group(1) == "new":
                # first model of a cluster becomes its primary model
                eval_data.clusters[int(m.group(2))] = {"primary_model": m.group(4)}
            print " %s cluster: %02s, model: %-10s, score: %.3f" % ("new" if m.group(1) == "new" else " ", m.group(2), m.group(4), float(m.group(3)))
    # extract cluster pdbs
    print "extracting cluster decoy pdbs..."
    owd = os.getcwd()
    try:
        # rna_extract writes into the current directory, so chdir temporarily
        os.chdir(dir_output)
        self.execute_command(["rna_extract", "-in:file:silent", basename(filename_clusters), "-in:file:silent_struct_type", "rna"], add_suffix="rosetta")
    finally:
        os.chdir(owd)
    # loop over all extracted pdb files and rename them to cluster_<n>.pdb
    for i, fe in enumerate(sorted(glob.glob("%s/S_*.pdb" % dir_output), key=natural_sort_key)):
        shutil.move(fe, "%s/cluster_%d.pdb" % (dir_output, i + 1))

    # rmsdcalc helper function: score all models against filename_comparison
    # and store the result as "rmsd_<name>" on each model
    # noinspection PyShadowingNames
    def calculate_rmsd(name, filename_comparison):
        # calculate native rmsd values for all models if native pdb available
        filename_tmp = "%s/rmsd.out" % dir_tmp
        print "caluculating rmsd values to %s for all models..." % name
        sys.stdout.write(" ")
        regex_rmsd = re.compile("^All atom rmsd over moving residues: (S_[0-9_]+) ([-0-9.]+)$")
        for line in self.execute_command_and_capture(["rna_score", "-in:file:silent"] + files_assembly + ["-in:file:native", filename_comparison, "-score:just_calc_rmsd", "-out:file:silent", filename_tmp], add_suffix="rosetta", quiet=True):
            m = regex_rmsd.match(line)
            if m:
                print m.group(1), m.group(2)
                eval_data.models[m.group(1)]["rmsd_%s" % name] = float(m.group(2))
        delete_glob(filename_tmp, print_notice=False)

    # only calculate rmsds if full_evalation is needed or wanted
    if full_evaluation:
        # calculate rmsd to native structure if native pdb available
        if self.config["native_pdb_file"] is not None:
            calculate_rmsd("native", self.config["native_pdb_file"])
        # calculate rmsd to best model
        calculate_rmsd("cluster_1", "%s/cluster_1.pdb" % dir_output)
    # save evaluation data
    eval_data.save_to_file(file_evaldata)
[docs] def evaluate_custom(self, constraints=None, dca_prediction_filename="dca/dca.txt", full_evaluation=False, threshold=7.5, radius=2, number_dca_predictions=100, threads=4):
"""
Custom scoring algorithmy by inspecting neighboring residues of dca contact pairs
:param constraints: constraints selection
:param dca_prediction_filename: dca filename
:param full_evaluation: discard existing evaluation data and extracted models, re-extract and re-calculate distance information
:param threshold: threshold in A to count a contact
:param radius: number of adjacent residues to inspect
:param number_dca_predictions: maximum number of DCA predictions to use
:param threads: number of threads to use for extraction
"""
self.check_config()
cst_name, cst_file = self.parse_cst_name_and_filename(constraints)
print "Evaluation configuration:"
print " constraints: %s" % cst_name
print " dca_file: %s" % dca_prediction_filename
print " dca_count: %s" % number_dca_predictions
print " threshold: %f" % threshold
print " radius: %s" % radius
dir_assembly = "predictions/%s/assembly" % cst_name
dir_output = "predictions/%s/output" % cst_name
dir_tmp = "predictions/%s/temp" % cst_name
file_evaldata = "%s/evaldata.dat" % dir_output
file_matrices = "%s/matrices.dat" % dir_tmp
files_assembly = sorted(glob.glob("%s/assembly_*.out" % dir_assembly), key=natural_sort_key)
try:
eval_data = EvalData.load_from_file(file_evaldata)
except SimulationException:
raise SimulationException("Normal --evaluate step needs to be run before --evaluate-custom.")
slen = len(self.config["sequence"])
utils.mkdir_p(dir_output)
utils.mkdir_p(dir_tmp)
extract = full_evaluation
build_matrices = full_evaluation
# check if we can use existing data
if not build_matrices:
# try to load existing distance matrix
print "loading existing distance matrices..."
try:
with open(file_matrices, "r") as f:
distance_matrices = pickle.load(f)
except (IOError, pickle.PickleError, AttributeError, EOFError, IndexError, KeyError):
print "error loading existing matrices. forcing rebuild..."
build_matrices = True
# check if we need to extract pdb files
if not os.path.isfile("%s/%s.pdb" % (dir_tmp, next(eval_data.models.iterkeys()))):
print "no extraced pdb files found. forcing extraction..."
extract = True
# extract all pdb files to temp dir
if extract:
delete_glob(dir_tmp)
print "extracing pdb files..."
commands = []
for f in files_assembly:
commands.append(Command(["rna_extract", "-in:file:silent", f, "-in:file:silent_struct_type", "rna", "-out:prefix", dir_tmp + os.sep + splitext(basename(f))[0] + "_"], add_suffix="rosetta"))
self.execute_commands(commands, threads=threads)
for model in eval_data.models.itervalues():
shutil.move(dir_tmp + os.sep + splitext(basename(model["source_file"]))[0] + "_" + model["tag_source"] + ".pdb", dir_tmp + os.sep + model["tag"] + ".pdb")
# build residue distance matrix for all models
if build_matrices:
delete_glob(file_matrices)
print "building matrices for %d models..." % len(eval_data.models)
distance_matrices = {}
x = 1
for model in eval_data.models.itervalues():
matrix = np.zeros([len(self.config["sequence"]), len(self.config["sequence"])])
chain = pdbtools.parse_pdb("", dir_tmp + os.sep + model["tag"] + ".pdb")[0].child_list[0]
for i in range(2, slen + 1):
for j in range(2, slen + 1):
if j == i:
continue
matrix[i - 1][j - 1] = chain[i]["P"] - chain[j]["P"]
distance_matrices[model["tag"]] = matrix
if x % 10 == 0:
sys.stdout.write("%d..." % x)
sys.stdout.flush()
x += 1
if x % 10 != 0:
print "%x..." % x
print "done."
with open(file_matrices, "w") as f:
pickle.dump(distance_matrices, f)
# calculate scores
print "calculating scores..."
dca = dcatools.parse_dca_data(dca_prediction_filename)
for model in eval_data.models.itervalues():
score_custom = 0
matrix = distance_matrices[model["tag"]]
c_count = 0
for contact in dca:
c_count += 1
if c_count > number_dca_predictions:
break
found = False
for i in range(-radius, radius + 1):
if contact.res1 + i < 1 or contact.res1 + i > slen:
continue
for j in range(-radius, radius + 1):
if contact.res2 + j < 1 or contact.res2 + j > slen:
continue
if abs(contact.res1 + i - contact.res2 - j) <= 3:
continue
if matrix[contact.res1 + i - 1][contact.res2 + j - 1] < threshold:
found = True
score_custom -= 1
break
if found:
break
print "model: %-11s, score: %8.3f, score_custom: %3d" % (model["tag"], model["score"], score_custom)
model["score_custom"] = score_custom
eval_data.save_to_file(file_evaldata)
# returns the path to the extracted pdb file -- NOTE(review): this comment does not describe get_models() below (which returns model data); presumably left over from a removed helper. TODO confirm and remove.
@staticmethod
def get_models(constraints, model_list, kind="tag"):
    """
    Returns a selection of models with specific properties

    :param constraints: constraints selection
    :param model_list: see EvalData.print_models
    :param kind: see EvalData.get_models
    """
    # load the evaluation data for the selected constraints and delegate
    eval_data = EvalData.load_from_cst(constraints)
    return eval_data.get_models(model_list=model_list, kind=kind)
@staticmethod
def print_models(constraints, model_list, kind="tag"):
    """
    Print a set of models

    :param constraints: constraints selection
    :param model_list: see EvalData.print_models
    :param kind: see EvalData.print_models
    """
    # load the evaluation data for the selected constraints and delegate
    eval_data = EvalData.load_from_cst(constraints)
    eval_data.print_models(model_list=model_list, kind=kind)
@staticmethod
def _create_constraints_output_filename(input_filename, output_filename, cst_function, number_dca_predictions=None, output_format="%n_%f"):
    """
    Create a reasonable constraints output filename based on input filename and a formatting string.
    The formatting string can include placeholders

    :param input_filename: input cst filename
    :param output_filename: fixed output filename or basename or None to automatically format using output_format
    :param cst_function: rosetta function
    :param number_dca_predictions: maximum number of DCA predictions to use
    :param output_format: formatting string containing placeholders: %f: rosetta function
                                                                     %n: basename of source file
                                                                     %d: number of dca predictions
    :return: output filename
    """
    if output_filename is None:
        # no name given: build one in the constraints dir from the template
        result = "constraints/%s.cst" % output_format
    elif basename(output_filename) == output_filename:
        # bare basename (no directory part): place it in the constraints dir
        result = "constraints/%s" % output_filename
    else:
        result = output_filename
    # make sure the filename always ends in .cst
    if splitext(result)[1] != ".cst":
        result += ".cst"
    # substitute the placeholders in a fixed order
    substitutions = [("%f", cst_function.replace(" ", "_")),
                     ("%n", splitext(basename(input_filename))[0]),
                     ("%d", str(number_dca_predictions))]
    for placeholder, value in substitutions:
        result = result.replace(placeholder, value)
    return result
def parse_dca_filter_string(self, line):
    """
    Parse a text string and turn it into a list of DCA filters.

    Multiple filters are separated by comma and have the following format:
    ``<filter>:<arg>:<arg>:...``

    Threshold filter: Lookup dca contacts in a PDB file, discard or keep contact
    depending on whether the contact is realized:
    Format: ``threshold:<n>:<cst>:<mode>:<model>``

    - n: threshold (< 0: keep below, > 0: keep above)
    - cst: constraints selection to look up PDB file
    - mode, model: model selection mode (see EvalData.get_models for details)

    Example: ``threshold:8.0:100rnaDCA_FADE_-100_26_20_-2_2:cluster:1,threshold:-6.0:100rnaDCA_FADE_-100_26_20_-2_2:cluster:1``

    None filter: Empty filter
    Format: ``none``

    :param line: string to parse
    :return: list of DcaFilter objects
    """
    filter_chain = []
    # individual filter specifications are separated by ","
    for spec in line.split(","):
        # fields within one specification are separated by ":"
        fields = spec.split(":")
        keyword = fields[0]
        if keyword == "none":
            # explicit empty filter: contributes nothing to the chain
            continue
        parsed_filter = None
        if keyword == "threshold":
            distance_cutoff = float(fields[1])
            cst_name, model_kind, model_i = fields[2], fields[3], fields[4]
            # resolve the referenced model and extract its PDB file
            eval_data = EvalData.load_from_cst(cst_name)
            model = eval_data.get_models([model_i], model_kind)[0]
            pdbfile = self.extract_pdb(cst_name, model)
            filter_pdb_chain = pdbtools.parse_pdb("", pdbfile)[0].child_list[0]
            parsed_filter = dcatools.DcaFilterThreshold(filter_pdb_chain, abs(distance_cutoff), keep_below=distance_cutoff > 0, mode="minimum_heavy")
        # NOTE(review): an unrecognized keyword appends None here (original
        # behavior, preserved) — confirm downstream filter code tolerates it
        filter_chain.append(parsed_filter)
    return filter_chain
[docs] def make_constraints(self, dca_prediction_filename="dca/dca.txt", output_filename=None, number_dca_predictions=100, cst_function="FADE -100 26 20 -2 2", filter_text=None, mapping_mode="minAtom"):
"""
Create a set of constraints from a DCA prediction.
:param dca_prediction_filename: DCA input file
:param output_filename: fixed output filename or None to automatically create
:param number_dca_predictions: maximum number of DCA predictions to use
:param cst_function: rosetta function and parameters as text string
:param filter_text: optional: List of DCA filters (see parse_dca_filter_string for details)
:param mapping_mode: atom-to-atom mapping mode to use, supported values: ``minAtom`` or ``pOnly``
"""
self.check_config()
output_filename = self._create_constraints_output_filename(dca_prediction_filename, output_filename, cst_function, number_dca_predictions, "%d%n_%f")
print "Constraints creation:"
print " dca_prediction_filename: %s" % dca_prediction_filename
print " output_filename: %s" % output_filename
print " number_dca_predictions: %d" % number_dca_predictions
print " mode: %s" % mapping_mode
print " function: %s" % cst_function
print " filter: %s" % filter_text
check_file_existence(dca_prediction_filename)
# load dca contacts from file
dca = dcatools.parse_dca_data(dca_prediction_filename)
# filter dca data
if filter_text is not None:
print "Filtering dca data:"
dca_filter_chain = self.parse_dca_filter_string(filter_text)
dcatools.filter_dca_data(dca_data=dca, dca_filter_chain=dca_filter_chain)
# create constraints
print "Creating constraints:"
cst_info = dcatools.build_cst_info_from_dca_contacts(dca, sequence=self.config["sequence"], mapping_mode=mapping_mode, cst_function=cst_function, number_dca_predictions=number_dca_predictions)
# write to file
with open(output_filename, "w") as out:
for c in cst_info:
out.write("%s %d %s %d %s\n" % (c[0], c[1], c[2], c[3], " ".join(map(str, c[4]))))
[docs] def edit_constraints(self, constraints, output_filename=None, cst_function="FADE -100 26 20 -2 2"):
"""
Edit an existing .cst file, replacing the rosetta function.
:param constraints: constraints selection
:param output_filename: fixed output filename or None to automatically create
:param cst_function: rosetta function and parameters as text string
"""
cst_name, input_filename = self.parse_cst_name_and_filename(constraints)
output_filename = self._create_constraints_output_filename(input_filename, output_filename, cst_function)
print "Constraints editing:"
print " input_filename: %s" % input_filename
print " output_filename: %s" % output_filename
print " function: %s" % cst_function
check_file_existence(input_filename)
if input_filename == output_filename:
raise SimulationException("Input and output filename cannot be the same")
pattern = re.compile(r"^(\S+\s+\d+\s+\S+\s+\d+)\s+(\S+)\s+.+$")
with open(output_filename, "w") as output_fd:
for line in utils.read_file_line_by_line(input_filename):
m = pattern.match(line)
output_fd.write("%s %s\n" % (m.group(1), cst_function))
[docs] def print_status(self, native_compare=False, csts=None):
"""
Print summary of predictions and their current status.
Output format always contains the following columns:
- P: preparation step
- M: motif generation
- A: assembly
- E: evaluation
If a step is completed, ``X`` is shown, ``-`` otherwise.
For motif generation a ``*`` may be shown to indicate that models from a different
set of constraints are used.
If *native_compare* is set to True another 4 columns are printed:
- 1: native rmsd score of the first cluster
- 5: lowest native rmsd score of the first 5 clusters
- 10: lowest native rmsd score of the first 10 clusters
- n: number of models
:param native_compare: print rmsd comparison to native structure
:param csts: list of constraints to include in output (default: all)
"""
self.check_config()
if native_compare and self.config["native_pdb_file"] is None:
raise SimulationException("Cannot compare without native information.")
if csts:
# cst selection
cst_names = [self.parse_cst_name_and_filename(cst)[0] for cst in csts]
else:
cst_names = self.get_csts()
print "Status: %s" % self.config["name"]
print " cst P M A E" + (" 1 5 10 n" if native_compare else "")
print " -------------------------------------------" + ("--------------------------" if native_compare else "")
for cst_name in cst_names:
status = self.get_status(cst_name, include_evaluation_data=native_compare)
done_preparation = "X" if status["step_preparation_done"] else "-"
done_motifs = "*" if status["step_motifs_overridden"] else ("X" if status["step_motifs_done"] else "-")
done_assembly = "X" if status["step_assembly_done"] else "-"
done_evaluation = "X" if status["step_evaluation_done"] else "-"
line = " %-035s %s %s %s %s" % (cst_name, done_preparation, done_motifs, done_assembly, done_evaluation)
if native_compare:
if "evaluation_rmsd_native" in status and status["evaluation_rmsd_native"] is not None:
rmsds = tuple("%.2f" % x for x in status["evaluation_rmsd_native"])
else:
rmsds = ("-", "-", "-")
line += " %05s %05s %05s" % rmsds
line += " %05s" % (status["evaluation_model_count"] if "evaluation_model_count" in status and status["evaluation_model_count"] is not None else "-")
print line