#########################################################################
# MacSyFinder - Detection of macromolecular systems in protein dataset #
# using systems modelling and similarity search. #
# Authors: Sophie Abby, Bertrand Neron #
# Copyright (c) 2014-2021 Institut Pasteur (Paris) and CNRS. #
# See the COPYRIGHT file for details #
# #
# This file is part of MacSyFinder package. #
# #
# MacSyFinder is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# MacSyFinder is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details . #
# #
# You should have received a copy of the GNU General Public License #
# along with MacSyFinder (COPYING). #
# If not, see <https://www.gnu.org/licenses/>. #
#########################################################################
import abc
from string import Template
from macsypy.gene import GeneStatus
[docs]class SystemSerializer(metaclass=abc.ABCMeta):
"""
handle the different way to serialize a system
"""
@abc.abstractmethod
def serialize(self, system, hit_system_tracker):
pass
[docs]class TxtSystemSerializer(SystemSerializer):
"""
Handle System serialization in text
"""
[docs] def serialize(self, system, hit_system_tracker):
"""
:return: a string representation of system readable by human
"""
clst = ", ".join(["[" + ", ".join([str((v_h.id, v_h.gene.name, v_h.position)) for v_h in cluster.hits]) + "]"
for cluster in system.clusters])
s = f"""system id = {system.id}
model = {system.model.fqn}
replicon = {system.replicon_name}
clusters = {clst}
occ = {system.occurrence()}
wholeness = {system.wholeness:.3f}
loci nb = {system.loci_nb}
score = {system.score:.3f}
"""
for title, genes in (("mandatory", system.mandatory_occ),
("accessory", system.accessory_occ),
("neutral", system.neutral_occ)):
s += f"\n{title} genes:\n"
for g_name, hits in genes.items():
s += f"\t- {g_name}: {len(hits)} "
all_hits_str = []
for h in hits:
used_in_systems = [s.id for s in hit_system_tracker[h.hit]
if s.model.fqn != system.model.fqn]
used_in_systems.sort()
if used_in_systems:
hit_str = f"{h.gene.name} [{', '.join(used_in_systems)}]"
else:
hit_str = f"{h.gene.name}"
all_hits_str.append(hit_str)
s += f'({", ".join(all_hits_str)})\n'
return s
[docs]class TsvSystemSerializer(SystemSerializer):
"""
Handle System serialization in tsv format
"""
header = "replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_loci\tlocus_num\tsys_wholeness\tsys_score\tsys_occ" \
"\thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov\thit_seq_cov\t" \
"hit_begin_match\thit_end_match\tused_in"
template = Template("$sys_replicon_name\t$vh_id\t$vh_gene_name\t$vh_position\t$sys_model_fqn\t"
"$sys_id\t$sys_loci\t$locus_num\t$sys_wholeness\t$sys_score\t"
"$sys_occurrence\t$vh_gene_role\t$vh_status\t$vh_seq_length\t$vh_i_eval\t"
"$vh_score\t$vh_profile_coverage\t$vh_sequence_coverage\t$vh_begin_match"
"\t$vh_end_match\t$used_in_systems\n")
[docs] def serialize(self, system, hit_system_tracker):
r"""
:return: a serialisation of this system in tabulated separated value format
each line represent a hit and have the following structure:
replicon\\thit_id\\tgene_name\\thit_pos\\tmodel_fqn\\tsys_id\\tsys_loci\\tlocus_num\\tsys_wholeness\\tsys_score
\\tsys_occ\\thit_gene_ref.alternate_of\\thit_status\\thit_seq_len\\thit_i_eval\\thit_score\\thit_profile_cov
\\thit_seq_cov\\tit_begin_match\\thit_end_match
:rtype: str
"""
tsv = ''
loci_num = system.loci_num
for locus_num, cluster in zip(loci_num, system.clusters):
for vh in sorted(cluster.hits, key=lambda vh: vh.position):
used_in_systems = [s.id for s in hit_system_tracker[vh.hit] if s.model.fqn != system.model.fqn]
used_in_systems.sort()
tsv += self.template.substitute(
sys_replicon_name=system.replicon_name,
vh_id=vh.id,
vh_gene_name=vh.gene.name,
vh_position=vh.position,
sys_model_fqn=system.model.fqn,
sys_id=system.id,
sys_loci=system.loci_nb,
locus_num=locus_num,
sys_wholeness=f"{system.wholeness:.3f}",
sys_score=f"{system.score:.3f}",
sys_occurrence=system.occurrence(),
vh_gene_role=vh.gene_ref.alternate_of().name,
vh_status=vh.status,
vh_seq_length=vh.seq_length,
vh_i_eval=vh.i_eval,
vh_score=f"{vh.score:.3f}",
vh_profile_coverage=f"{vh.profile_coverage:.3f}",
vh_sequence_coverage=f"{vh.sequence_coverage:.3f}",
vh_begin_match=vh.begin_match,
vh_end_match=vh.end_match,
used_in_systems=','.join(used_in_systems)
)
return tsv
[docs]class TsvSolutionSerializer:
"""
Handle Solution (list of Systems) serialization in tsv format
"""
[docs] def __init__(self):
"""Constructor """
__class__.header = 'sol_id\t' + TsvSystemSerializer.header
__class__.template = Template(f"$$sol_id\t{TsvSystemSerializer.template.template}")
[docs] def serialize(self, solution, sol_id, hit_system_tracker):
"""
:param solution: the solution to serialize
:type solution: list of :class:`macsypy.system.System` object
:param hit_system_tracker:
:type hit_system_tracker: :class:`macsypy.system.HitSystemTracker` object
:return: a serialisation of this solution (a list of systems) in tabulated separated value format
each line represent a hit and have the same structure as system serialization
:meth:`macsypy.serialization.TsvSystemSerializer.serialize` but with an extra column
sol_id which is a technical id to identified the different solutions.
"""
tsv = ''
sys_ser = TsvSystemSerializer()
sys_ser.template = self.template
for system in solution:
sol_temp = Template(sys_ser.serialize(system, hit_system_tracker))
tsv += f"{sol_temp.substitute(sol_id=sol_id)}\n"
return tsv
class TxtLikelySystemSerializer(SystemSerializer):
"""
Handle System serialization in text
"""
def serialize(self, likely_system, hit_system_tracker):
"""
:return: a string representation of system readable by human
"""
hits = ", ".join([str((h.id, h.gene.name, h.position)) for h in likely_system.hits])
if likely_system.forbidden_hits:
warning = "WARNING there quorum is reached but there is also some forbidden genes.\n"
else:
warning = '\n'
s = f"""This replicon contains genetic materials needed for system {likely_system.model.fqn}
{warning}
system id = {likely_system.id}
model = {likely_system.model.fqn}
replicon = {likely_system.replicon_name}
hits = [{hits}]
wholeness = {likely_system.wholeness:.3f}
"""
for title, genes in (("mandatory", likely_system.mandatory_occ),
("accessory", likely_system.accessory_occ),
("neutral", likely_system.neutral_occ),
("forbidden", likely_system.forbidden_occ)):
s += f"\n{title} genes:\n"
for g_name, hits in genes.items():
s += f"\t- {g_name}: {len(hits)} "
all_hits_str = []
for h in hits:
used_in_systems = [s.id for s in hit_system_tracker[h.hit]
if s.model.fqn != likely_system.model.fqn]
used_in_systems.sort()
if used_in_systems:
hit_str = f"{h.gene.name} [{', '.join(used_in_systems)}]"
else:
hit_str = f"{h.gene.name}"
all_hits_str.append(hit_str)
s += f'({", ".join(all_hits_str)})\n'
s += "\nUse ordered replicon to have better prediction.\n"
return s
class TsvLikelySystemSerializer(SystemSerializer):
"""
Handle potential System from unordered replicon
serialization in tsv format
"""
header = "replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_wholeness" \
"\thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov\thit_seq_cov\t" \
"hit_begin_match\thit_end_match\tused_in"
template = Template("$sys_replicon_name\t$vh_id\t$vh_gene_name\t$vh_position\t$sys_model_fqn\t"
"$sys_id\t$sys_wholeness\t"
"$vh_gene_role\t$vh_status\t$vh_seq_length\t$vh_i_eval\t"
"$vh_score\t$vh_profile_coverage\t$vh_sequence_coverage\t$vh_begin_match"
"\t$vh_end_match\t$used_in_systems\n")
def serialize(self, likely_system, hit_system_tracker):
r"""
:return: a serialisation of this system in tabulated separated value format
each line represent a hit and have the following structure:
replicon\\thit_id\\tgene_name\\thit_pos\\tmodel_fqn\\tsys_id\\tsys_wholeness
\\thit_gene_ref.alternate_of\\thit_status\\thit_seq_len\\thit_i_eval\\thit_score\\thit_profile_cov
\\thit_seq_cov\\tit_begin_match\\thit_end_match
:rtype: str
"""
tsv = ''
for status in (s.lower() for s in GeneStatus.__members__.keys()):
try:
hits = getattr(likely_system, f"{status}_hits")
hits = sorted(hits, key=lambda vh: vh.gene.name)
except AttributeError:
continue
for vh in hits:
used_in_systems = [s.id for s in hit_system_tracker[vh.hit] if s.model.fqn != likely_system.model.fqn]
used_in_systems.sort()
tsv += self.template.substitute(
sys_replicon_name=likely_system.replicon_name,
vh_id=vh.id,
vh_gene_name=vh.gene.name,
vh_position=vh.position,
sys_model_fqn=likely_system.model.fqn,
sys_id=likely_system.id,
sys_wholeness=f"{likely_system.wholeness:.3f}",
vh_gene_role=vh.gene_ref.alternate_of().name,
vh_status=vh.status,
vh_seq_length=vh.seq_length,
vh_i_eval=vh.i_eval,
vh_score=f"{vh.score:.3f}",
vh_profile_coverage=f"{vh.profile_coverage:.3f}",
vh_sequence_coverage=f"{vh.sequence_coverage:.3f}",
vh_begin_match=vh.begin_match,
vh_end_match=vh.end_match,
used_in_systems=','.join(used_in_systems)
)
return tsv
class TxtUnikelySystemSerializer(SystemSerializer):
"""
Handle System serialization in text
"""
def serialize(self, likely_system):
"""
:return: a string representation of system readable by human
"""
hits = ", ".join([str((h.id, h.gene.name, h.position)) for h in likely_system.hits])
reasons = '\n'.join(likely_system.reasons)
s = f"""This replicon probably not contains a system {likely_system.model.fqn}:
{reasons}
system id = {likely_system.id}
model = {likely_system.model.fqn}
replicon = {likely_system.replicon_name}
hits = [{hits}]
wholeness = {likely_system.wholeness:.3f}
"""
for title, genes in (("mandatory", likely_system.mandatory_occ),
("accessory", likely_system.accessory_occ),
("neutral", likely_system.neutral_occ),
("forbidden", likely_system.forbidden_occ)):
s += f"\n{title} genes:\n"
for g_name, hits in genes.items():
s += f"\t- {g_name}: {len(hits)} "
all_hits_str = [f"{h.gene.name}" for h in hits]
s += f'({", ".join(all_hits_str)})\n'
s += "\nUse ordered replicon to have better prediction.\n"
return s