# -*- coding: utf-8 -*-
# Copyright (C) 2017-2020  IRISA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# The original code contained here was initially developed by:
#
#     Pierre Vignet.
#     IRISA
#     Dyliss team
#     IRISA Campus de Beaulieu
#     35042 RENNES Cedex, FRANCE
"""
Display, compare, and query a model
"""
from __future__ import unicode_literals
from __future__ import print_function
# Standard imports
from collections import defaultdict
import json
import csv
from logging import DEBUG
from urllib import quote as urllib_quote
import networkx as nx
import networkx.algorithms.isomorphism as iso
# Library imports
from cadbiom.models.biosignal.sig_expr import *
from cadbiom.models.guard_transitions.translators.chart_xml import MakeModelFromXmlFile
from cadbiom.models.guard_transitions.analyser.static_analysis import StaticAnalyzer
from tools.models import Reporter
from tools.models import (
    get_transitions,
    get_frontier_places,
    get_model_identifier_mapping,
)
from tools.models import get_places_data
from tools.graphs import (
    build_graph,
    get_json_graph,
    export_graph,
    get_solutions_graph_data,
)
import cadbiom.commons as cm
LOGGER = cm.logger()
def graph_isomorph_test(
    model_file_1, model_file_2, output_dir="graphs/", make_graphs=False, make_json=False
):
    """Entry point for model consistency checking.

    This function checks if the graphs based on the two given models have
    the same topology, nodes & edges attributes/roles.

    .. todo:: This function should not write any file, and should be exported
        to the module tools.

    .. note:: Cf graphmatcher
        https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.isomorphism.categorical_edge_match.html

    :Use in scripts:
        .. code-block:: python

            >>> from cadbiom_cmd.models import graph_isomorph_test
            >>> print(graph_isomorph_test('model_1.bcx', 'model_2.bcx'))
            INFO: 3 transitions loaded
            INFO: 3 transitions loaded
            INFO: Build graph for the solution: Connexin_32_0 Connexin_26_0
            INFO: Build graph for the solution: Connexin_32_0 Connexin_26_0
            INFO: Topology checking: True
            INFO: Nodes checking: True
            INFO: Edges checking: True
            {'nodes': True, 'edges': True, 'topology': True}

    :param model_file_1: Filepath of the first model.
    :param model_file_2: Filepath of the second model.
    :key output_dir: Output path.
    :key make_graphs: If True, make a GraphML file in output path.
    :key make_json: If True, make a JSON dump of results in output path.
    :type model_file_1: <str>
    :type model_file_2: <str>
    :type output_dir: <str>
    :type make_graphs: <boolean>
    :type make_json: <boolean>
    :return: Dictionary with the results of tests.
        keys: 'topology', 'nodes', 'edges'; values: booleans
    :rtype: <dict <str>: <boolean>>
    """
    # Load transitions in the models
    # Transitions structure format:
    # {u'h00': [('Ax', 'n1', {u'label': u'h00[]'}),]
    parser_1 = MakeModelFromXmlFile(model_file_1)
    parser_2 = MakeModelFromXmlFile(model_file_2)
    transitions_1 = get_transitions(parser_1)
    transitions_2 = get_transitions(parser_2)

    # Get all nodes
    all_places_1 = parser_1.handler.node_dict.keys()
    all_places_2 = parser_2.handler.node_dict.keys()

    # Get all frontier places in the models
    # (places that are never in output position in all transitions)
    # NOTE: we use all_places from the model instead of
    # (input_places - output_places) to get frontier places because some
    # nodes are only in conditions and not in transitions. If we don't do
    # that, these nodes are missing when we compute valid paths from
    # conditions.
    front_places_1 = get_frontier_places(transitions_1, all_places_1)
    front_places_2 = get_frontier_places(transitions_2, all_places_2)
    if LOGGER.getEffectiveLevel() == DEBUG:
        LOGGER.debug("Frontier places 1: %s", sorted(front_places_1))
        LOGGER.debug("Frontier places 2: %s", sorted(front_places_2))

    # Build graphs & get networkx objects.
    # We give all events in the model as a list with a single step, thus
    # simulating a cadbiom solution that contains every event of the model.
    res_1 = build_graph(front_places_1, [transitions_1.keys()], transitions_1)
    G1 = res_1[0]
    res_2 = build_graph(front_places_2, [transitions_2.keys()], transitions_2)
    G2 = res_2[0]

    # Isomorphism checking: raw topology first, then with node colors,
    # then with edge colors (attribute-aware matchers)
    nm = iso.categorical_node_match("color", "grey")
    em = iso.categorical_edge_match("color", "")
    check_state = {
        "topology": nx.is_isomorphic(G1, G2),
        "nodes": nx.is_isomorphic(G1, G2, node_match=nm),
        "edges": nx.is_isomorphic(G1, G2, edge_match=em),
    }
    LOGGER.info("Topology checking: %s", check_state["topology"])
    LOGGER.info("Nodes checking: %s", check_state["nodes"])
    LOGGER.info("Edges checking: %s", check_state["edges"])

    # Draw graphs
    if make_graphs:
        export_graph(output_dir, front_places_1, "first", *res_1)
        export_graph(output_dir, front_places_2, "second", *res_2)

    # Export results to JSON file
    if make_json:
        with open(output_dir + "graph_isomorphic_test.json", "w") as fd:
            fd.write(json.dumps(check_state, sort_keys=True, indent=4) + "\n")

    return check_state
def low_graph_info(model_file, graph_data=False, centralities=False):
    """Low level function for :meth:`~cadbiom_cmd.models.model_graph`.

    Get JSON data with information about the graph based on the model.

    .. seealso:: :meth:`tools.graphs.get_solutions_graph_data`.

    :param model_file: File for the model.
    :key graph_data: Also return a dictionary with the results of measures on
        the given graph.
        keys: measure's name; values: measure's value

        Example:

        .. code-block:: javascript

            {
                'modelFile': 'string',
                'modelName': 'string',
                'events': int,
                'entities': int,
                'transitions': int,
                'graph_nodes': int,
                'graph_edges': int,
                'centralities': {
                    'degree': {
                        'entity_1': float,
                        'entity_2': float
                    },
                    'strongly_connected': boolean,
                    'weakly_connected': boolean,
                    'max_degree': int,
                    'min_degree': int,
                    'average_degree': float,
                    'connected_components_number': int,
                    'connected_components': list,
                    'average_shortest_paths': int,
                }
            }
    :key centralities: If True, compute centralities
        (degree, closeness, betweenness).
    :type model_file: <str>
    :type centralities: <boolean>
    :return: Tuple of tuples from :meth:`tools.graphs.build_graph`, set of
        frontier places, and dictionary with the results of measures on the
        given graph if requested.
    :rtype: <tuple>, <str>, <dict>
    """
    # Load transitions in the model
    # Transitions structure format:
    # {u'h00': [('Ax', 'n1', {u'label': u'h00[]'}),]
    parser_1 = MakeModelFromXmlFile(model_file)
    transitions_1 = get_transitions(parser_1)

    # Get all nodes
    all_places_1 = parser_1.handler.node_dict.keys()
    # Get the model
    model = parser_1.handler.model

    # Get all frontier places in the model
    # (places that are never in output position in all transitions)
    # NOTE: we use all_places from the model instead of
    # (input_places - output_places) because some nodes are only in
    # conditions and not in transitions; without them, they would be missing
    # when we compute valid paths from conditions.
    front_places = get_frontier_places(transitions_1, all_places_1)
    if LOGGER.getEffectiveLevel() == DEBUG:
        LOGGER.debug("Frontier places: %s", sorted(front_places))

    # Build graph & get networkx object
    # We give all events in the model as a list with a single step,
    # thus simulating a cadbiom solution (with all events in the model).
    res_1 = build_graph(front_places, [transitions_1.keys()], transitions_1)
    G = res_1[0]

    if not graph_data:
        return res_1, front_places

    info = {
        "modelFile": model_file,
        "modelName": model.name,
        # BUGFIX: the key was written "events:" (trailing colon), which did
        # not match the documented output format above.
        "events": len(transitions_1),  # One event can have multiple transitions
        "entities": len(all_places_1),  # places
        "boundaries": len(front_places),  # frontier places
        "transitions": len(model.transition_list),
    }
    # Complete the info dict in place with graph measures (and centralities
    # when requested)
    get_solutions_graph_data(G, info, centralities)

    LOGGER.info(info)
    return res_1, front_places, info
def low_model_info(
    model_file, all_entities=False, boundaries=False, genes=False, smallmolecules=False
):
    """Low level function for :meth:`~cadbiom_cmd.models.model_info`.

    Get JSON data with information about the model and its entities.

    .. TODO::
        - add dump of transitions (option)
        - See get_transitions remark about its deprecation for the current use case
        - Dump roles of boundaries, computed here or in ChartModel...
          Already implemented for queries_2_common_graph and for pie charts.

    .. seealso:: Output format of :meth:`tools.solutions.convert_solutions_to_json`

    :param model_file: File for the model.
    :key all_entities: If True, data for all places of the model are returned
        (optional).
    :key boundaries: If True, only data for the frontier places of the model
        are returned (optional).
    :key genes: If True, only data for the genes of the model are returned
        (optional).
    :key smallmolecules: If True, only data for the smallmolecules of the model
        are returned (optional).
    :type model_file: <str>
    :type all_entities: <boolean>
    :type boundaries: <boolean>
    :type genes: <boolean>
    :type smallmolecules: <boolean>
    :return: Dictionary with informations about the model and the queried nodes.
        :Example:

            .. code-block:: javascript

                {
                    'modelFile': 'string',
                    'modelName': 'string',
                    'events': int,
                    'entities': int,
                    'boundaries': int,
                    'transitions': int,
                    'entitiesLocations': {
                        'cellular_compartment_a': int,
                        ...
                    },
                    'entitiesTypes': {
                        'biological_type_a': int,
                        ...
                    },
                    'entitiesData': {
                        [{
                            'cadbiomName': 'string',
                            'immediateSuccessors': ['string', ...],
                            'uri': 'string',
                            'entityType': 'string',
                            'entityRef': 'string',
                            'location': 'string',
                            'names': ['string', ...],
                            'xrefs': {
                                'external_database_a': ['string', ...],
                                ...
                            }
                        }],
                        ...
                    }
                }
    :rtype: <dict>
    """
    # Load transitions in the model
    # Transitions structure format:
    # {u'h00': [('Ax', 'n1', {u'label': u'h00[]'}),]
    parser_1 = MakeModelFromXmlFile(model_file)
    transitions_1 = get_transitions(parser_1)

    # Get all nodes
    all_places_1 = parser_1.handler.node_dict.keys()
    # Get the model
    model = parser_1.handler.model

    # Get all frontier places in the model
    # (places that are never in output position in all transitions)
    # NOTE: we use all_places from the model instead of
    # (input_places - output_places) because some nodes are only in
    # conditions and not in transitions; without them, they would be missing
    # when we compute valid paths from conditions.
    front_places = get_frontier_places(transitions_1, all_places_1)

    # Basic informations
    info = {
        "modelFile": model_file,
        "modelName": model.name,
        # BUGFIX: the key was written "events:" (trailing colon), which did
        # not match the documented output format above.
        "events": len(transitions_1),  # One event can have multiple transitions
        "entities": len(all_places_1),  # places
        "boundaries": len(front_places),  # frontier places
        "transitions": len(model.transition_list),
    }

    # Complete the data with StaticAnalysis
    # Call custom Reporter instead of CompilReporter
    # from cadbiom_gui.gt_gui.utils.reporter import CompilReporter
    static_analyser = StaticAnalyzer(Reporter())
    static_analyser.build_from_chart_model(model)
    # BUGFIX: the original backslash continuation was split by a blank line,
    # which is a SyntaxError; parenthesized form is also the safer idiom.
    info["entitiesLocations"], info["entitiesTypes"] = (
        static_analyser.get_stats_entities_data()
    )

    # There are multiple successors per origin...
    # Used to add this info to every node requested; see below.
    # CONSISTENCY: use the `model` local bound above (parser_1.handler.model)
    # instead of `parser_1.model`, as done everywhere else in this module.
    places_successors = defaultdict(list)
    for trans in model.transition_list:
        places_successors[trans.ori.name].append(trans.ext.name)

    # Filter places
    if all_entities:
        info["entitiesData"] = get_places_data(all_places_1, model)
    if boundaries:
        info["entitiesData"] = get_places_data(front_places, model)
    if genes:
        g = (place_name for place_name in all_places_1 if "_gene" in place_name)
        info["entitiesData"] = get_places_data(g, model)
    if smallmolecules:
        # Filter on entityTypes
        info["entitiesData"] = [
            data
            for data in get_places_data(all_places_1, model)
            if data["entityType"] == "SmallMolecule"
        ]

    # Edit places and insert immediate successors... It is ugly but requested...
    # Another request that brings overhead for nothing...
    # ROBUSTNESS: when no filter flag is set, "entitiesData" is absent;
    # iterate over an empty list instead of raising a KeyError.
    for place in info.get("entitiesData", []):
        place["immediateSuccessors"] = places_successors[place["cadbiomName"]]

    return info
def model_identifier_mapping(model_file, *args, **kwargs):
    """Entry point for the mapping of identifiers from external databases

    :param model_file: File for the model.
    :key external_file: File with 1 external identifier per line.
    :key external_identifiers: List of external identifiers to be mapped.
    :type model_file: <str>
    :type external_file: <str>
    :type external_identifiers: <list>
    """
    external_file = kwargs.get("external_file", None)
    if external_file:
        # One identifier per line; tolerate DOS line endings
        with open(external_file, "r") as f_d:
            external_identifiers = set(
                line.strip("\n").strip("\r") for line in f_d
            )
    else:
        external_identifiers = set(kwargs["external_identifiers"])

    mapping = get_model_identifier_mapping(model_file, external_identifiers)

    # Dump the mapping to a CSV file
    with open("mapping.csv", "w") as csvfile:
        writer = csv.writer(csvfile, delimiter=str(";"))
        # Header
        writer.writerow(["external identifiers", "cadbiom identifiers"])
        # Multiple Cadbiom names for an identifier are joined with a |
        for external_id, cadbiom_names in mapping.iteritems():
            writer.writerow((external_id, "|".join(cadbiom_names)))
def model_graph(model_file, output_dir="./graphs/", centralities=False, **kwargs):
    """Get quick information and make a graph based on the model.

    :param model_file: File for the '.bcx' model.
    :param output_dir: Output directory.
    :param centralities: If True with ``--json``, compute centralities
        (degree, in_degree, out_degree, closeness, betweenness).
    :keyword graph: If True, make a GraphML file based on the graph made
        from the model (optional).
    :keyword json: If True, make a JSON dump of results in output path (optional).
    :keyword json_graph: If True, make a JSON dump of the graph itself (optional).
    :type model_file: <str>
    :type output_dir: <str>
    :type centralities: <boolean>
    :type graph: <boolean>
    :type json: <boolean>
    :type json_graph: <boolean>
    """
    # Bind arguments to avoid overwriting previous imports
    make_json = kwargs["json"]
    make_graph = kwargs["graph"]
    make_json_graph = kwargs["json_graph"]

    # Centralities are time consuming; skip them when no JSON dump is wanted
    if not make_json:
        centralities = False

    res_1, front_places, model_graph_info = low_graph_info(
        model_file, graph_data=True, centralities=centralities
    )

    # Make json dump of the graph itself
    if make_json_graph:
        # Pass a Networkx graph and get a serializable dictionary
        json_data = get_json_graph(res_1[0])
        with open(output_dir + "graph.json", "w") as f_d:
            f_d.write(json.dumps(json_data, indent=2))

    # Draw graph; the model name is URL-quoted to build a safe filename
    if make_graph:
        export_graph(
            output_dir,
            front_places,
            urllib_quote(model_graph_info["modelName"], safe=""),
            *res_1
        )

    # Export measures to json file
    if make_json:
        with open(output_dir + "graph_summary.json", 'w') as f_d:
            f_d.write(
                json.dumps(model_graph_info, sort_keys=True, indent=2) + '\n'
            )
def model_info(model_file, output_dir='./',
               all_entities=False, boundaries=False,
               genes=False, smallmolecules=False, default=True, **kwargs):
    """Get quick and full informations about the model structure and places.

    :param model_file: File for the '.bcx' model.
    :key output_dir: Output directory.
    :key all_entities: If True, data for all places of the model are returned
        (optional).
    :key boundaries: If True, only data for the frontier places of the model
        are returned (optional).
    :key genes: If True, only data for the genes of the model are returned
        (optional).
    :key smallmolecules: If True, only data for the smallmolecules of the model
        are returned (optional).
    :key default: Display quick description of the model
        (Number of places, transitions, entities types, entities locations).
    :key json: If True, make a JSON dump of results in output path (optional).
    :key csv: If True, make a csv dump of informations about filtered places.
    :type model_file: <str>
    :type output_dir: <str>
    :type all_entities: <boolean>
    :type boundaries: <boolean>
    :type genes: <boolean>
    :type smallmolecules: <boolean>
    :type default: <boolean>
    :type json: <boolean>
    :type csv: <boolean>
    """
    # Bind arguments to avoid overwriting previous imports
    make_json = kwargs["json"]
    make_csv = kwargs["csv"]

    # Fall back to the quick default display when no file output is requested.
    # BUGFIX: the original tested `not (make_json and make_csv)`, which forced
    # the default display (and an early return) unless BOTH outputs were
    # requested at once; asking for only --json or only --csv therefore
    # produced no file at all.
    if not (make_json or make_csv):
        default = True

    def dump_places_to_csv(entities_data, output_filename):
        """Write informations about places in the model to a csv file.

        One row per place; xref columns are derived from the union of all
        database names found in the places' data.
        """
        with open(output_filename, "w") as csvfile:
            # Get all database names
            database_names = {
                db_name
                for place in entities_data
                for db_name in place.get("xrefs", dict()).iterkeys()
            }
            # Write headers
            fieldnames = (
                "cadbiomName", "immediateSuccessors", "names", "uri",
                "entityType",  "entityRef", "location"
            ) + tuple(database_names)
            writer = csv.DictWriter(
                csvfile,
                fieldnames=fieldnames,
                extrasaction="ignore",  # Ignore keys not found in fieldnames (xrefs)
            )
            writer.writeheader()

            for place in entities_data:
                # Since we modify places, we need to make a copy in memory
                temp_place = place.copy()
                # Join names with a pipe...
                # Handle escaped unicode characters in model
                # Ex: \u03b2-catenin => β-Catenin
                temp_place["names"] = "|".join(place.get("names", list())).encode(
                    "utf-8"
                )
                temp_place["immediateSuccessors"] = "|".join(
                    place["immediateSuccessors"]
                ).encode("utf-8")
                # Join xrefs ids with a pipe...
                for db_name, db_ids in place.get("xrefs", dict()).iteritems():
                    temp_place[db_name] = "|".join(db_ids).encode("utf-8")

                writer.writerow(temp_place)

    def get_output_filename(filetype="csv"):
        """Return the filename according to the given filters and filetype.

        .. note:: Returns None when no filter flag is set; callers are
            expected to select at least one filter when the default display
            is disabled — TODO: confirm the CLI enforces this.
        """
        if all_entities:
            return "all_entities." + filetype
        if boundaries:
            return "boundaries." + filetype
        if genes:
            return "genes." + filetype
        if smallmolecules:
            return "smallmolecules." + filetype

    if default:
        # Call custom Reporter instead of CompilReporter
        # from cadbiom_gui.gt_gui.utils.reporter import CompilReporter
        static_analyser = StaticAnalyzer(Reporter())
        static_analyser.build_from_chart_file(model_file)
        print(static_analyser.get_statistics())
        return

    # Renamed from `model_info` to avoid shadowing this function's own name
    info = low_model_info(
        model_file, all_entities, boundaries, genes, smallmolecules
    )

    # Export to csv file
    if make_csv:
        dump_places_to_csv(info['entitiesData'], output_dir + get_output_filename())

    # Export to json file
    if make_json:
        with open(output_dir + "model_summary_" + get_output_filename("json"), 'w') as f_d:
            # Handle escaped unicode characters in model
            # Ex: \u03b2-catenin => β-Catenin
            f_d.write(
                json.dumps(info, sort_keys=True, indent=2, ensure_ascii=False).encode('utf8')
            )