Source code for biopax2cadbiom.sparql_biopaxQueries

# -*- coding: utf-8 -*-
# MIT License
#
# Copyright (c) 2017 IRISA, Jean Coquet, Pierre Vignet, Mateo Boudet
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Contributor(s): Jean Coquet, Pierre Vignet, Mateo Boudet

"""
This module contains a list of functions to query any SPARQL endpoint with
BioPAX data.
"""
from __future__ import unicode_literals

# Standard imports
from collections import defaultdict, Counter

# Custom imports
from biopax2cadbiom import sparql_wrapper
from biopax2cadbiom.classes import Control, Location, PhysicalEntity, Reaction
from biopax2cadbiom.tools import parse_uri
import biopax2cadbiom.commons as cm

LOGGER = cm.logger()

## Triplestore querying ########################################################

def get_biopax_pathways(graph_uris, provenance_uri):
    """Retrieve the pathways declared in the given graphs

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        When set, only pathways whose dataSource attribute is this URI
        are selected.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dict of pathways URIs and names.
        keys: URIs; values: displayName of the pathway, or the URI itself
        when the pathway has no displayName.
    :rtype: <dict>
    """
    sparql = """
        SELECT DISTINCT ?pathway ?displayName
    """
    # One FROM clause per queried graph
    sparql += "".join("FROM <" + uri + ">\n" for uri in graph_uris)
    sparql += """
        WHERE
        {
            ?pathway rdf:type biopax3:Pathway .
    """
    if provenance_uri:
        sparql += "?pathway biopax3:dataSource <" + provenance_uri + "> ."
    sparql += """
            OPTIONAL { ?pathway biopax3:displayName ?displayName . }
        }
    """

    rows = sparql_wrapper.order_results(
        sparql, orderby="?pathway", limit=cm.SPARQL_LIMIT
    )

    # Pathways without displayName are registered under their own URI
    return {
        pathway_uri: (pathway_uri if display_name is None else display_name)
        for pathway_uri, display_name in rows
    }


def get_biopax_parent_pathways(graph_uris, provenance_uri):
    """Collect, for every pathway of the given graphs, its direct parent pathways

    A parent is a pathway that references the other pathway through the
    pathwayComponent property.

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        When set, both the pathway and its parent must have this URI as
        dataSource attribute.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dict of parent pathways (a defaultdict).
        Pathways as keys; sets of parent pathways as values
    :rtype: <dict <str>:<set>>
    """
    sparql = """
        SELECT DISTINCT ?pathway ?superPathway
    """
    # One FROM clause per queried graph
    sparql += "".join("FROM <" + uri + ">\n" for uri in graph_uris)
    sparql += """
        WHERE
        {
            ?superPathway rdf:type biopax3:Pathway .
    """
    if provenance_uri:
        provenance_filter = (
            "?superPathway biopax3:dataSource <{}> .\n"
            "?pathway biopax3:dataSource <{}> ."
        ).format(provenance_uri, provenance_uri)
        sparql += provenance_filter
    sparql += """
            ?superPathway biopax3:pathwayComponent ?pathway .
            ?pathway rdf:type biopax3:Pathway .
            ?pathway biopax3:pathwayComponent* ?subPathway .
            ?subPathway rdf:type biopax3:Pathway .
        }
    """

    rows = sparql_wrapper.order_results(
        sparql, orderby="?pathway", limit=cm.SPARQL_LIMIT
    )

    parents_by_pathway = defaultdict(set)
    for child_uri, parent_uri in rows:
        parents_by_pathway[child_uri].add(parent_uri)
    return parents_by_pathway


def get_biopax_reactions(graph_uris, provenance_uri):
    """Query all Interactions of the database, minus Control objects.

    .. warning::
        We also get 'Control', if we do 'rdfs: subClassOf * biopax3: Interaction',
        but this must be done by get_biopax_controls().

        THEREFORE: Suppression of the controls from the results via MINUS {}

    .. note:: Control class contains (Catalysis, TemplateReactionRegulation, ...)

    .. note:: We correct the BioPAX hierarchy generated by some tools like BiNOM.
        This tool defines the entire hierarchy of parent classes for each
        BioPAX object instead of letting users rely on the RDFS reasoner and
        the rdfs:subClassOf property.
        As a result, objects are queried as many times as they have parent classes.
        Fortunately, we remove Control derivatives from Interaction objects.
        However, Interaction objects are far too generic to be interpreted/used in
        the program, so we must ensure that objects created here have the most
        accurate interactionType attribute possible.
        In practice Virtuoso returns first the most accurate rdf:type property,
        then the parent classes (Ex: BiochemicalReaction then Conversion in the
        case of an object that would include these 2 properties).
        In theory, nothing seems to guarantee that this happens all the time.

    .. note:: conversionDirection and catalysisDirection are respectively for
        Conversion and Catalysis subclasses.
        Do not forget that Catalysis direction overrides Conversion direction.
        Currently we assume that Conversion *are* LEFT_TO_RIGHT (although this
        is not recommended in the standard).
        Order of priority for directions:
        catalysisDirection > conversionDirection > spontaneous >
        thermodynamic constants and FBA analysis

    .. note:: A Reaction is built from the first result row met for its URI:
        name, productComponent and participantComponent of the following
        rows are not used. Pathways, left and right components are
        accumulated over all the rows.

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        Used to filter objects on their dataSource attribute.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dictionary of reactions.
        uris as keys; Reaction objects as values
    :rtype: <dict <str>:<Reaction>>
    :raises ValueError: If two different interaction types are returned for
        the same reaction and one of them is not in the known hierarchy.
    """
    dictReaction = {}
    query = """
        SELECT DISTINCT ?reaction ?nameReaction ?interactionType ?conversionDirection ?pathway ?leftComponent ?rightComponent ?productComponent ?participantComponent
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
            ?reaction rdf:type ?interactionType .
    """
    if provenance_uri:
        query += "?reaction biopax3:dataSource <" + provenance_uri + "> ."
    query += """
            ?interactionType rdfs:subClassOf* biopax3:Interaction .
            OPTIONAL { ?reaction biopax3:displayName ?nameReaction . }
            OPTIONAL { ?pathway biopax3:pathwayComponent ?reaction . }
            OPTIONAL {
                ?reaction biopax3:left ?leftComponent .
                FILTER NOT EXISTS { ?leftComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {
                ?reaction biopax3:right ?rightComponent .
                FILTER NOT EXISTS { ?rightComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {
                ?reaction biopax3:product ?productComponent .
                FILTER NOT EXISTS { ?productComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {
                ?reaction biopax3:participant ?participantComponent .
                FILTER NOT EXISTS { ?participantComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {?reaction biopax3:conversionDirection ?conversionDirection }
            OPTIONAL {?reaction biopax3:catalysisDirection ?conversionDirection }
            MINUS {
                ?interactionType rdfs:subClassOf* biopax3:Control
            }
        }
    """
    # Used to test if the proposed interactionType is more accurate than
    # the one already in the reaction: the greater the index, the more
    # accurate the type is considered.
    BioPAX_HIERARCHY = [
        # Not queried by 'subClassOf* Interaction'
        #"Entity",
        "Interaction",
        "TemplateReaction",
        "GeneticInteraction",
        # Not queried by 'MINUS subClassOf* Control':
        #"Control", "Catalysis", "TemplateReactionRegulation", "Modulation",
        "MolecularInteraction",
        "Conversion", "ComplexAssembly", "Degradation", "TransportWithBiochemicalReaction", "BiochemicalReaction", "Transport",
    ]
    BioPAX_supported_directions = ("LEFT_TO_RIGHT", "LEFT-TO-RIGHT")

    for (
        reaction_uri,
        nameReaction,
        interactionType,
        conversionDirection,
        pathway,
        leftComponent,
        rightComponent,
        productComponent,
        participantComponent,
    ) in sparql_wrapper.order_results(
        query, orderby="?reaction", limit=cm.SPARQL_LIMIT
    ):

        if reaction_uri in dictReaction:
            # Reaction already met: only refine its interactionType
            reaction = dictReaction[reaction_uri]

            # The stored interactionType is compared with parsed names, so
            # it must also be stored in parsed form; storing the raw URI
            # would make the next hierarchy lookup fail for this reaction.
            proposed_type = parse_uri(interactionType)
            if (
                proposed_type != reaction.interactionType
                and BioPAX_HIERARCHY.index(proposed_type)
                > BioPAX_HIERARCHY.index(reaction.interactionType)
            ):
                # Replace the old interactionType by the more accurate one;
                # See docstring
                reaction.interactionType = proposed_type
        else:
            # Reaction creation if not already met
            if (
                conversionDirection is not None
                and conversionDirection not in BioPAX_supported_directions
            ):
                LOGGER.warning(
                    "Reaction <%s>:: conversionDirection <%s> not currently supported. "
                    "We will assume <LEFT_TO_RIGHT> instead.",
                    reaction_uri,
                    conversionDirection,
                )

            reaction = Reaction(
                reaction_uri,
                nameReaction,
                interactionType,
                productComponent,
                participantComponent,
            )
            dictReaction[reaction_uri] = reaction

        # Attributes accumulated over all the rows of the reaction
        if pathway is not None:
            reaction.pathways.add(pathway)
        if leftComponent is not None:
            reaction.leftComponents.add(leftComponent)
        if rightComponent is not None:
            reaction.rightComponents.add(rightComponent)

    return dictReaction


def get_biopax_physicalentities(graph_uris, provenance_uri):
    """Get objects of the PhysicalEntity class and its subclasses in the given graphs

    A PhysicalEntity is built from the first result row met for its URI;
    synonyms, components URIs and members are accumulated over all its rows.

    .. note:: From the BioPAX documentation,
        about the use of memberPhysicalEntity:

        Using this property is not recommended. memberPhysicalEntity is only
        defined to support legacy data in certain databases. It is used to define a
        generic physical entity that is a collection of other physical entities.
        In general, EntityReference class should be used to create generic
        groups of physical entities, however, there are some cases where this
        is not possible, and the property has to be used. For instance,
        when an entity reference is used to define a generic physical entity with
        generic features, the generic features of the same type must be
        grouped. If you do not have grouping information for features of
        generic physical entities, you cannot use entity reference to define
        generic physical entities and must use the memberPhysicalEntity
        property. Another example for using this property is to create generic
        complexes, which are currently not supported with the
        EntityReference scheme (there is no "ComplexReference" class).

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        Used to filter objects on their dataSource attribute.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dictionary of PhysicalEntity.
        uris as keys; PhysicalEntity objects as values
    :rtype: <dict <str>:<PhysicalEntity>>
    """
    sparql = """
        SELECT DISTINCT ?entity ?name ?synonym ?location ?type ?component ?member ?entityRef
    """
    # One FROM clause per queried graph
    sparql += "".join("FROM <" + uri + ">\n" for uri in graph_uris)
    sparql += """
        WHERE
        {
            ?entity rdf:type ?type.
    """
    if provenance_uri:
        sparql += "?entity biopax3:dataSource <" + provenance_uri + "> ."
    sparql += """
            ?type rdfs:subClassOf* biopax3:PhysicalEntity.
            OPTIONAL { ?entity biopax3:displayName ?name . }
            OPTIONAL { ?entity biopax3:name ?synonym . }
            OPTIONAL { ?entity biopax3:cellularLocation ?location . }
            OPTIONAL { ?entity biopax3:component ?component . }
            OPTIONAL { ?entity biopax3:memberPhysicalEntity ?member . }
            OPTIONAL { ?entity biopax3:entityReference ?entityRef . }
        }
    """

    rows = sparql_wrapper.order_results(
        sparql, orderby="?entity", limit=cm.SPARQL_LIMIT
    )

    entities = {}
    for (
        uri,
        display_name,
        synonym,
        location_uri,
        entity_type,
        component_uri,
        member_uri,
        entity_ref,
    ) in rows:

        # Entity creation if not already met
        entity = entities.get(uri)
        if entity is None:
            entity = PhysicalEntity(
                uri, display_name, location_uri, entity_type, entity_ref
            )
            entities[uri] = entity

        if synonym is not None:
            entity.synonyms.add(synonym)

        if component_uri is not None:
            # TODO: consider storing a set of PhysicalEntity objects instead
            # of a set of URIs. Beware: a component created here would be
            # initialised with the attributes of the current (parent) row.
            entity.components_uris.add(component_uri)

        if member_uri is not None:
            entity.members.add(member_uri)

    return entities


def get_biopax_modificationfeatures(graph_uris, provenance_uri):
    """Get ModificationFeatures that occur on PhysicalEntities, grouped
    by entity, modification type and number of modifications per type.

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        Used to filter objects on their dataSource attribute.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :returns: A dict of dicts (not Counters)! Entities URIs as keys;
        each value is a dict with the modification terms as keys and
        their number as values.
    :rtype: <dict <dict>>
    """

    modificationFeatures = defaultdict(Counter)

    query = """
        SELECT ?entity ?term COUNT(?term) AS ?terms_number
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
          ?entity rdf:type ?type .
    """
    if provenance_uri:
        query += "?entity biopax3:dataSource <" + provenance_uri + "> ."
    query += """
          ?type rdfs:subClassOf+ biopax3:PhysicalEntity .
          ?entity biopax3:feature ?feature .
          ?feature rdf:type biopax3:ModificationFeature .
          ?feature biopax3:modificationType ?modif_voc .
          ?modif_voc biopax3:term ?term .
        }
        GROUP BY ?entity ?term
    """

    for entity_uri, term, terms_number in sparql_wrapper.order_results(
        query, orderby="?entity", limit=cm.SPARQL_LIMIT
    ):

        # TODO: unit test on the count of modifications please per model
        # Counts of a same (entity, term) couple are summed
        modificationFeatures[entity_uri] += Counter({term: int(terms_number)})

    # Transtype the Counters to standard dict
    # (items() works on Python 2 and 3, iteritems() on Python 2 only)
    return {
        entity_uri: dict(modif)
        for entity_uri, modif in modificationFeatures.items()
    }


def get_biopax_locations(graph_uris):
    """Get Location objects in the given graphs

    Locations are the objects referenced by the cellularLocation property.
    A Location is built from the first result row met for its URI; the
    xrefs (database name, stripped id) found on its rows are added to it.

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :type graph_uris: <list>
    :return: Dictionary of locations.
        uris as keys; Location objects as values
    :rtype: <dict <str>:<Location>>
    """
    sparql = """
        SELECT DISTINCT ?location ?locationTerm ?dbname ?id_ref
    """
    # One FROM clause per queried graph
    sparql += "".join("FROM <" + uri + ">\n" for uri in graph_uris)
    sparql += """
        WHERE
        {
            ?entity biopax3:cellularLocation ?location .
            OPTIONAL { ?location biopax3:term ?locationTerm . }
            OPTIONAL {
                ?location biopax3:xref ?ref .
                ?ref biopax3:db ?dbname .
                ?ref biopax3:id ?id_ref .
            }
        }
    """

    rows = sparql_wrapper.order_results(
        sparql, orderby="?location", limit=cm.SPARQL_LIMIT
    )

    locations = {}
    for location_uri, term, db_name, xref_id in rows:

        # Location creation if not already met
        if location_uri not in locations:
            locations[location_uri] = Location(location_uri, term)

        # Rows without xref only declare the location
        if xref_id is None:
            continue
        locations[location_uri].add_xref(db_name, xref_id.strip())

    return locations


def get_biopax_controls(graph_uris, provenance_uri):
    """Get objects of the Control class and its subclasses in the given graphs

    .. note:: controlType is in (ACTIVATION, INHIBITION)
        Please note that Only Catalysis is allowed to have a default
        (not specified) controlType. Because of this, this attribute is optional.

        In the near future, if you try to create a Modulation or any other
        class with a controlType which is None, this object will not be considered.

        See: :class:`biopax2cadbiom.classes.Control`.

    .. note:: A Control is built from the first result row met for its URI;
        evidences and controllers are accumulated over all its rows.

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        Used to filter objects on their dataSource attribute.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dictionary of controls.
        uris as keys; Control objects as values
    :rtype: <dict <str>:<Control>>
    """
    dictControl = {}
    query = """
        SELECT DISTINCT ?control ?interactionType ?controlType ?reaction_uri ?controller ?evidence
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
            ?control rdf:type ?interactionType.
    """
    if provenance_uri:
        query += "?control biopax3:dataSource <" + provenance_uri + "> ."
    # The subclass constraint must apply to the variable bound by rdf:type
    # (?interactionType); on an unrelated variable it would not filter
    # the type of the control at all.
    query += """
            ?interactionType rdfs:subClassOf* biopax3:Control .
            ?control biopax3:controlled ?reaction_uri .
            ?control biopax3:controller ?controller .
            OPTIONAL { ?control biopax3:controlType ?controlType . }
            OPTIONAL { ?control biopax3:evidence ?evidence . }
        }
    """

    for (
        control_uri,
        interactionType,
        controlType,
        reaction_uri,
        controller,
        evidence,
    ) in sparql_wrapper.order_results(query, orderby="?control", limit=cm.SPARQL_LIMIT):

        # Control creation if not already met
        control = dictControl.get(control_uri)
        if control is None:
            control = Control(
                control_uri, interactionType, controlType, reaction_uri, controller
            )
            dictControl[control_uri] = control

        if evidence is not None:
            control.evidences.add(evidence)
        if controller is not None:
            control.controllers.add(controller)

    return dictControl


def get_biopax_xrefs(graph_uris, provenance_uri, database_name=None):
    """Get xrefs of all entities in the given database (if specified)

    - An Xref is a reference from an instance of a class in the current ontology
        to an object in external resource.

    - An xref can be an instance of PublicationXref, RelationshipXref,
        UnificationXref.

    .. warning:: WE DO NOT filter the references according to the relation
        of identity or similarity that they define.
        i.e, UnificationXref relationships have the same weight as
        RelationshipXref relationships, and the relationshipType attributes of
        RelationshipXref objects are not used to show the degree of similarity
        between the current object and the object in the external database
        (see the note below).

    .. note:: Classes inherit xref from their members.

    .. note:: Each ontology can differently name their databases.
        Ex: 'UniProt' vs 'uniprot knowledgebase', 'ChEBI' vs 'chebi'

    .. note:: Some objects (RelationshipXref, ?)
        have relationshipType attributes pointing to
        RelationshipTypeVocabulary objects. These objects use the
        PSI Molecular Interaction ontology (MI).

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
    :param provenance_uri: URI of the queried subgraphs.
        Used to filter objects on their dataSource attribute.
    :param database_name: (optional) Name of an external database.
        When set, only the xrefs whose db attribute is exactly this name
        are selected. The name is inserted as is in the query, between
        single quotes.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :type database_name: <str>
    :return: Dictionary of entityRefs.
        keys: uris; values: dict of databases
        keys: database names; values: ids (stripped)
    :rtype: <dict <str>: <dict <str>: <list>>>
    """

    query = """
        SELECT DISTINCT ?entity_uri ?dbname ?id_ref
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
            ?entity_uri rdf:type ?type.
        """
    if provenance_uri:
        query += "?entity_uri biopax3:dataSource <" + provenance_uri + "> ."
    query += """
            ?type rdfs:subClassOf* biopax3:PhysicalEntity.
            {
                {
                    # Simple entities
                    ?entity_uri biopax3:entityReference ?entityRef .
                    ?entityRef biopax3:xref ?ref .
                }
                UNION
                {
                    # Classes
                    ?entity_uri biopax3:entityReference ?entityRef .
                    ?entityRef biopax3:memberEntityReference ?memberEntityRef .
                    ?memberEntityRef biopax3:xref ?ref .
                }
                UNION
                # Entities with xref
                { ?entity_uri biopax3:xref ?ref .}
            }
            """
    if database_name:
        # Keep only the references to the requested database
        query += (
            """
            ?ref biopax3:db '"""
            + database_name
            + """'^^XMLSchema:string .
            """
        )

    query += """
        ?ref biopax3:db ?dbname .
        ?ref biopax3:id ?id_ref .
    }"""

    entities_id_refs = defaultdict(lambda: defaultdict(list))
    for entity_uri, dbname, id_ref in sparql_wrapper.sparql_query(query):
        entities_id_refs[entity_uri][dbname].append(id_ref.strip())

    # Cast defaultdicts to dicts
    # (items() works on Python 2 and 3, iteritems() on Python 2 only)
    return {
        entity_uri: dict(db_refs)
        for entity_uri, db_refs in entities_id_refs.items()
    }

## Triplestore metadata ########################################################

def get_subgraphs_from_triplestore(graph_uris):
    """Get URIs of BioPAX graphs in the configured triplestore

    The graphs returned are the ones that contain Provenance objects.

    .. note:: We assume that graphs are in full BioPAX format, i.e that
        dataSource attribute is set on entities. That's the only way to extract
        a database from another in a merged graph (Cf PathwayCommons).

    .. note:: In practice, name is more precise than displayName.

    .. note:: SPARQL query:

        .. code:: sparql

            PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#>
            SELECT ?graph ?provenance ?name ?dname ?comment
            WHERE {
                GRAPH ?graph {
                    ?provenance a bp:Provenance.
                    OPTIONAL {
                        ?provenance bp:standardName ?name.
                    }
                    OPTIONAL {
                        ?provenance bp:displayName ?dname.
                    }
                    OPTIONAL {
                        ?provenance bp:comment ?comment.
                    }
                }
            }
            ORDER BY ?graph ?name

    :param graph_uris: List of RDF graphs that will be queried on the triplestore.
        If empty or None, no FROM clause is added to the query.
    :type graph_uris: <list>
    :return: Iterable of tuples.
        (graph_uri, provenance_uri, name, display_name, comment)

        .. note:: If you get an encoding error in name or comment, please
            put 'from __future__ import unicode_literals' at the beginning of
            your Python script.
    :rtype: <generator>
    """
    sparql = """
        SELECT DISTINCT ?graph ?provenance ?name ?dname ?comment
    """
    if graph_uris:
        # One FROM clause per queried graph
        sparql += "".join("FROM <" + uri + ">\n" for uri in graph_uris)
    sparql += """
        WHERE {
            GRAPH ?graph {
                ?provenance a biopax3:Provenance.
                OPTIONAL {
                    ?provenance biopax3:standardName ?name.
                }
                OPTIONAL {
                    ?provenance biopax3:displayName ?dname.
                }
                OPTIONAL {
                    ?provenance biopax3:comment ?comment.
                }
            }
        }
    """

    rows = sparql_wrapper.order_results(
        sparql, orderby="?graph ?name", limit=cm.SPARQL_LIMIT
    )
    return rows


def get_graphs_from_triplestore():
    """List the URIs of the graphs of the triplestore

    .. note:: The queried graphs are named graphs (any graph that contains
        at least one triple).

    :return: Iterable of tuples (1 graph URI per tuple)
    :rtype: <generator>
    """
    sparql = """
        SELECT DISTINCT ?graph {
            GRAPH ?graph {?s ?p ?o}
        }
    """
    rows = sparql_wrapper.order_results(
        sparql, orderby="?graph", limit=cm.SPARQL_LIMIT
    )
    return rows


def get_info_from_triplestore(graph_uris=None):
    """List graphs and subgraphs from the triplestore and retrieve some metadata

    BioPAX graphs (graphs with Provenance objects) are yielded first with
    their metadata, then the remaining graphs of the triplestore are yielded
    with an empty provenance and no metadata.

    :param graph_uris: List of graphs uris (optional).
        If empty or None, all the graphs of the triplestore are listed;
        otherwise only the given graphs are reported.
    :type graph_uris: <list>
    :return: Generator of tuples:
        (graph_uri, provenance_uri, name, dname, comment)
    :rtype: <generator>
    """
    # Keep the requested graphs apart from the graphs met during the listing:
    # reusing the same name for both would discard the filter of the caller.
    requested_uris = list(graph_uris) if graph_uris else []
    requested_set = set(requested_uris)

    # Get BioPAX graphs and metadata
    seen_uris = set()
    for graph_uri, provenance_uri, name, dname, comment in get_subgraphs_from_triplestore(
        requested_uris
    ):
        yield (graph_uri, provenance_uri, name, dname, comment)
        seen_uris.add(graph_uri)

    # Get simple graphs not discovered without pure BioPAX (dataSource attr)
    for row in get_graphs_from_triplestore():
        graph_uri = row[0]
        if graph_uri in seen_uris:
            continue
        if requested_set and graph_uri not in requested_set:
            # Graph out of the scope requested by the caller
            continue
        yield (graph_uri, "", None, None, None)