Source code for biopax2cadbiom.sparql_biopaxQueries

# -*- coding: utf-8 -*-
# MIT License
#
# Copyright (c) 2017 IRISA, Jean Coquet, Pierre Vignet, Mateo Boudet
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Contributor(s): Jean Coquet, Pierre Vignet, Mateo Boudet

"""
This module contains a list of functions to query any SPARQL endpoint with
BioPAX data.
"""
from __future__ import unicode_literals

# Standard imports
from collections import defaultdict, Counter

# Custom imports
from biopax2cadbiom import sparql_wrapper
from biopax2cadbiom.classes import Control, Location, PhysicalEntity, Reaction
from biopax2cadbiom.tools import parse_uri
import biopax2cadbiom.commons as cm

# Module-level logger, obtained from the project's commons module
LOGGER = cm.logger()

## Triplestore querying ########################################################

def get_biopax_pathways(graph_uris, provenance_uri):
    """Retrieve the pathways declared in the given graphs.

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        only pathways whose dataSource attribute is this URI are kept.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dict of pathways.
        keys: URIs; values: display names (or the URI itself if the pathway
        has no displayName)
    :rtype: <dict>
    """
    query_parts = [
        """
        SELECT DISTINCT ?pathway ?displayName
    """
    ]
    query_parts.extend("FROM <" + uri + ">\n" for uri in graph_uris)
    query_parts.append(
        """
        WHERE
        {
            ?pathway rdf:type biopax3:Pathway .
    """
    )
    if provenance_uri:
        query_parts.append(
            "?pathway biopax3:dataSource <" + provenance_uri + "> ."
        )
    query_parts.append(
        """
            OPTIONAL { ?pathway biopax3:displayName ?displayName . }
        }
    """
    )

    rows = sparql_wrapper.order_results(
        "".join(query_parts), orderby="?pathway", limit=cm.SPARQL_LIMIT
    )
    # A pathway without displayName is named after its own URI
    return {
        pathway: (pathway if name is None else name) for pathway, name in rows
    }
def get_biopax_parent_pathways(graph_uris, provenance_uri):
    """Collect, for each pathway of the given graphs, its direct parents.

    A pathway is a direct parent of another one when it references it through
    the pathwayComponent property.

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        both the pathway and its parent must have this URI as dataSource.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dict of parent pathways.
        Pathways as keys; sets of parent pathways as values.
        Pathways without any parent are absent from the keys.
    :rtype: <defaultdict <str>:<set>>
    """
    query_parts = [
        """
        SELECT DISTINCT ?pathway ?superPathway
    """
    ]
    query_parts.extend("FROM <" + uri + ">\n" for uri in graph_uris)
    query_parts.append(
        """
        WHERE
        {
            ?superPathway rdf:type biopax3:Pathway .
    """
    )
    if provenance_uri:
        query_parts.append(
            "?superPathway biopax3:dataSource <{0}> .\n"
            "?pathway biopax3:dataSource <{0}> .".format(provenance_uri)
        )
    query_parts.append(
        """
            ?superPathway biopax3:pathwayComponent ?pathway .
            ?pathway rdf:type biopax3:Pathway .
            ?pathway biopax3:pathwayComponent* ?subPathway .
            ?subPathway rdf:type biopax3:Pathway .
        }
    """
    )

    parents_by_pathway = defaultdict(set)
    rows = sparql_wrapper.order_results(
        "".join(query_parts), orderby="?pathway", limit=cm.SPARQL_LIMIT
    )
    for child, parent in rows:
        parents_by_pathway[child].add(parent)
    return parents_by_pathway
def get_biopax_reactions(graph_uris, provenance_uri):
    """Query all Interaction objects of the given graphs, except Controls.

    .. warning:: Querying 'rdfs:subClassOf* biopax3:Interaction' also matches
        Control objects, but those must be handled by get_biopax_controls().
        Controls are therefore removed from the results via a MINUS clause.

    .. note:: Control class contains (Catalysis, TemplateReactionRegulation,
        ...)

    .. note:: We correct the BioPAX hierarchy generated by some tools like
        BiNOM. This tool declares the entire chain of parent classes for each
        BioPAX object instead of letting users rely on an RDFS reasoner and
        the rdfs:subClassOf property. As a result, objects are returned as
        many times as they have declared classes.
        Control derivatives are already removed from the Interaction objects.
        However, a bare Interaction is far too generic to be interpreted by
        the program, so the objects created here must carry the most accurate
        interactionType possible: when a reaction is met again with a
        different type, the new type replaces the stored one only if it is
        ranked later in the local class ranking.
        In practice Virtuoso returns the most accurate rdf:type first, then
        the parent classes (Ex: BiochemicalReaction then Conversion for an
        object declaring both). In theory, nothing seems to guarantee that
        this happens all the time.

    .. note:: conversionDirection and catalysisDirection are respectively for
        Conversion and Catalysis subclasses.
        Do not forget that Catalysis direction overrides Conversion direction.
        Currently we assume that Conversions *are* LEFT_TO_RIGHT (although
        this is not recommended in the standard); any other direction only
        triggers a warning.
        Order of priority for directions:
        catalysisDirection > conversionDirection > spontaneous
        > thermodynamic constants and FBA analysis

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        only reactions whose dataSource attribute is this URI are kept.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dictionary of reactions.
        uris as keys; Reaction objects as values
    :rtype: <dict <str>:<Reaction>>
    """
    query_parts = [
        """
        SELECT DISTINCT ?reaction ?nameReaction ?interactionType
            ?conversionDirection ?pathway ?leftComponent ?rightComponent
            ?productComponent ?participantComponent
    """
    ]
    query_parts.extend("FROM <" + uri + ">\n" for uri in graph_uris)
    query_parts.append(
        """
        WHERE
        {
            ?reaction rdf:type ?interactionType .
    """
    )
    if provenance_uri:
        query_parts.append(
            "?reaction biopax3:dataSource <" + provenance_uri + "> ."
        )
    query_parts.append(
        """
            ?interactionType rdfs:subClassOf* biopax3:Interaction .
            OPTIONAL { ?reaction biopax3:displayName ?nameReaction . }
            OPTIONAL { ?pathway biopax3:pathwayComponent ?reaction . }
            OPTIONAL {
                ?reaction biopax3:left ?leftComponent .
                FILTER NOT EXISTS { ?leftComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {
                ?reaction biopax3:right ?rightComponent .
                FILTER NOT EXISTS { ?rightComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {
                ?reaction biopax3:product ?productComponent .
                FILTER NOT EXISTS { ?productComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {
                ?reaction biopax3:participant ?participantComponent .
                FILTER NOT EXISTS { ?participantComponent rdf:type biopax3:Pathway }
            }
            OPTIONAL {?reaction biopax3:conversionDirection ?conversionDirection }
            OPTIONAL {?reaction biopax3:catalysisDirection ?conversionDirection }
            MINUS { ?interactionType rdfs:subClassOf* biopax3:Control }
        }
    """
    )

    # Ranking used to decide whether a proposed interactionType is more
    # accurate than the one already stored: the later in the list, the more
    # accurate.
    # Not listed on purpose:
    # - "Entity": not matched by 'subClassOf* Interaction';
    # - "Control", "Catalysis", "TemplateReactionRegulation", "Modulation":
    #   removed by 'MINUS subClassOf* Control'.
    class_ranking = [
        "Interaction",
        "TemplateReaction",
        "GeneticInteraction",
        "MolecularInteraction",
        "Conversion",
        "ComplexAssembly",
        "Degradation",
        "TransportWithBiochemicalReaction",
        "BiochemicalReaction",
        "Transport",
    ]
    supported_directions = ("LEFT_TO_RIGHT", "LEFT-TO-RIGHT")

    reactions = {}
    rows = sparql_wrapper.order_results(
        "".join(query_parts), orderby="?reaction", limit=cm.SPARQL_LIMIT
    )
    for row in rows:
        (
            reaction_uri,
            reaction_name,
            type_uri,
            direction,
            pathway,
            left_component,
            right_component,
            product_component,
            participant_component,
        ) = row

        reaction = reactions.get(reaction_uri)
        if reaction is None:
            # First row for this reaction: create it. Name, type, product and
            # participant are those of this first row.
            if not (direction is None or direction in supported_directions):
                LOGGER.warning(
                    "Reaction <%s>:: conversionDirection <%s> not currently supported. "
                    "We will assume <LEFT_TO_RIGHT> instead.",
                    reaction_uri,
                    direction,
                )
            reaction = Reaction(
                reaction_uri,
                reaction_name,
                type_uri,
                product_component,
                participant_component,
            )
            reactions[reaction_uri] = reaction
        else:
            # Reaction already met: keep the most accurate interactionType
            # (see docstring)
            proposed_type = parse_uri(type_uri)
            if proposed_type != reaction.interactionType:
                proposed_rank = class_ranking.index(proposed_type)
                if proposed_rank > class_ranking.index(reaction.interactionType):
                    reaction.interactionType = type_uri

        # Accumulate the multi-valued attributes of the current row
        if pathway is not None:
            reaction.pathways.add(pathway)
        if left_component is not None:
            reaction.leftComponents.add(left_component)
        if right_component is not None:
            reaction.rightComponents.add(right_component)

    return reactions
def get_biopax_physicalentities(graph_uris, provenance_uri):
    """Get objects of the PhysicalEntity class (and subclasses) in the graphs.

    .. note:: From the BioPAX documentation, about the use of
        memberPhysicalEntity:

        Using this property is not recommended. memberPhysicalEntity is only
        defined to support legacy data in certain databases. It is used to
        define a generic physical entity that is a collection of other
        physical entities.

        In general, EntityReference class should be used to create generic
        groups of physical entities, however, there are some cases where this
        is not possible, and the property has to be used. For instance, when
        an entity reference is used to define a generic physical entity with
        generic features, the generic features of the same type must be
        grouped. If you do not have grouping information for features of
        generic physical entities, you cannot use entity reference to define
        generic physical entities and must use the memberPhysicalEntity
        property. Another example for using this property is to create generic
        complexes, which are currently not supported with the EntityReference
        scheme (there is no "ComplexReference" class).

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        only entities whose dataSource attribute is this URI are kept.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dictionary of physical entities.
        uris as keys; PhysicalEntity objects as values
    :rtype: <dict <str>:<PhysicalEntity>>
    """
    query_parts = [
        """
        SELECT DISTINCT ?entity ?name ?synonym ?location ?type
            ?component ?member ?entityRef
    """
    ]
    query_parts.extend("FROM <" + uri + ">\n" for uri in graph_uris)
    query_parts.append(
        """
        WHERE
        {
            ?entity rdf:type ?type.
    """
    )
    if provenance_uri:
        query_parts.append(
            "?entity biopax3:dataSource <" + provenance_uri + "> ."
        )
    query_parts.append(
        """
            ?type rdfs:subClassOf* biopax3:PhysicalEntity.
            OPTIONAL { ?entity biopax3:displayName ?name . }
            OPTIONAL { ?entity biopax3:name ?synonym . }
            OPTIONAL { ?entity biopax3:cellularLocation ?location . }
            OPTIONAL { ?entity biopax3:component ?component . }
            OPTIONAL { ?entity biopax3:memberPhysicalEntity ?member . }
            OPTIONAL { ?entity biopax3:entityReference ?entityRef . }
        }
    """
    )

    entities = {}
    rows = sparql_wrapper.order_results(
        "".join(query_parts), orderby="?entity", limit=cm.SPARQL_LIMIT
    )
    for row in rows:
        (
            entity_uri,
            display_name,
            synonym,
            location_uri,
            entity_type,
            component_uri,
            member,
            entity_ref,
        ) = row

        # The entity is built from the first row in which it is met
        if entity_uri not in entities:
            entities[entity_uri] = PhysicalEntity(
                entity_uri, display_name, location_uri, entity_type, entity_ref
            )
        entity = entities[entity_uri]

        if synonym is not None:
            entity.synonyms.add(synonym)
        if component_uri is not None:
            # TODO: think about storing a set of PhysicalEntity objects
            # instead of a set of URIs. Beware: a component created here from
            # the current row would in effect inherit the attributes (name,
            # location, type...) of its parent.
            entity.components_uris.add(component_uri)
        if member is not None:
            entity.members.add(member)

    return entities
def get_biopax_modificationfeatures(graph_uris, provenance_uri):
    """Get ModificationFeatures that occur on PhysicalEntities, grouped by
    entity, modification type and number of modifications per type.

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        only entities whose dataSource attribute is this URI are kept.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :returns: A dict of dicts (not Counters)!
        keys: entity URIs; values: dicts with the modification terms as keys
        and their number as values.
    :rtype: <dict <str>:<dict <str>:<int>>>
    """
    modificationFeatures = defaultdict(Counter)
    query = """
        SELECT ?entity ?term COUNT(?term) AS ?terms_number
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
            ?entity rdf:type ?type .
    """
    if provenance_uri:
        query += "?entity biopax3:dataSource <" + provenance_uri + "> ."
    query += """
            ?type rdfs:subClassOf+ biopax3:PhysicalEntity .
            ?entity biopax3:feature ?feature .
            ?feature rdf:type biopax3:ModificationFeature .
            ?feature biopax3:modificationType ?modif_voc .
            ?modif_voc biopax3:term ?term .
        }
        GROUP BY ?entity ?term
    """

    for entity_uri, term, terms_number in sparql_wrapper.order_results(
        query, orderby="?entity", limit=cm.SPARQL_LIMIT
    ):
        # TODO: unit test on the count of modifications please per model
        # The count comes back from the endpoint as text: cast it before
        # accumulating it in the entity's Counter.
        modificationFeatures[entity_uri] += Counter({term: int(terms_number)})

    # Transtype the Counters to standard dicts.
    # items() is used instead of the Python 2-only iteritems() so that the
    # function also runs on Python 3.
    return {
        entity_uri: dict(modif)
        for entity_uri, modif in modificationFeatures.items()
    }
def get_biopax_locations(graph_uris):
    """Get the Location objects referenced in the given graphs.

    A location is any object referenced through the cellularLocation
    property. Its xrefs (database name + identifier), when present, are
    attached to the Location object.

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :type graph_uris: <list>
    :return: Dictionary of locations.
        uris as keys; Location objects as values
    :rtype: <dict <str>:<Location>>
    """
    query_parts = [
        """
        SELECT DISTINCT ?location ?locationTerm ?dbname ?id_ref
    """
    ]
    query_parts.extend("FROM <" + uri + ">\n" for uri in graph_uris)
    query_parts.append(
        """
        WHERE
        {
            ?entity biopax3:cellularLocation ?location .
            OPTIONAL { ?location biopax3:term ?locationTerm . }
            OPTIONAL {
                ?location biopax3:xref ?ref .
                ?ref biopax3:db ?dbname .
                ?ref biopax3:id ?id_ref .
            }
        }
    """
    )

    locations = {}
    rows = sparql_wrapper.order_results(
        "".join(query_parts), orderby="?location", limit=cm.SPARQL_LIMIT
    )
    for location_uri, term, db_name, ref_id in rows:
        known_location = locations.get(location_uri)
        if known_location is None:
            # First row for this location: its term is taken from this row
            known_location = Location(location_uri, term)
            locations[location_uri] = known_location

        if ref_id is not None:
            # Add xref
            known_location.add_xref(db_name, ref_id.strip())

    return locations
def get_biopax_controls(graph_uris, provenance_uri):
    """Get objects of the Control class and its subclasses in the given graphs

    .. note:: controlType is in (ACTIVATION, INHIBITION)
        Please note that only Catalysis is allowed to have a default
        (not specified) controlType. Because of this, this attribute is
        optional in the query.

        In the near future, if you try to create a Modulation or any other
        class with a controlType which is None, this object will not be
        considered.
        See: :class:`biopax2cadbiom.classes.Control`.

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        only controls whose dataSource attribute is this URI are kept.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :return: Dictionary of controls.
        uris as keys; Control objects as values
    :rtype: <dict <str>:<Control>>
    """
    dictControl = {}
    query = """
        SELECT DISTINCT ?control ?interactionType ?controlType ?reaction_uri
            ?controller ?evidence
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
            ?control rdf:type ?interactionType.
    """
    if provenance_uri:
        query += "?control biopax3:dataSource <" + provenance_uri + "> ."
    # The subclass constraint must bear on ?interactionType (the rdf:type of
    # the control). It used to bear on an unrelated ?type variable, which left
    # the selected type unconstrained.
    query += """
            ?interactionType rdfs:subClassOf* biopax3:Control .
            ?control biopax3:controlled ?reaction_uri .
            ?control biopax3:controller ?controller .
            OPTIONAL { ?control biopax3:controlType ?controlType . }
            OPTIONAL { ?control biopax3:evidence ?evidence . }
        }
    """

    for (
        control_uri,
        interactionType,
        controlType,
        reaction_uri,
        controller,
        evidence,
    ) in sparql_wrapper.order_results(
        query, orderby="?control", limit=cm.SPARQL_LIMIT
    ):
        # Control creation if not already met; it is built from the first row
        # in which its URI appears.
        control = dictControl.get(control_uri)
        if control is None:
            control = Control(
                control_uri, interactionType, controlType, reaction_uri, controller
            )
            dictControl[control_uri] = control

        # Accumulate the multi-valued attributes of the current row
        if evidence is not None:
            control.evidences.add(evidence)
        if controller is not None:
            control.controllers.add(controller)

    return dictControl
def get_biopax_xrefs(graph_uris, provenance_uri, database_name=None):
    """Get xrefs of all entities in the given database (if specified)

    - An Xref is a reference from an instance of a class in the current
      ontology to an object in external resource.
    - An xref can be an instance of PublicationXref, RelationshipXref,
      UnificationXref.

    .. warning:: WE DO NOT filter the references according to the relation of
        identity or similarity that they define.
        i.e, UnificationXref relationships have the same weight as
        RelationshipXref relationships, and the relationshipType attributes
        of RelationshipXref objects are not used to show the degree of
        similarity between the current object and the object in the external
        database (see the note below).

    .. note:: Classes inherit xref from their members.

    .. note:: Each ontology can differently name their databases.
        Ex: 'UniProt' vs 'uniprot knowledgebase', 'ChEBI' vs 'chebi'

    .. note:: Some objects (RelationshipXref, ?) have relationshipType
        attributes pointing to RelationshipTypeVocabulary objects.
        These objects use the PSI Molecular Interaction ontology (MI).

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore (one FROM clause is emitted per graph).
    :param provenance_uri: URI of the queried subgraphs. When not empty,
        only entities whose dataSource attribute is this URI are kept.
    :param database_name: (optional) Name of an external database. When
        given, only the xrefs whose db attribute is exactly this string are
        returned.
    :type graph_uris: <list>
    :type provenance_uri: <str>
    :type database_name: <str> or None
    :return: Dictionary of entityRefs.
        keys: uris; values: dict of databases
        keys: database names; values: ids
    :rtype: <dict <str>: <dict <str>: <list>>>
    """
    query = """
        SELECT DISTINCT ?entity_uri ?dbname ?id_ref
    """
    for graph_uri in graph_uris:
        query += "FROM <" + graph_uri + ">\n"
    query += """
        WHERE
        {
            ?entity_uri rdf:type ?type.
    """
    if provenance_uri:
        query += "?entity_uri biopax3:dataSource <" + provenance_uri + "> ."
    # NB: the '#' comments below are SPARQL comments; each of them must stay
    # alone on its line because it runs until the end of the line.
    query += """
            ?type rdfs:subClassOf* biopax3:PhysicalEntity.
            {
                {
                    # Simple entities
                    ?entity_uri biopax3:entityReference ?entityRef .
                    ?entityRef biopax3:xref ?ref .
                }
                UNION
                {
                    # Classes
                    ?entity_uri biopax3:entityReference ?entityRef .
                    ?entityRef biopax3:memberEntityReference ?memberEntityRef .
                    ?memberEntityRef biopax3:xref ?ref .
                }
                UNION
                # Entities with xref
                { ?entity_uri biopax3:xref ?ref .}
            }
    """
    if database_name:
        # Restrict the references to the requested database
        query += (
            """
            ?ref biopax3:db '"""
            + database_name
            + """'^^XMLSchema:string .
        """
        )
    query += """
            ?ref biopax3:db ?dbname .
            ?ref biopax3:id ?id_ref .
        }"""

    entities_id_refs = defaultdict(lambda: defaultdict(list))
    for entity_uri, dbname, id_ref in sparql_wrapper.sparql_query(query):
        entities_id_refs[entity_uri][dbname].append(id_ref.strip())

    # Cast defaultdicts to dicts.
    # items() is used instead of the Python 2-only iteritems() so that the
    # function also runs on Python 3.
    return {
        entity_uri: dict(db_refs)
        for entity_uri, db_refs in entities_id_refs.items()
    }
## Triplestore metadata ########################################################
def get_subgraphs_from_triplestore(graph_uris):
    """Get URIs of BioPAX graphs in the configured triplestore

    .. note:: We assume that graphs are in full BioPAX format, i.e that
        dataSource attribute is set on entities.
        That's the only way to extract a database from another in a merged
        graph (Cf PathwayCommons).

    .. note:: In practice, name is more precise than displayName.

    .. note:: SPARQL query:

        .. code:: sparql

            PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#>

            SELECT ?graph ?provenance ?name ?dname ?comment
            WHERE {
                GRAPH ?graph {
                    ?provenance a bp:Provenance.
                    OPTIONAL { ?provenance bp:standardName ?name. }
                    OPTIONAL { ?provenance bp:displayName ?dname. }
                    OPTIONAL { ?provenance bp:comment ?comment. }
                }
            }
            ORDER BY ?graph ?name

    :param graph_uris: List of RDF graphs that will be queried on the
        triplestore. An empty (or None) value adds no FROM clause.
    :type graph_uris: <list>
    :return: Iterable of tuples.
        (graph_uri, provenance_uri, name, display_name, comment)

        .. note:: If you get an encoding error in name or comment,
            please put 'from __future__ import unicode_literals' at the
            beginning of your Python script.
    :rtype: <generator>
    """
    query_parts = [
        """
        SELECT DISTINCT ?graph ?provenance ?name ?dname ?comment
    """
    ]
    # A falsy graph_uris (None, empty list) yields no FROM clause at all
    for uri in graph_uris or ():
        query_parts.append("FROM <" + uri + ">\n")
    query_parts.append(
        """
        WHERE {
            GRAPH ?graph {
                ?provenance a biopax3:Provenance.
                OPTIONAL { ?provenance biopax3:standardName ?name. }
                OPTIONAL { ?provenance biopax3:displayName ?dname. }
                OPTIONAL { ?provenance biopax3:comment ?comment. }
            }
        }
    """
    )

    subgraphs = sparql_wrapper.order_results(
        "".join(query_parts), orderby="?graph ?name", limit=cm.SPARQL_LIMIT
    )
    return subgraphs
def get_graphs_from_triplestore():
    """List the URIs of the graphs stored in the triplestore.

    .. note:: Only named graphs are queried (GRAPH ?graph pattern).

    :return: Iterable of tuples (1 graph URI per tuple)
    :rtype: <generator>
    """
    named_graphs_query = """
        SELECT DISTINCT ?graph
        {
            GRAPH ?graph {?s ?p ?o}
        }
    """
    named_graphs = sparql_wrapper.order_results(
        named_graphs_query, orderby="?graph", limit=cm.SPARQL_LIMIT
    )
    return named_graphs
def get_info_from_triplestore(graph_uris=None):
    """List graphs and subgraphs from the triplestore and retrieve some metadata

    Graphs holding BioPAX Provenance objects are reported first, with their
    metadata. Every other named graph of the triplestore is then reported
    with empty metadata.

    .. note:: graph_uris only restricts the first (BioPAX) pass; the second
        pass lists all the named graphs of the triplestore that were not
        reported by the first one.

    :param graph_uris: List of graphs uris (optional). None or an empty list
        means no restriction.
    :type graph_uris: <list> or None
    :return: Generator of tuples:
        (graph_uri, provenance_uri, name, dname, comment)
        For graphs without BioPAX metadata: (graph_uri, "", None, None, None)
    :rtype: <generator>
    """
    # URIs of the graphs already reported by the BioPAX pass.
    # This set must stay distinct from the graph_uris parameter: reusing the
    # parameter name here used to discard the caller's selection.
    biopax_graph_uris = set()

    # Get BioPAX graphs and metadata
    for graph_uri, provenance_uri, name, dname, comment in (
        get_subgraphs_from_triplestore(graph_uris or [])
    ):
        yield (graph_uri, provenance_uri, name, dname, comment)
        biopax_graph_uris.add(graph_uri)

    # Get simple graphs not discovered without pure BioPAX (dataSource attr);
    # each result is a 1-tuple holding the graph URI.
    for graph in get_graphs_from_triplestore():
        if graph[0] not in biopax_graph_uris:
            yield (graph[0], "", None, None, None)