Source code for pathme.kegg.convert_to_bel

# -*- coding: utf-8 -*-

"""This module contains the methods to convert a KEGG RDF network into a BELGraph."""

import logging
from collections import defaultdict
from itertools import product

import tqdm
from pybel import BELGraph, to_pickle
from pybel.dsl.edges import activity
from pybel.dsl.node_classes import CentralDogma
from pybel.dsl.nodes import bioprocess, composite_abundance, pmod, reaction
from pybel.struct import add_annotation_value
from pybel.struct.summary import count_functions, edge_summary

from pathme.constants import *
from pathme.export_utils import add_annotation_key
from pathme.kegg.kegg_xml_parser import (
    get_all_reactions, get_all_relationships, get_complex_components, get_entity_nodes, get_reaction_pathway_edges,
from pathme.utils import add_bel_metadata

__all__ = [

log = logging.getLogger(__name__)

"""Populate empty BEL graph with KEGG pathway entities and interactions"""

def kegg_to_bel(path, hgnc_manager, chebi_manager, flatten=False):
    """Convert KGML file to a BELGraph.

    :param str path: path to KGML file
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI manager
    :param bool flatten: flat nodes
    :rtype: BELGraph
    """
    # Load xml
    xml_tree = import_xml_etree(path)
    root = xml_tree.getroot()

    graph = BELGraph(
        name=root.attrib['title'],
        version='1.0.0',
        description=root.attrib['link'],
        authors="Daniel Domingo-Fernández, Josep Marín-Llaó and Sarah Mubeen",
        contact='',
    )
    add_bel_metadata(graph)
    graph.graph['pathway_id'] = root.attrib['name']

    # Parse file and get entities and interactions (orthologs are parsed but not converted)
    genes_dict, compounds_dict, maps_dict, _orthologs_dict = get_entity_nodes(xml_tree, hgnc_manager, chebi_manager)
    relations_list = get_all_relationships(xml_tree)

    # Get compounds and reactions
    substrates_dict, products_dict = get_all_reactions(xml_tree, compounds_dict)
    reactions_dict = get_reaction_pathway_edges(xml_tree, substrates_dict, products_dict)

    # Get complexes
    complex_ids, flattened_complexes = get_complex_components(xml_tree, genes_dict, flattened=flatten)

    # Add nodes to graph
    nodes = xml_entities_to_bel(graph, genes_dict, compounds_dict, maps_dict, flattened=flatten)
    nodes = xml_complexes_to_bel(
        graph=graph,
        node_dict=nodes,
        complex_ids=complex_ids,
        flatten_complexes=flattened_complexes if flatten else None,
    )

    # Add edges to graph
    add_edges(graph, relations_list, nodes)
    add_reaction_edges(graph, reactions_dict, nodes)

    graph.annotation_pattern['PathwayID'] = '.*'
    add_annotation_key(graph)
    add_annotation_value(graph, 'PathwayID', f'{root.attrib["org"]}{root.attrib["number"]}')

    return graph


"""Get all entities from XML tree and convert to BEL nodes"""


def _protein_from_attributes(attributes):
    """Build the protein node for one gene entry, preferring HGNC, then UniProt, then KEGG.

    The node is only constructed here; adding it to a graph is left to the caller.

    :param dict[str,str] attributes: attributes of a single gene entry
    :rtype: pybel.dsl.BaseEntity
    """
    if HGNC in attributes:
        return protein(namespace=HGNC, name=attributes[HGNC_SYMBOL], identifier=attributes[HGNC])
    if UNIPROT in attributes:
        return protein(namespace=UNIPROT.upper(), name=attributes[UNIPROT], identifier=attributes[UNIPROT])
    return protein(namespace=KEGG.upper(), name=attributes[KEGG_ID], identifier=attributes[KEGG_ID])


def _abundance_from_attributes(attributes):
    """Build the abundance node for one compound entry, preferring ChEBI, then PubChem, then KEGG.

    The node is only constructed here; adding it to a graph is left to the caller.

    :param dict[str,str] attributes: attributes of a single compound entry
    :rtype: pybel.dsl.BaseEntity
    """
    if CHEBI in attributes:
        return abundance(namespace=CHEBI.upper(), name=attributes[CHEBI_NAME], identifier=attributes[CHEBI])
    if PUBCHEM in attributes:
        return abundance(namespace=PUBCHEM.upper(), name=attributes[PUBCHEM], identifier=attributes[PUBCHEM])
    return abundance(namespace=KEGG.upper(), name=attributes[KEGG_ID], identifier=attributes[KEGG_ID])


def _add_to_graph(graph, bel_node):
    """Add ``bel_node`` to ``graph`` and hand the node back to the caller."""
    graph.add_node_from_data(bel_node)
    return bel_node


def xml_entities_to_bel(graph, genes_dict, compounds_dict, maps_dict, flattened=False):
    """Convert gene and compound entities in XML to BEL nodes.

    :param pybel.BELGraph graph: BEL Graph
    :param dict[str,str] genes_dict: KEGG genes (entry_id: [kegg_id, HGNC, UniProt])
    :param dict[str,str] compounds_dict: KEGG compounds (entry_id: [compound_name, ChEBI])
    :param dict[str,str] maps_dict: KEGG pathway maps (entry_id: [kegg_id, map_name])
    :param bool flattened: True to flatten to list of similar genes grouped together
    :return: KEGG entities to BEL nodes
    :rtype: dict[str,pybel.dsl.BaseEntity]
    """
    # Flattened mode keeps grouped entities as lists; otherwise they become composite nodes
    if flattened:
        gene_converter, compound_converter = flatten_gene_to_bel_node, flatten_compound_to_bel_node
    else:
        gene_converter, compound_converter = gene_to_bel_node, compound_to_bel

    node_dict = {
        node_id: gene_converter(graph, node_att)
        for node_id, node_att in genes_dict.items()
    }

    for node_id, node_att in compounds_dict.items():
        node_dict[node_id] = compound_converter(graph, node_att)

    for node_id, node_att in maps_dict.items():
        node_dict[node_id] = map_to_bel_node(graph, node_att)

    return node_dict


def xml_complexes_to_bel(graph, node_dict, complex_ids, flatten_complexes=None):
    """Convert complexes in XML to BEL nodes where each complex is made up of proteins and/or composites.

    ``node_dict`` is updated in place and also returned.

    :param pybel.BELGraph graph: BEL Graph
    :param dict[str,pybel.dsl.BaseEntity] node_dict: kegg_id to BEL node dictionary
    :param dict[str,list] complex_ids: complex IDs to corresponding component IDs
    :param Optional[dict[str,list]] flatten_complexes: complex IDs and flattened list of all components
    :return: kegg_ids to BEL nodes
    :rtype: dict[str,pybel.dsl.BaseEntity]
    :raises KeyError: if a complex component is not present in ``node_dict``
    """
    if flatten_complexes is not None:
        for node_id, node_att in flatten_complexes.items():
            node_dict[node_id] = flatten_complex_to_bel_node(graph, node_att)
        return node_dict

    # Resolve every member first, so that complexes are only built from pre-existing nodes.
    # Complexes without any member are skipped.
    members_by_complex = {
        complex_id: [node_dict[member] for member in member_ids]
        for complex_id, member_ids in complex_ids.items()
        if member_ids
    }

    for complex_id, bel_members in members_by_complex.items():
        node_dict[complex_id] = complexes_to_bel_node(graph, bel_members)

    return node_dict


def complexes_to_bel_node(graph, members):
    """Create a complex abundance BEL node from its members and add it to the BEL Graph.

    :param pybel.BELGraph graph: BEL Graph
    :param list[pybel.dsl.BaseEntity] members: BEL nodes that make up the complex
    :return: corresponding BEL node
    :rtype: pybel.dsl.BaseEntity
    """
    return _add_to_graph(graph, complex_abundance(members=members))


def gene_to_bel_node(graph, node):
    """Create a protein or protein composite BEL node and add to BEL Graph.

    :param pybel.BELGraph graph: BEL Graph
    :param list[dict[str,str]] node: dictionary of node attributes
    :return: corresponding BEL node
    :rtype: pybel.dsl.BaseEntity
    """
    # Create a protein BEL node
    if len(node) == 1:
        return _add_to_graph(graph, _protein_from_attributes(node[0]))

    # Create a composite abundance BEL node (every member is added to the graph as well)
    members = [
        _add_to_graph(graph, _protein_from_attributes(member))
        for member in node
    ]
    return _add_to_graph(graph, composite_abundance(members=members))


def flatten_gene_to_bel_node(graph, node):
    """Create a protein or list of protein BEL nodes and add to BEL Graph.

    :param pybel.BELGraph graph: BEL Graph
    :param list[dict[str,str]] node: dictionary of node attributes
    :return: a single BEL node if there is exactly one entry, otherwise a list of BEL nodes
    :rtype: pybel.dsl.BaseEntity or list[pybel.dsl.BaseEntity]
    """
    # if only 1 protein node, return corresponding BEL node
    if len(node) == 1:
        return _add_to_graph(graph, _protein_from_attributes(node[0]))

    # if multiple protein nodes, return corresponding list of BEL nodes.
    # Every node is added to the graph regardless of which namespace it resolved to.
    return [
        _add_to_graph(graph, _protein_from_attributes(node_dict))
        for node_dict in node
    ]


def compound_to_bel(graph, node):
    """Create an abundance BEL node or composite abundances BEL node and add to BEL Graph.

    :param pybel.BELGraph graph: BEL Graph
    :param list[dict[str,str]] node: dictionary of node attributes
    :return: corresponding BEL node
    :rtype: pybel.dsl.BaseEntity
    """
    # Create a compound BEL node
    if len(node) == 1:
        return _add_to_graph(graph, _abundance_from_attributes(node[0]))

    # Create a composite abundance BEL node (every member is added to the graph as well)
    members = [
        _add_to_graph(graph, _abundance_from_attributes(member))
        for member in node
    ]
    return _add_to_graph(graph, composite_abundance(members=members))


def flatten_compound_to_bel_node(graph, node):
    """Create an abundance or list of abundance BEL nodes and add to BEL Graph.

    :param pybel.BELGraph graph: BEL Graph
    :param list[dict[str,str]] node: dictionary of node attributes
    :return: a single BEL node if there is exactly one entry, otherwise a list of BEL nodes
    :rtype: pybel.dsl.BaseEntity or list[pybel.dsl.BaseEntity]
    """
    # if only 1 compound node, return corresponding BEL node
    if len(node) == 1:
        return _add_to_graph(graph, _abundance_from_attributes(node[0]))

    # If multiple compound nodes, return flattened list of BEL nodes
    return [
        _add_to_graph(graph, _abundance_from_attributes(node_dict))
        for node_dict in node
    ]


def map_to_bel_node(graph, node):
    """Create a biological process BEL node.

    Only the first entry of ``node`` is converted.

    :param pybel.BELGraph graph: BEL Graph
    :param list[dict[str,str]] node: dictionary of node attributes
    :return: corresponding BEL node, or None if ``node`` is empty
    :rtype: Optional[pybel.dsl.BaseEntity]
    """
    attribute = next(iter(node), None)
    if attribute is None:
        return None

    name = attribute['map_name']
    identifier = attribute[KEGG_ID]

    if not name:
        log.debug("KEGG API does not provide information about %s. Using identifier.", node)
        name = identifier

    # Drop the literal prefix only. str.strip('TITLE:') would remove any of the characters
    # T, I, L, E and ':' from both ends and eat into the pathway name itself.
    title_prefix = 'TITLE:'
    if name.startswith(title_prefix):
        name = name[len(title_prefix):]

    bio_process = bioprocess(namespace=KEGG.upper(), name=name, identifier=identifier)
    return _add_to_graph(graph, bio_process)


def flatten_complex_to_bel_node(graph, node):
    """Create complex abundance BEL node.

    The member proteins are not added to the graph on their own, only the complex is.

    :param pybel.BELGraph graph: BEL Graph
    :param list[dict[str,str]] node: dictionary of node attributes
    :return: BEL node dictionary
    :rtype: pybel.dsl.BaseEntity
    """
    members = [_protein_from_attributes(node_dict) for node_dict in node]
    return _add_to_graph(graph, complex_abundance(members=members))


"""Get edges between BEL nodes"""

# Relation subtypes that describe a protein modification of the object
_PMOD_RELATIONS = frozenset({'phosphorylation', 'glycosylation', 'ubiquitination', 'methylation'})
# Reaction types linking an enzyme to a reaction node
_REACTION_RELATIONS = frozenset({'reversible', 'irreversible'})
# Relation types that are all exported as plain associations
_ASSOCIATION_RELATIONS = frozenset({'indirect effect', 'binding/association', 'compound'})
# Relation types that are deliberately not exported
_IGNORED_RELATIONS = frozenset({'dissociation', 'hidden compound', 'missing interaction', 'state change'})


def _as_list(entity):
    """Wrap a single BEL node in a list; flattened entities (already lists) are returned as-is."""
    return entity if isinstance(entity, list) else [entity]


def _activity_modifier(node, name=None):
    """Return an activity modifier if ``node`` may carry one (BEL 2.0 specifications), else None."""
    if not isinstance(node, ACTIVITY_ALLOWED_MODIFIERS):
        return None
    return activity() if name is None else activity(name)


def _edge_kwargs():
    """Return the provenance keyword arguments shared by every KEGG edge (fresh dict on each call)."""
    return {
        'citation': KEGG_CITATION,
        'evidence': 'Extracted from KEGG',
        'annotations': {},
    }


def add_edges(graph, edges, nodes):
    """Add edges to BEL graph.

    :param pybel.BELGraph graph: BEL Graph
    :param list[tuple] edges: list of relationships with entity IDs and interaction types
    :param dict nodes: dictionary of BEL nodes
    """
    for source, target, relation in edges:
        # Skip relationships whose entities were not converted to BEL nodes
        try:
            u = nodes[source]
            v = nodes[target]
        except KeyError:
            continue

        # Flattened entities are lists: connect every source member with every target member
        for source_node, target_node in product(_as_list(u), _as_list(v)):
            add_simple_edge(graph, source_node, target_node, relation)


def add_reaction_edges(graph, reaction_dict, nodes):
    """Add reaction nodes and edges from reactants to products and enzymes to reactions to BEL Graph.

    :param pybel.BELGraph graph: BEL Graph
    :param dict reaction_dict: dictionary of reaction IDs and reactant and product IDs
    :param dict nodes: dictionary of BEL nodes
    :raises KeyError: if an enzyme, reactant or product ID is not present in ``nodes``
    """
    for enzyme_id, reactions in reaction_dict.items():
        # Get BEL gene node(s); a flattened enzyme is a list of genes
        enzymes = _as_list(nodes[enzyme_id])

        for source, target, reaction_type in reactions:
            reactants_list = [nodes[source_id] for source_id in source]
            products_list = [nodes[target_id] for target_id in target]

            # One reaction node per reactant/product pair, each catalysed by every enzyme
            for reactant_compound, product_compound in product(reactants_list, products_list):
                reaction_node = reaction(reactants=reactant_compound, products=product_compound)
                graph.add_node_from_data(reaction_node)

                for gene in enzymes:
                    add_simple_edge(graph, gene, reaction_node, reaction_type)


def add_simple_edge(graph, u, v, relation_type):
    """Add corresponding edge type to BEL graph.

    :param pybel.BELGraph graph: BEL Graph
    :param u: source node
    :param v: target node
    :param relation_type: a single relation type, or a list of relation subtypes
    :type relation_type: str or list[str]
    :raises ValueError: if a relation type is not one of the handled KEGG relation types
    """
    # Check if multiple relation subtypes present
    if isinstance(relation_type, list):
        # Check if protein modification is the second relation subtype
        # (the length guard keeps one-element and empty lists from raising IndexError)
        if len(relation_type) > 1 and relation_type[1] in _PMOD_RELATIONS:
            # If the object is a gene, miRNA, RNA, or protein, add protein modification
            if isinstance(v, CentralDogma):
                v = v.with_variants(pmod(KEGG_MODIFICATIONS[relation_type[1]]))

            # Add increases edge if pmod subtype is coupled with activation subtype
            if relation_type[0] == 'activation':
                graph.add_increases(u, v, subject_modifier=_activity_modifier(u), **_edge_kwargs())
                return

            # Add decreases edge if pmod subtype is coupled with inhibition subtype
            if relation_type[0] == 'inhibition':
                graph.add_decreases(u, v, subject_modifier=_activity_modifier(u), **_edge_kwargs())
                return

        # Found multiple relationship which cannot be combined into one logic
        # (e.g., ['inhibition', 'indirect effect']): create all relationships in the list
        for relation in relation_type:
            add_simple_edge(graph, u, v, relation)
        return

    # Handle differently the relationships

    # If only one pmod relation subtype
    if relation_type in _PMOD_RELATIONS:
        # If the object is a gene, miRNA, RNA, or protein, add protein modification
        if isinstance(v, CentralDogma):
            v = v.with_variants(pmod(KEGG_MODIFICATIONS[relation_type]))
        graph.add_increases(u, v, subject_modifier=_activity_modifier(u), **_edge_kwargs())
        return

    # Subject activity decreases protein modification (i.e. dephosphorylation) of object
    if relation_type == 'dephosphorylation':
        if isinstance(v, CentralDogma):
            v = v.with_variants(pmod('Ph'))
        graph.add_decreases(u, v, subject_modifier=_activity_modifier(u), **_edge_kwargs())
        return

    # Subject increases activity of object
    if relation_type == 'activation':
        graph.add_increases(u, v, object_modifier=_activity_modifier(v), **_edge_kwargs())
        return

    # Catalytic activity of subject increases transformation of reactant(s) to product(s)
    if relation_type in _REACTION_RELATIONS:
        graph.add_increases(u, v, subject_modifier=_activity_modifier(u, 'cat'), **_edge_kwargs())
        return

    # Subject decreases activity of object
    if relation_type == 'inhibition':
        graph.add_decreases(u, v, object_modifier=_activity_modifier(v), **_edge_kwargs())
        return

    # Indirect effect and binding/association are noted to be equivalent relation types
    if relation_type in _ASSOCIATION_RELATIONS:
        graph.add_association(u, v, **_edge_kwargs())
        return

    # Subject increases expression of object
    if relation_type == 'expression':
        # Expression object is converted to RNA abundance
        if isinstance(v, CentralDogma):
            v = v.get_rna()
        graph.add_increases(u, v, **_edge_kwargs())
        return

    # Subject decreases expression of object
    if relation_type == 'repression':
        # Repression object is converted to RNA abundance
        if isinstance(v, CentralDogma):
            v = v.get_rna()
        graph.add_decreases(u, v, **_edge_kwargs())
        return

    # These relation types produce no edge
    if relation_type in _IGNORED_RELATIONS:
        return

    raise ValueError(f'Unexpected relation type {relation_type} between {u} and {v}')


def get_bel_types(path, hgnc_manager, chebi_manager, flatten=None):
    """Get all BEL node and edge type statistics.

    :param str path: path to KGML file
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI manager
    :param bool flatten: flat nodes
    :return: count of all nodes and edges in BEL graph
    :rtype: dict
    """
    bel_graph = kegg_to_bel(path, hgnc_manager, chebi_manager, flatten=bool(flatten))

    bel_stats = {
        'nodes': bel_graph.number_of_nodes(),
        'edges': bel_graph.number_of_edges(),
    }

    # Get count of all BEL function types
    bel_stats.update(count_functions(bel_graph))

    # Get count of all BEL edge types
    bel_stats.update(edge_summary.count_relations(bel_graph))

    return bel_stats
def kegg_to_pickles(resource_files, resource_folder, hgnc_manager, chebi_manager, flatten=None, export_folder=None):
    """Export KEGG pathways (KGML files) to pickled BEL graphs.

    Files that do not end in ``.xml`` and files whose pickle already exists are skipped.

    :param iter[str] resource_files: iterator with file names
    :param str resource_folder: path folder
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI manager
    :param Optional[bool] flatten: flat nodes (by default graphs are unflatten)
    :param Optional[str] export_folder: export folder (defaults to ``resource_folder``)
    """
    # Imported locally so the function does not depend on 'os' reaching this module through a star import
    import os

    if export_folder is None:
        export_folder = resource_folder

    kgml_extension = '.xml'
    graph_kind = 'flatten' if flatten else 'unflatten'  # By default graphs are unflatten

    for kgml_file in tqdm.tqdm(resource_files, desc=f'Exporting KEGG to BEL in {export_folder}'):
        # Skip not KGML files
        if not kgml_file.endswith(kgml_extension):
            continue

        # Remove the extension only. str.strip('.xml') would also eat leading/trailing
        # '.', 'x', 'm' and 'l' characters of the pathway identifier (e.g. the 'mm' of 'mmu04010').
        pathway_name = kgml_file[:-len(kgml_extension)]

        # Name of file created will be: "hsaXXX_unflatten.pickle" or "hsaXXX_flatten.pickle"
        pickle_path = os.path.join(
            export_folder if export_folder else KEGG_BEL,
            '{}_{}.pickle'.format(pathway_name, graph_kind),
        )

        # Skip if the file already exists
        if os.path.exists(pickle_path):
            continue

        bel_graph = kegg_to_bel(
            path=os.path.join(resource_folder, kgml_file),
            hgnc_manager=hgnc_manager,
            chebi_manager=chebi_manager,
            flatten=bool(flatten),
        )

        to_pickle(bel_graph, pickle_path)