Source code for pathme.kegg.kegg_xml_parser

# -*- coding: utf-8 -*-

"""This module contains functions to parse KGML files."""

import itertools as itt
import json
import logging
import os
from collections import defaultdict
from xml.etree.ElementTree import parse

import requests
from bio2bel_kegg.constants import API_KEGG_GET
from bio2bel_kegg.parsers.description import parse_description

from pathme.constants import CHEBI, CHEBI_NAME, HGNC, HGNC_SYMBOL, KEGG_CACHE, KEGG_ID, KEGG_TYPE, PUBCHEM, UNIPROT
from pathme.wikipathways.utils import merge_two_dicts

log = logging.getLogger(__name__)

"""Import XML"""


[docs]def import_xml_etree(filename):
    """Return XML tree from KGML file.

    :param str filename: path to KGML file
    :returns: XML Tree
    :rtype: xml.etree.ElementTree.ElementTree
    """
    try:
        tree = parse(filename)
    except IOError as ioerr:
        print('File error: ' + str(ioerr))
        return None

    return tree


"""KEGG Handling functions"""


def _post_process_api_query(node_meta_data, hgnc_manager, chebi_manager):
    """Process API query.

    :param dict[str,str] node_meta_data: JSON retrieved from the API
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC Manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI Manager
    :return: Standard identifiers for the protein/chemical
    :rtype: dict[str,str]
    """
    node_dict = {}

    if 'DBLINKS' in node_meta_data:

        for resource, identifier in node_meta_data['DBLINKS']:

            if resource not in {HGNC, UNIPROT, CHEBI, PUBCHEM}:
                continue

            # Get protein identifiers
            if resource == HGNC:
                hgnc_entry = hgnc_manager.get_gene_by_hgnc_id(identifier)

                if not hgnc_entry:
                    continue

                node_dict[HGNC] = identifier
                node_dict[HGNC_SYMBOL] = hgnc_entry.symbol

            elif resource == UNIPROT:
                hgnc_entry = hgnc_manager.get_gene_by_uniprot_id(identifier)

                if not hgnc_entry:
                    continue

                hgnc_entry = hgnc_entry[0]  # Use the first element queried
                hgnc_id = str(hgnc_entry.identifier)
                node_dict[HGNC] = hgnc_id

                if hgnc_entry.symbol:
                    node_dict[HGNC_SYMBOL] = hgnc_entry.symbol

                else:
                    node_dict[UNIPROT] = identifier

            # Get chemical identifiers
            else:
                node_dict[resource] = identifier
                if resource == CHEBI:
                    # Split multiple identifiers and get their names
                    for chebi_id in identifier.split(' '):
                        chebi_entry = chebi_manager.get_chemical_by_chebi_id(chebi_id)

                        # If the id is found in the database stick the name
                        if chebi_entry:
                            node_dict[CHEBI_NAME] = chebi_entry.name
                        # Else use the default name by KEGG to ensure the name makes it into the graph
                        elif "ENTRY_NAME" in node_meta_data:
                            node_dict[CHEBI_NAME] = node_meta_data["ENTRY_NAME"]

    return node_dict


def _process_kegg_api_get_entity(entity, type, hgnc_manager, chebi_manager):
    """Send a given entity to the KEGG API and process the results.

    :param str entity: A KEGG identifier
    :param str type: Entity type
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC Manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI Manager
    :return: JSON retrieved from the API
    :rtype: dict[str,str]
    """
    _entity_filepath = os.path.join(KEGG_CACHE, '{}.json'.format(entity))

    if os.path.exists(_entity_filepath):
        with open(_entity_filepath) as f:
            return json.load(f)

    kegg_url = API_KEGG_GET.format(entity)

    node_meta_data = parse_description(requests.get(kegg_url))

    node_dict = _post_process_api_query(node_meta_data, hgnc_manager, chebi_manager)

    node_dict[KEGG_ID] = entity
    node_dict[KEGG_TYPE] = type

    with open(_entity_filepath, 'w') as f:
        json.dump(node_dict, f)

    return node_dict


[docs]def get_entity_nodes(tree, hgnc_manager, chebi_manager):
    """Find entry elements (KEGG pathway nodes) in XML.

    :param xml.etree.ElementTree.ElementTree tree: XML tree
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC Manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI Manager
    :return: genes with corresponding metadata (entry_id: [kegg_id, HGNC, UniProt])
    :return: compounds with corresponding metadata (entry_id: [compound_name, ChEBI])
    :return: biological processes with corresponding metadata  (entry_id: [kegg_id, map_name])
    :return: orthologs with corresponding metadata (entry_id: [kegg_id, kegg_type])
    :rtype: dict[str,str]
    """
    entry_dict = defaultdict(list)
    compound_dict = defaultdict(list)
    map_dict = defaultdict(list)
    ortholog_dict = defaultdict(list)

    for entry in tree.findall("entry"):

        entry_id = entry.get("id")
        kegg_ids = entry.get("name")
        kegg_type = entry.get("type")

        if kegg_type.startswith('gene'):
            for kegg_id in kegg_ids.split(' '):
                # Query the API/Cache to fetch information about protein
                node_info = _process_kegg_api_get_entity(kegg_id, kegg_type, hgnc_manager, chebi_manager)
                entry_dict[entry_id].append(node_info)

        elif kegg_type.startswith('compound'):
            for compound_id in kegg_ids.split(' '):

                compound_info = _process_kegg_api_get_entity(compound_id, kegg_type, hgnc_manager, chebi_manager)

                if compound_info:
                    compound_dict[entry_id].append(compound_info)

        elif kegg_type.startswith('map'):

            map_info = {KEGG_ID: kegg_ids}

            for graphics in entry.iter('graphics'):
                map_name = graphics.get('name')
                map_info['map_name'] = map_name

            map_dict[entry_id].append(map_info)

        elif kegg_type.startswith('ortholog'):

            for ortholog_id in kegg_ids.split(' '):
                ortholog_info = {
                    KEGG_ID: ortholog_id,
                    KEGG_TYPE: kegg_type
                }

                ortholog_dict[entry_id].append(ortholog_info)

        # TODO: other, enzyme
        elif kegg_type.startswith('brite'):
            pass

    return entry_dict, compound_dict, map_dict, ortholog_dict


[docs]def get_complex_components(tree, genes_dict, flattened=False):
    """Get complex components to either construct complex or flatten relationships.

    :param xml.etree.ElementTree.ElementTree tree: XML tree
    :param dict genes_dict: dictionary of all genes in pathway
    :param bool flattened: True to flatten all complex participants
    :return: dictionary of complex IDs and component IDs (complex_id: [component_ids])
    :return: flattened dictionary of complex IDs and component metadata (complex_ids: [metadata_dict])
    :rtype: dict[str,list]
    """
    # Get IDs of complex components to construct complexes of protein composites (i.e. similar proteins).
    # or get dictionary of flattened lists of all proteins involved in complexes.
    component_info = defaultdict(list)
    complex_ids = defaultdict(list)
    complexes = defaultdict(list)
    all_components = []
    flattened_complexes = defaultdict(list)

    for entry in (tree.findall("entry")):
        entry_id = entry.get("id")

        for component in entry.iter("component"):
            component_id = component.get("id")

            # Get complex IDs and each of their component IDs
            complex_ids[entry_id].append(component_id)

            # Get the IDs of all components participating in complexes
            if component_id not in all_components:
                all_components.append(component_id)

    # Get node info for each component
    for k, v in genes_dict.items():
        for (component_id, node_info) in itt.product(all_components, v):
            if component_id == k:
                component_info[component_id].append(node_info)

    # Flatten lists of components in complexes
    if flattened:

        for k, v in complex_ids.items():
            for comp_id in v:
                for comp_k, comp_v in component_info.items():
                    if comp_id == comp_k:
                        complexes[k].append(comp_v)

        for k, v in complexes.items():
            for component in v:
                for info in component:
                    flattened_complexes[k].append(info)

    return complex_ids, flattened_complexes


[docs]def get_xml_types(tree):
    """Find entity and interaction types in KEGG XML.

    :param xml.etree.ElementTree.ElementTree tree: XML tree
    :return: count of all entity, relation and reaction types present in XML
    :rtype: dict[str,int]
    """
    entity_types_dict = defaultdict(int)
    interaction_types_dict = defaultdict(int)

    for entry in tree.findall('entry'):
        entry_type = entry.get('type')

        if entry_type.startswith('gene'):
            gene_ids = entry.get('name')
            for gene_id in gene_ids.split(' '):
                entity_types_dict['gene'] += 1

        elif entry_type.startswith('ortholog'):
            ortholog_ids = entry.get('name')
            for ortholog_id in ortholog_ids.split(' '):
                entity_types_dict['ortholog'] += 1

        elif entry_type.startswith('compound'):
            entity_types_dict['compound entity'] += 1

        else:
            entity_types_dict[entry_type] += 1

    for relation in tree.findall('relation'):
        for subtype in relation.iter('subtype'):
            relation_subtype = subtype.get('name')
            interaction_types_dict[relation_subtype] += 1

    for reaction in tree.findall('reaction'):
        reaction_type = reaction.get('type')
        interaction_types_dict[reaction_type] += 1

    entity_types_dict['entities'] = sum(entity_types_dict.values())
    interaction_types_dict['interactions'] = sum(interaction_types_dict.values())

    return merge_two_dicts(entity_types_dict, interaction_types_dict)


"""Get all interactions in KEGG pathways"""


[docs]def get_all_relationships(tree):
    """Find all relationships between 2 entities.

    :param xml.etree.ElementTree.ElementTree tree: XML tree
    :return: relationships list [(relation_entry1, relation_entry2, relation_subtype)]
    :rtype: list[tuple]
    """
    relations_list = []

    for relation in tree.findall("relation"):

        subtype_list = []

        relation_entry1 = relation.get("entry1")
        relation_entry2 = relation.get("entry2")
        relation_type = relation.get('type')

        for subtype in relation.iter('subtype'):

            relation_subtype = subtype.get("name")
            relation_value = subtype.get("value")
            subtype_list.append(relation_subtype)

            # TODO: assume association ??
            if not relation_subtype:
                log.warning("No relation type declared")

            # Add protein-protein, protein-compound and transcription factor-target gene product interactions
            if relation_type in {'PPrel', 'PCrel', 'GErel'}:

                if relation_subtype == 'compound':
                    relations_list.append((relation_entry1, relation_entry2, 'binding/association'))

                else:
                    # Check if multiple relation subtypes present
                    if len(subtype_list) == 1:
                        relations_list.append((relation_entry1, relation_entry2, relation_subtype))
                    else:
                        relations_list.append((relation_entry1, relation_entry2, subtype_list))

            # Add enzyme-enzyme relations denoted as binding/association
            elif relation_type.startswith('ECrel'):
                relations_list.append((relation_entry1, relation_entry2, 'binding/association'))
                relations_list.append((relation_entry1, relation_value, 'binding/association'))
                relations_list.append((relation_value, relation_entry2, 'binding/association'))

            # Add interactions between a protein and a protein in another biological process
            elif relation_type.startswith('maplink'):
                relations_list.append((relation_entry1, relation_entry2, 'binding/association'))

    return relations_list


[docs]def get_all_reactions(tree, compounds_dict):
    """Get substrates and products with ChEBI or PubChem IDs participating in reactions.

    :param xml.etree.ElementTree.ElementTree tree: XML tree
    :param dict compounds_dict: dictionary of KEGG compound information
    :return: dictionary with substrate ids (reaction_id: [substrate_ids])
    :return: dictionary with product ids (reaction_id: [product_ids])
    :rtype: dict[str,list]
    """
    substrates_dict = defaultdict(list)
    products_dict = defaultdict(list)

    for reaction in tree.findall("reaction"):

        reaction_id = reaction.get("id")

        for k, v in compounds_dict.items():

            for substrate in reaction.iter('substrate'):
                substrate_id = substrate.get("id")

                if substrate_id == k:
                    substrates_dict[reaction_id].append(substrate_id)

            for product in reaction.iter('product'):
                product_id = product.get("id")

                if product_id == k:
                    products_dict[reaction_id].append(product_id)

    return substrates_dict, products_dict


[docs]def get_reaction_pathway_edges(xml_tree, substrates_dict, products_dict):
    """Get reaction edges.

    :param xml.etree.ElementTree.ElementTree xml_tree: xml tree
    :param dict substrates_dict: dictionary with substrate info
    :param dict products_dict: dictionary with product info
    :return: dictionary of reaction elements (reaction_id: [(substrate_id, product_id, reaction_type)])
    :rtype: dict[str,list]
    """
    reactions_dict = defaultdict(list)

    for reaction in xml_tree.findall("reaction"):

        reaction_type = reaction.get("type")
        reaction_id = reaction.get("id")

        if substrates_dict[reaction_id]:
            reaction_substrates = substrates_dict[reaction_id]

            if products_dict[reaction_id]:
                reaction_products = products_dict[reaction_id]

                # Add edge from substrates to products with compound info
                reactions_dict[reaction_id].append((reaction_substrates, reaction_products, reaction_type))

                # If reaction is reversible, flip the reaction order and add a new edge
                if reaction_type == "reversible":
                    reactions_dict[reaction_id].append((reaction_products, reaction_substrates, reaction_type))

    return reactions_dict