Source code for pathme.reactome.utils

# -*- coding: utf-8 -*-

"""This module has utilities method for parsing, handling WikiPathways RDF and data."""

import logging
import tarfile
from typing import List, Tuple

from bio2bel_chebi import Manager as ChebiManager
from bio2bel_hgnc import Manager as HgncManager
from bio2bel_hgnc.models import HumanGene
from pybel.dsl import protein

from pathme.utils import parse_id_uri
from ..constants import ENSEMBL, HGNC, UNIPROT, UNKNOWN

log = logging.getLogger(__name__)

"""Download utilities"""


def get_hgnc_node_info(gene: HumanGene) -> Tuple[str, str, str]:
    """Extract the node parameters (identifier, symbol, namespace) from an HGNC gene entry.

    :param gene: HGNC gene entry exposing ``identifier`` and ``symbol`` attributes
    :return: tuple of the identifier converted to a string, the gene symbol
        and the ``HGNC`` namespace constant
    """
    hgnc_id = str(gene.identifier)
    hgnc_symbol = gene.symbol
    return hgnc_id, hgnc_symbol, HGNC
def get_valid_node_parameters(
        node,
        hgnc_manager: HgncManager,
        chebi_manager: ChebiManager,
) -> Tuple[str, str, str]:
    """Get valid node parameters (identifier, name and namespace) for a Reactome node.

    The identifier and namespace are resolved in this order:

    1. ``node['uri_id']`` is parsed with ``parse_id_uri``. UniProt and ENSEMBL
       identifiers are looked up in the HGNC manager and, when a match exists,
       replaced by the HGNC identifier/namespace. An ``obo`` namespace whose
       identifier contains ``CHEBI`` is relabelled as ``chebi``.
    2. Otherwise ``node['reactome_id']`` (or, if missing, the identifier parsed
       from ``node['uri_reactome_id']``) is used with the ``reactome`` namespace.
    3. If neither key is present, identifier and namespace are both ``None``.

    The name is ``node['display_name']`` when present (possibly replaced by the
    ChEBI safe name, see below), else ``node['name']``, else ``UNKNOWN``.

    :param node: dictionary-like node description (keys read: ``uri_id``,
        ``uri_reactome_id``, ``reactome_id``, ``display_name``, ``name``)
    :param hgnc_manager: manager used to map UniProt/ENSEMBL identifiers to HGNC
    :param chebi_manager: manager used to look up ChEBI chemicals
    :return: tuple ``(identifier, name, namespace)``. When several HGNC entries
        match a UniProt identifier, ``identifier`` is the collection of those
        entries and ``namespace`` is ``'hgnc_multiple_entry'``.
    """
    # Both default to None so the "nothing found" path below returns a
    # well-defined value instead of hitting an unbound local at the return.
    identifier = None
    namespace = None

    if 'uri_id' in node:
        _, _, namespace, identifier = parse_id_uri(node['uri_id'])

        # Look up in HGNC Manager the HGNC Symbol for a given UniProt or ENSEMBL identifier.
        # If nothing matches, leave it as it is and log it.
        if namespace == 'uniprot':
            hgnc_entry = hgnc_manager.get_gene_by_uniprot_id(identifier)

            if not hgnc_entry:
                log.debug('UniProt id: %s could not be converted to HGNC', identifier)
                namespace = UNIPROT

            # Multiple HGNC entries match the UniProt ID
            elif 1 < len(hgnc_entry):
                identifier = hgnc_entry
                namespace = 'hgnc_multiple_entry'

            else:
                # The symbol returned here is discarded: the name is always
                # (re)computed from the node further down.
                identifier, _, namespace = get_hgnc_node_info(hgnc_entry[0])

        elif namespace == 'ensembl':
            hgnc_entry = hgnc_manager.get_gene_by_ensembl_id(identifier)

            if not hgnc_entry:
                log.debug('ENSEMBL id: %s could not be converted to HGNC', identifier)
                namespace = ENSEMBL
            else:
                identifier, _, namespace = get_hgnc_node_info(hgnc_entry)

        elif namespace == 'obo' and 'CHEBI' in identifier:
            namespace = 'chebi'

    elif 'uri_reactome_id' in node:
        namespace = 'reactome'
        identifier = node.get('reactome_id')

        if identifier is None:
            _, _, _, identifier = parse_id_uri(node['uri_reactome_id'])
            if '#' in identifier:
                identifier = str(identifier).split('#')[1]

        # Only log for nodes that are neither complexes nor small molecules.
        # (The previous ``or`` made this condition true for practically every
        # identifier; this only affects debug logging.)
        if 'Complex' not in identifier and 'SmallMolecule' not in identifier:
            log.debug('Adding Reactome identifier for %s ', node['uri_reactome_id'])

    else:
        # Neither 'uri_id' nor 'uri_reactome_id' is present, so log the node
        # itself; indexing node['uri_id'] here would raise KeyError.
        log.debug('Not found HGNC Symbol neither Reactome id for %s ', node)

    if 'display_name' in node:
        name = node['display_name']

        if namespace in ('chebi', 'CHEBI'):
            # The display name is not a known ChEBI name: fall back to a
            # lookup by ChEBI identifier.
            if not chebi_manager.get_chemical_by_chebi_name(node['display_name']):
                identifier = identifier.replace('CHEBI:', '')
                chem = chebi_manager.get_chemical_by_chebi_id(identifier)

                # In case chebi id is outdated use the identifier as the name
                name = chem.safe_name if chem else identifier

    elif 'name' in node:
        name = node['name']
        # A set of names may be provided; pick one of them.
        if isinstance(name, set):
            name = list(name)[0]

    else:
        name = UNKNOWN

    return identifier, name, namespace
def process_multiple_proteins(hgnc_entries: List) -> List:
    """Create multiple nodes when a UniProt identifier refers to multiple HGNC symbols.

    The node identifier is taken from each entry's ``identifier`` attribute and
    converted to a string, matching what ``get_hgnc_node_info`` returns for a
    single entry.
    NOTE(review): the previous code passed ``hgnc_entry.id``, which presumably
    is the database row key rather than the HGNC identifier; confirm against
    the bio2bel_hgnc ``HumanGene`` model.

    :param hgnc_entries: Results from query (HGNC gene entries exposing
        ``symbol`` and ``identifier`` attributes)
    :return: List of Protein BEL nodes, one per entry (empty if no entries)
    """
    return [
        protein(
            namespace='HGNC',
            name=hgnc_entry.symbol,
            identifier=str(hgnc_entry.identifier),
        )
        for hgnc_entry in hgnc_entries
    ]
def untar_file(file_path: str, export_folder: str) -> None:
    """Extract a bzip2-compressed tar archive into a destination folder.

    :param file_path: path of the ``.tar.bz2`` archive to extract
    :param export_folder: folder into which the archive content is extracted
    :raises tarfile.TarError: if the file is not a readable bz2 tar archive
    """
    # The context manager closes the archive even if extraction fails.
    # NOTE: ``extractall`` trusts the member paths stored in the archive; only
    # use this on archives from a trusted source.
    with tarfile.open(file_path, 'r:bz2') as tar_ref:
        tar_ref.extractall(export_folder)