Source code for pathme.kegg.utils

# -*- coding: utf-8 -*-

"""This module has utility methods for parsing and handling KEGG KGML files."""

import os

import pandas as pd
import requests
import tqdm
from bio2bel_kegg.manager import Manager as KeggManager

from pathme.constants import KEGG_FILES, KEGG_KGML_URL, KEGG_STATS_COLUMN_NAMES
from pathme.export_utils import get_paths_in_folder
from pathme.kegg.convert_to_bel import get_bel_types
from pathme.kegg.kegg_xml_parser import get_xml_types, import_xml_etree

__all__ = [
    'download_kgml_files',
    'get_kegg_statistics',
    'get_kegg_pathway_ids'
]


def get_kegg_pathway_ids(connection=None):
    """Collect the identifier of every pathway held in the KEGG database.

    :param Optional[str] connection: connection to the database
    :returns: KEGG pathway identifiers with the ``path:`` text removed
    :rtype: list
    :raises EnvironmentError: if the database does not contain any pathway
    """
    manager = KeggManager(connection=connection)

    pathway_ids = []
    for pathway in manager.get_all_pathways():
        # Strip the 'path:' text from the stored resource identifier
        pathway_ids.append(pathway.resource_id.replace('path:', ''))

    if pathway_ids:
        return pathway_ids

    raise EnvironmentError('Your database is empty. Please run python3 -m bio2bel_kegg populate')


def download_kgml_files(kegg_pathway_ids):
    """Download KEGG KGML files by querying the KEGG API.

    Every pathway is saved as ``<kegg_id>.xml`` inside the ``KEGG_FILES`` directory.

    :param list kegg_pathway_ids: list of kegg ids
    :raises requests.HTTPError: if the KEGG API answers with an error status code
    """
    for kegg_id in tqdm.tqdm(kegg_pathway_ids, desc='Downloading KEGG files'):
        response = requests.get(KEGG_KGML_URL.format(kegg_id))
        # Fail loudly rather than saving an error response under an ``.xml`` name
        response.raise_for_status()

        # Write the payload bytes exactly as received: going through ``response.text``
        # and the platform default encoding can alter non-ASCII characters.
        with open(os.path.join(KEGG_FILES, '{}.xml'.format(kegg_id)), 'wb') as kgml_file:
            kgml_file.write(response.content)


def get_kegg_statistics(path, hgnc_manager, chebi_manager, flatten=None):
    """Parse a folder and get KEGG statistics.

    Besides returning the statistics, the table is exported as a tab-separated file named
    ``KEGG_pathway_stats_flatten.csv`` or ``KEGG_pathway_stats_non_flatten.csv`` (relative to the
    current working directory).

    :param str path: path to folder containing XML files
    :param bio2bel_hgnc.Manager hgnc_manager: HGNC manager
    :param bio2bel_chebi.Manager chebi_manager: ChEBI manager
    :param flatten: forwarded to ``get_bel_types``; a truthy value also selects the ``flatten``
        export file name
    :return: KEGG KGML file and BEL graph statistics
    :rtype: pandas.DataFrame
    """
    export_file_name = 'KEGG_pathway_stats_{}.csv'.format('flatten' if flatten else 'non_flatten')

    # One single-row DataFrame per pathway; concatenated once after the loop because
    # ``DataFrame.append`` was removed in pandas 2.0 (and copied the frame on every call).
    pathway_frames = []

    # Get list of all files in folder
    files = get_paths_in_folder(path)

    for file_name in tqdm.tqdm(files, desc='Parsing KGML files and BEL graphs for entities and relation stats'):
        file_path = os.path.join(path, file_name)
        tree = import_xml_etree(file_path)
        root = tree.getroot()

        # The pathway title is used as the row label
        pathway_names = [root.attrib['title']]

        # Get dictionary of all entity and interaction types in XML
        xml_statistics_dict = get_xml_types(tree)

        # Get dictionary of all node and edge types in BEL Graph
        bel_statistics_dict = get_bel_types(file_path, hgnc_manager, chebi_manager, flatten=flatten)

        # Get dictionary with all XML and BEL graph stats
        xml_statistics_dict.update(bel_statistics_dict)

        # Update dictionary of all XML and BEL graph stats with corresponding column names
        all_kegg_statistics = {
            KEGG_STATS_COLUMN_NAMES[key]: value
            for key, value in xml_statistics_dict.items()
        }

        # Build the pathway statistic row
        pathway_data = pd.DataFrame(
            all_kegg_statistics,
            index=pathway_names,
            columns=KEGG_STATS_COLUMN_NAMES.values(),
            dtype=int
        )

        # Statistics missing for this pathway count as zero
        pathway_frames.append(pathway_data.fillna(0).astype(int))

    # ``pd.concat`` rejects an empty list, so keep an empty frame for an empty folder
    df = pd.concat(pathway_frames) if pathway_frames else pd.DataFrame()

    df.to_csv(export_file_name, sep='\t')
    return df