Source code for pathomx.db

# -*- coding: utf-8 -*-
# Database manager
# Loads compounds, reactions and pathways on initialisation. Provides interface to
# filter sets, list etc.
# Database is a key-based store of each dataset
from __future__ import unicode_literals
import logging
logging.debug('Loading db.py')

import os
import sys
import re
from .utils import UnicodeReader, UnicodeWriter
from collections import defaultdict

from . import utils
import numpy as np

from .translate import tr

try:
    from urllib.request import urlopen
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse
    from urllib import urlopen

import logging

# Databases that have sufficiently unique IDs that do not require additional namespacing
database_link_synonyms = [
    'UCSC', 'ENSEMBL', 'HMDB', 'CAS', 'KEGG'
]

# Internal URLS
COMPOUND_URL = 'pathomx://db/compound/%s/view'
PATHWAY_URL = 'pathomx://db/pathway/%s/view'
REACTION_URL = 'pathomx://db/reaction/%s/view'
PROTEIN_URL = 'pathomx://db/protein/%s/view'
GENE_URL = 'pathomx://db/gene/%s/view'


# Global Pathomx db object class to simplify object display, synonym referencing, etc.
class _PathomxObject(object):
    def __unicode__(self):
        return self.name

    def _str__(self):
        return self.name

    def __repr__(self):
        return self.__unicode__()

    def __init__(self, **entries):
        object.__init__(self)
        self.__dict__.update(entries)

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        return type(self) == type(other) and self.id == other.id

    def get_piped_str(self, l):
        l = list(set(l))  # Remove duplicates from list when saving
        return '|'.join([o if type(o) is str or type(o) is unicode else str(o.id) for o in l])

    def get_db_str(self, dbs):
        if dbs:
            dbtbl = list()
            for k, v in list(dbs.items()):
                dbtbl.append('%s:%s' % (k, v))
            return ';'.join(dbtbl)
        else:
            return ''

    def synonym_str(self):
        return ', '.join(self.synonyms)


# Dummy wrapper classes for readability
[docs]class Compound(_PathomxObject):
    type = 'compound'
    type_name = tr('Compound')

[docs]    def as_csv(self):
        return [
            self.id,
            self.name,
            self.type,
            self.get_db_str(self.databases)]

    @property
    def url(self):
        return COMPOUND_URL % self.id


[docs]class Pathway(_PathomxObject):
    type = 'pathway'
    type_name = tr('Pathway')

[docs]    def as_csv(self):
        return [
            self.id,
            self.name,
            self.get_db_str(self.databases)]

    @property
    def url(self):
        return PATHWAY_URL % self.id


[docs]class Reaction(_PathomxObject):
    type = 'reaction'
    type_name = tr('Reaction')

[docs]    def as_csv(self):
        return [
            self.id,
            self.name,
            self.get_piped_str(self.mtins),
            self.get_piped_str(self.mtouts),
            self.get_piped_str(self.smtins),
            self.get_piped_str(self.smtouts),
            self.get_piped_str(self.proteins),
            self.dir,
            self.get_piped_str(self.pathways),
            self.get_db_str(self.databases)]

    @property
    def url(self):
        return REACTION_URL % self.id

    @property
    def compounds(self):
        return self.mtins + self.mtouts

    @property
    def secondary_compounds(self):
        return self.smtins + self.smtouts


[docs]class Protein(_PathomxObject):
    type = 'protein'
    type_name = tr('Protein')

[docs]    def as_csv(self):
        return [
            self.id,
            self.name,
            self.get_piped_str(self.genes),
            self.get_piped_str(self.compartments),
            self.get_db_str(self.databases)]

    @property
    def url(self):
        return PROTEIN_URL % self.id


[docs]class Gene(_PathomxObject):
    type = 'gene'
    type_name = tr('Gene')

[docs]    def as_csv(self):
        return [
            self.id,
            self.name,
            self.get_db_str(self.databases)]

    @property
    def url(self):
        return GENE_URL % self.id


# Dummy class to handle reaction intermediate compounds/reaction steps
[docs]class ReactionIntermediate(_PathomxObject):
    # Standard values
    type = 'dummy'
    type_name = 'n/a'
    name = 'n/a'


[docs]class databaseManager():

    # compounds, reactions, pathways = dict()
    def __init__(self):

    # Initialise variables
        self.synfwd = defaultdict(set)  # ID -> Synonyms
        self.synrev = dict()  # Synonym -> ID
        self.synrev_by_type = defaultdict(dict)  # Synonym -> ID

        # A namespace index
        self.index = dict()

        # Separate subtypes
        self.pathways = dict()  # Pathway)

        self.reactions = dict()  # Reaction)
        self.compounds = dict()  # Compound)

        self.proteins = dict()  # Protein)
        self.genes = dict()  # Gene)

        self.unification = defaultdict(dict)

        # Load the data
        self.load_pathways()
        self.load_compounds()

        self.load_genes()
        self.load_proteins()

        self.load_reactions()

        # Load synonym interface for conversion and data-interpreting
        self.load_synonyms()
        self.load_identities()
        self.load_xrefs()

        # Load additional chemical data
        self.load_gibbs()

    # Helper functions
[docs]    def get_via_unification(self, database, id):
        try:
            return self.unification[database][id]
        except:
            return None

    # Helper functions
[docs]    def get_via_synonym(self, id):
        try:
            return self.synrev[id]
        except:
            return None

    # Handler to load all identity files in /identities
[docs]    def load_identities(self):
        identities_files = os.listdir(os.path.join(utils.scriptdir, 'identities', 'synonyms'))
        if len(identities_files) > 0:
            logging.info("Loading additional synonyms:")
            for filename in identities_files:
                logging.info("- %s" % filename)
                reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'identities', 'synonyms', filename), 'rU'), delimiter=str(','), dialect='excel')
                for id, identity in reader:
                    self.add_identity(id, identity)
            logging.info("Done.")

[docs]    def load_xrefs(self):
        identities_files = os.listdir(os.path.join(utils.scriptdir, 'identities', 'xrefs'))
        if len(identities_files) > 0:
            logging.info("Loading additional xrefs:")
            for filename in identities_files:
                logging.info("- %s" % filename)
                reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'identities', 'xrefs', filename), 'rU'), delimiter=str(','), dialect='excel')
                for id, db, key in reader:
                    #self.add_xref(id, db, key)
                    self.add_db_synonyms(id, {db: key})  # Hack, fix this up
            logging.info("Done.")

    # Synonym interface for compounds, reactions and pathways (shared namespace)
    # Can call with filename to load a specific synonym file, e.g. containing peak ids
[docs]    def load_synonyms(self, filename=os.path.join(utils.scriptdir, 'database/synonyms')):
        reader = UnicodeReader(open(filename, 'rU'), delimiter=str(','), dialect='excel')
        for id, name in reader:
            if id in self.synfwd:  # Protection 
                self.add_synonym(id, name)

[docs]    def load_compounds(self):
        reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'database/compounds'), 'rU'), delimiter=str(','), dialect='excel')
        for id, name, type, db_unification in reader:
            self.add_compound(id, {
                'name': name,
                'type': type,
                'databases': self.extract_db_unification(db_unification),
                })

[docs]    def load_reactions(self):
        reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'database/reactions'), 'rU'), delimiter=str(','), dialect='excel')
        for id, name, origin, dest, smtins, smtouts, proteins, dir, pathways, db_unification in reader:
            self.add_reaction(id, {
                'name': name,
                # Build internal db links to compounds
                'mtins': [self.index[mid] for mid in origin.split('|')],
                'mtouts': [self.index[mid] for mid in dest.split('|')],
                'smtins': [self.index[mid] for mid in smtins.split('|') if mid != ''],  # [s for s in smtins.split('|') if s != ''],
                'smtouts': [self.index[mid] for mid in smtouts.split('|') if mid != ''],  # [s for s in smtouts.split('|') if s != ''],
                'proteins': [self.index[prid] for prid in proteins.split('|') if prid != ''],
                'dir': dir,
                # Reactions can be in >1 pathway
                'pathways': [self.pathways[pid] for pid in pathways.split('|')],
                'databases': self.extract_db_unification(db_unification),
            })

[docs]    def load_pathways(self):
        reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'database/pathways'), 'rU'), delimiter=str(','), dialect='excel')
        for id, name, db_unification in reader:
            self.add_pathway(id, {
                'name': name,
                'databases': self.extract_db_unification(db_unification),
            })

[docs]    def load_proteins(self):
        reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'database/proteins'), 'rU'), delimiter=str(','), dialect='excel')
        for id, name, genes, compartments, db_unification in reader:
            self.add_protein(id, {
                'name': name,
                'genes': [self.index[gid] for gid in genes.split('|') if gid != ''],
                'compartments': [c for c in compartments.split('|') if c != ''],
                'databases': self.extract_db_unification(db_unification),
                })

[docs]    def load_genes(self):
        reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'database/genes'), 'rU'), delimiter=str(','), dialect='excel')
        for id, name, db_unification in reader:
            self.add_gene(id, {
                'name': name,
                'databases': self.extract_db_unification(db_unification),
                })

[docs]    def load_gibbs(self):

        def sum_gibbs_in_outs(key, ins, outs):
            return sum([m.gibbs[key] for m in ins if hasattr(m, 'gibbs')]) - sum([m.gibbs[key] for m in outs if hasattr(m, 'gibbs')])

        reader = UnicodeReader(open(os.path.join(utils.scriptdir, 'database/gibbs'), 'rU'), delimiter=str(','), dialect='excel')

        # Add reactions from each compound that we have gibbs data for
        gibbs_reactions = set()

        for kegg_id, deltag, uncertainty, charge in reader:
            if kegg_id in self.unification['KEGG']:

                self.unification['KEGG'][kegg_id].gibbs = {
                        'deltaG': float(deltag),  # + 8314 * 310.15 * np.log(2), # G = G° + RTln(C2/C1) n.b. 310.15K = 37°C; R= 8.314 gas constant
                        'deltaG_bio': float(deltag),
                        'uncertainty': float(uncertainty),
                        'charge': float(charge)
                    }
                gibbs_reactions.update(self.unification['KEGG'][kegg_id].reactions)

        for r in list(gibbs_reactions):
            # Do we have gibbs data for all reactants (excl. H+ and?)
            # Some are uncalculated (e.g. pseudoatom H so can't be included - how to treat, could mark these somehow from original source data
            ins = r.mtins + r.smtins
            outs = r.mtouts + r.smtouts

            deltag = sum_gibbs_in_outs('deltaG', ins, outs)

            # Swap reaction directions on birectional reactions to match gibbs
            #if r.dir == 'both' and deltag > 0:
            #    print r, "swap!"
            #    tmtins, tsmtins = r.mtins, r.smtins
            #    r.mtins, r.smtins = r.mtouts, r.smtouts
            #    r.mtouts, r.smtouts = tmtins, tsmtins
            #    deltag = -deltag

            # Calculate penwidth for deltag viz
            if deltag == 0:
                deltag_w = 1
            else:
                deltag_w = 1 + (np.log2(abs(deltag)) + 1) * deltag / abs(deltag)  # Signed log2

            r.gibbs = {
                    'deltaG': deltag,  # sum_gibbs_in_outs( 'deltaG', ins, outs ),
                    'deltaG_bio': deltag,  # sum_gibbs_in_outs( 'deltaG', ins, outs ),
                    'deltaG_w': deltag_w,
                    'uncertainty': sum_gibbs_in_outs('uncertainty', ins, outs),
                    'charge': sum_gibbs_in_outs('charge', ins, outs),
                }

[docs]    def extract_db_unification(self, db_unification):
        # Process database links field
        dbs = dict()
        if db_unification:
            for dblink in db_unification.split(';'):
                key, val = dblink.split(":", 1)
                dbs[key] = val
        return dbs

[docs]    def add_db_synonyms(self, id, databases):
        if id in self.index:
            self.add_synonyms(id, ['%s:%s' % (db, key) for db, key in list(databases.items())])
            self.add_synonyms(id, ['%s' % (key) for db, key in list(databases.items()) if db in database_link_synonyms])

            for db, key in list(databases.items()):
                self.index[id].databases[db] = key

            # Add unification links
            for db, key in list(databases.items()):
                self.unification[db][key] = self.index[id]

[docs]    def add_reaction(self, id, attr):
        self.reactions[id] = Reaction(**dict(
            list({'id': id, 'synonyms': self.synfwd[id]}.items()) + list(attr.items()))
        )

        # Store id and names in the synonym database
        self.index[id] = self.reactions[id]
        self.add_synonym(id, attr['name'])

        # Build the reverse link
        for pathway in attr['pathways']:
            if hasattr(pathway, 'id'):
                self.pathways[pathway.id].reactions.append(self.reactions[id])

        # Add pathway links to compounds
        for m in self.reactions[id].mtins + self.reactions[id].mtouts:
            # The follow instead of extend to remove duplicates
            if hasattr(m, 'id'):
                self.compounds[m.id].pathways.extend([p for p in self.reactions[id].pathways if p not in self.compounds[m.id].pathways])
                self.compounds[m.id].reactions.append(self.reactions[id])

                for p in self.reactions[id].pathways:
                    if self.compounds[m.id] not in self.pathways[p.id].compounds:
                        self.pathways[p.id].compounds.append(self.compounds[m.id])

        # Add pathway links to proteins
        for pr in self.reactions[id].proteins:
            if hasattr(pr, 'id'):
                self.proteins[pr.id].pathways.extend([p for p in self.reactions[id].pathways if p not in self.proteins[pr.id].pathways])
                self.proteins[pr.id].reactions.append(self.reactions[id])

                for p in self.reactions[id].pathways:
                    if self.proteins[pr.id] not in self.pathways[p.id].proteins:
                        self.pathways[p.id].proteins.append(self.proteins[pr.id])

                # Add pathway links to genes
                for g in pr.genes:
                    if hasattr(g, 'id'):
                        self.genes[g.id].pathways.extend([p for p in self.reactions[id].pathways if p not in self.genes[g.id].pathways])
                        self.genes[g.id].reactions.append(self.reactions[id])

                        for p in self.reactions[id].pathways:
                            if self.genes[g.id] not in self.pathways[p.id].genes:
                                self.pathways[p.id].genes.append(self.genes[g.id])

[docs]    def add_pathway(self, id, attr):
        self.pathways[id] = Pathway(**dict(
            list({'id': id, 'synonyms': self.synfwd[id], 'reactions': [], 'compounds': [], 'proteins': [], 'genes': [], }.items()) + list(attr.items()))
        )

        # Store id and names in the synonym database
        self.index[id] = self.pathways[id]
        self.add_synonym(id, attr['name'])

[docs]    def add_compound(self, id, attr):
        self.compounds[id] = Compound(**dict(
            list({'id': id, 'synonyms': self.synfwd[id], 'reactions': [], 'pathways': [], }.items()) + list(attr.items()))
        )

        # Store id and name in the synonym database
        self.index[id] = self.compounds[id]
        self.add_synonym(id, attr['name'])
        self.add_db_synonyms(id, self.compounds[id].databases)

        # Check if we have a compound image for this compound (KEGG Sourced)
        if 'LIGAND-CPD' in list(self.compounds[id].databases.keys()):
            self.compounds[id].image = os.path.join(utils.scriptdir, 'database', 'figures', '%s.png' % id)
            self.compounds[id].imagecolor = os.path.join(utils.scriptdir, 'database', 'figures', '%d', '%s.png' % id)

[docs]    def add_protein(self, id, attr):
        self.proteins[id] = Protein(**dict(
            list({'id': id, 'synonyms': self.synfwd[id], 'reactions': [], 'pathways': [], }.items()) + list(attr.items()))
        )

        # Store id and name in the synonym database
        self.index[id] = self.proteins[id]
        self.add_synonym(id, attr['name'])
        self.add_db_synonyms(id, self.proteins[id].databases)

[docs]    def add_gene(self, id, attr):
        self.genes[id] = Gene(**dict(
            list({'id': id, 'synonyms': self.synfwd[id], 'reactions': [], 'pathways': [], }.items()) + list(attr.items()))
        )

        # Store id and name in the synonym database
        self.index[id] = self.genes[id]
        self.add_synonym(id, attr['name'])
        self.add_db_synonyms(id, self.genes[id].databases)

[docs]    def add_synonym(self, id, synonym):
        self.synfwd[id].add(synonym)  # ID -> Synonyms
        if id in self.index:
            self.synrev[synonym] = self.index[id]  # Synonym -> Object
            self.synrev[synonym.lower()] = self.index[id]  # lc Synonym -> Object
            self.synrev[id] = self.index[id]  # id -> Object

            # BY type
            self.synrev_by_type[self.index[id].type][synonym] = self.index[id]  # Synonym -> Object
            self.synrev_by_type[self.index[id].type][synonym.lower()] = self.index[id]  # lc Synonym -> Object
            self.synrev_by_type[self.index[id].type][id] = self.index[id]  # id -> Object

[docs]    def add_synonyms(self, id, synonyms):
        for syn in synonyms:
            self.add_synonym(id, syn)

    # An identity is a limited version of the synonym (using the same tables as above)
    # linking from database id etc. to a object Id, but not reverse
[docs]    def add_identity(self, id, identity):
        if id in self.index:
            self.synrev[identity] = self.index[id]  # Synonym -> Object
            self.synrev[identity.lower()] = self.index[id]  # lc Synonym -> Object

    # Output the current database to disk (Overwrite completely)
[docs]    def save_compounds(self):
        writer = UnicodeWriter(open('./database/compounds', 'wb'), delimiter=str(','))
        for compound in list(self.compounds.values()):
            writer.writerow(compound.as_csv())

[docs]    def save_reactions(self):
        writer = UnicodeWriter(open('./database/reactions', 'wb'), delimiter=str(','))
        for reaction in list(self.reactions.values()):
            writer.writerow(reaction.as_csv())

[docs]    def save_pathways(self):
        writer = UnicodeWriter(open('./database/pathways', 'wb'), delimiter=str(','))
        for pathway in list(self.pathways.values()):
            writer.writerow(pathway.as_csv())

[docs]    def save_proteins(self):
        writer = UnicodeWriter(open('./database/proteins', 'wb'), delimiter=str(','))
        for protein in list(self.proteins.values()):
            writer.writerow(protein.as_csv())

[docs]    def save_genes(self):
        writer = UnicodeWriter(open('./database/genes', 'wb'), delimiter=str(','))
        for gene in list(self.genes.values()):
            writer.writerow(gene.as_csv())

[docs]    def save_synonyms(self):
        writer = UnicodeWriter(open('./database/synonyms', 'wb'), delimiter=str(','))
        for synk, synv in list(self.synfwd.items()):
            for syn in synv:
                row = [synk, syn]
                writer.writerow(row)