Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add molecule encodings for contextual bandit #179

Merged
merged 14 commits into from
Sep 5, 2023
212 changes: 212 additions & 0 deletions examples/molecule_search/mol_encoders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import functools
import os
from typing import Any, List, Optional

import numpy as np
import torch
from gensim.models import word2vec, Word2Vec
from mol2vec.features import mol2alt_sentence, MolSentence
from rdkit.Chem import AllChem, RDKFingerprint, rdFingerprintGenerator
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

from examples.molecule_search.mol_adapter import MolAdapter
from examples.molecule_search.mol_transformer.transformer import create_masks, Transformer, EXTRA_CHARS, ALPHABET_SIZE
from examples.molecule_search.utils import download_from_github
from golem.core.log import default_log
from golem.core.paths import project_root


def adapter_func_to_molgraph(func):
    """ Decorator that lets a function written for restored molecule graphs accept raw observations.

    The returned wrapper restores the observation with ``MolAdapter().restore`` and passes the
    result to ``func``. ``functools.wraps`` keeps the name and docstring of ``func`` on the
    wrapper, so decorated encoders stay identifiable in logs and introspection.

    Args:
        func: callable taking the restored molecule graph and returning its embedding.

    Returns:
        Callable taking a single observation and returning whatever ``func`` returns.
    """
    @functools.wraps(func)
    def wrapper(obs):
        mol_graph = MolAdapter().restore(obs)
        return func(mol_graph)
    return wrapper


def adapter_method_to_molgraph(func):
    """ Decorator that lets a method written for restored molecule graphs accept raw observations.

    Method counterpart of the function adapter: the wrapper receives the instance first,
    restores the observation with ``MolAdapter().restore`` and calls ``func(obj, mol_graph)``.
    ``functools.wraps`` keeps the name and docstring of ``func`` on the wrapper.

    Args:
        func: method taking ``(obj, mol_graph)`` and returning the embedding.

    Returns:
        Callable taking ``(obj, obs)`` and returning whatever ``func`` returns.
    """
    @functools.wraps(func)
    def wrapper(obj, obs):
        mol_graph = MolAdapter().restore(obs)
        return func(obj, mol_graph)
    return wrapper


@adapter_func_to_molgraph
def ECFP(obs: Any):
    """ Extended-Connectivity (Morgan) fingerprint of the molecule as a numpy array.

    Uses radius 2 and 1024 bits, with feature invariants and chirality switched off.
    ``obs`` is the object handed over by the adapter decorator; it must expose ``get_rw_molecule``.
    """
    morgan_settings = dict(radius=2, nBits=1024, useFeatures=False, useChirality=False)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(obs.get_rw_molecule(), **morgan_settings)
    return np.array(bit_vector)


@adapter_func_to_molgraph
def RDKF(obs: Any):
    """ RDKit fingerprint of the molecule (default ``RDKFingerprint`` settings) as a numpy array.

    ``obs`` is the object handed over by the adapter decorator; it must expose ``get_rw_molecule``.
    """
    return np.array(RDKFingerprint(obs.get_rw_molecule()))


@adapter_func_to_molgraph
def atom_pair(obs: Any):
    """ Atom-pair fingerprint of the molecule, generated with ``fpSize=1024``, as a numpy array.

    ``obs`` is the object handed over by the adapter decorator; it must expose ``get_rw_molecule``.
    """
    rw_mol = obs.get_rw_molecule()
    generator = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=1024)
    return np.array(generator.GetFingerprint(rw_mol))


@adapter_func_to_molgraph
def topological_torsion(obs: Any):
    """ Topological-torsion fingerprint of the molecule, generated with ``fpSize=1024``, as a numpy array.

    ``obs`` is the object handed over by the adapter decorator; it must expose ``get_rw_molecule``.
    """
    rw_mol = obs.get_rw_molecule()
    generator = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=1024)
    return np.array(generator.GetFingerprint(rw_mol))


# Names of the RDKit descriptors computed by ``mol_descriptors``.
# The order of the names defines the order of the values in the returned list.
_MOL_DESCRIPTOR_NAMES = (
    'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v',
    'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11',
    'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7',
    'EState_VSA8', 'EState_VSA9', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2',
    'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt',
    'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge',
    'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge',
    'MinEStateIndex', 'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount',
    'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings',
    'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors',
    'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds',
    'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings',
    'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12',
    'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5',
    'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 'SMR_VSA10',
    'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8',
    'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
    'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8',
    'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3',
    'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9',
    'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH',
    'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine',
    'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2',
    'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide',
    'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide',
    'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo',
    'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido',
    'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
    'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
    'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho',
    'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
    'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine',
    'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd',
    'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan',
    'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed',
)


@functools.lru_cache(maxsize=1)
def _mol_descriptor_calculator() -> MolecularDescriptorCalculator:
    """ Build the descriptor calculator on first use and reuse it for all later calls. """
    # A list is passed, exactly as before, so the calculator receives the same argument type.
    return MolecularDescriptorCalculator(list(_MOL_DESCRIPTOR_NAMES))


@adapter_func_to_molgraph
def mol_descriptors(obs: Any):
    """ Encode the molecule as a list of RDKit molecular descriptor values.

    The descriptors are those named in ``_MOL_DESCRIPTOR_NAMES``, in that order. The calculator
    depends only on this fixed list of names, so it is created once instead of on every call.

    Args:
        obs: object handed over by the adapter decorator; it must expose ``get_rw_molecule``.

    Returns:
        List with one descriptor value per name (a plain list, not a numpy array).
    """
    molecule = obs.get_rw_molecule()
    return list(_mol_descriptor_calculator().CalcDescriptors(molecule))


class Mol2Vec:
    """ Molecule encoder based on a pretrained Mol2Vec (word2vec) model.

    On construction the pretrained model file is requested through ``download_from_github``
    and loaded with gensim. Calling the instance with an observation returns the embedding
    of the molecule as a float numpy array.
    """

    PRETRAINED_WORD2VEC = 'examples/molecule_search/data/pretrained_models/model_300dim.pkl'
    GITHUB_URL = 'https://github.com/samoturk/mol2vec/raw/master/examples/models/model_300dim.pkl'

    def __init__(self):
        self.file_path = os.path.join(project_root(), Mol2Vec.PRETRAINED_WORD2VEC)
        # presumably a no-op when the file is already present -- confirm in ``utils.download_from_github``
        download_from_github(self.file_path,
                             Mol2Vec.GITHUB_URL,
                             message="Downloading pretrained model for molecules encoding...")

        # NOTE(review): gensim's ``load`` unpickles the file, and unpickling can execute arbitrary
        # code. The file comes from ``GITHUB_URL``; keep that source trusted or verify a checksum.
        self.model = word2vec.Word2Vec.load(self.file_path)

    @adapter_method_to_molgraph
    def __call__(self, obs: Any):
        """ Return the Mol2Vec embedding of the molecule behind ``obs`` as a float numpy array.

        ``obs`` is the object handed over by the adapter decorator; it must expose ``get_rw_molecule``.
        Substructure identifiers missing from the model vocabulary are replaced by the 'UNK' vector.
        """
        molecule = obs.get_rw_molecule()
        sentence = MolSentence(mol2alt_sentence(molecule, radius=1))
        embedding = self.sentences2vec([sentence], self.model, unseen='UNK')[0]
        return np.array(embedding).astype(float)

    @staticmethod
    def sentences2vec(sentences: List[MolSentence], model: Word2Vec, unseen: Optional[str] = None) -> np.array:
        """Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
        sum of vectors for individual words.

        A sentence that contributes no words (it is empty, or ``unseen`` is not set and none of
        its words are in the vocabulary) yields a zero vector of the model's vector size, so
        every row of the result has the same length.

        Parameters
        ----------
        sentences : list, array
            List with sentences
        model : word2vec.Word2Vec
            Gensim word2vec model
        unseen : None, str
            Keyword for unseen words. If None, those words are skipped.
            https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

        Returns
        -------
        np.array
        """
        word_vectors = model.wv
        # Membership is tested directly on the vocabulary mapping: this avoids copying the
        # vocabulary into a set per call and re-intersecting it with the sentence per word.
        vocabulary = word_vectors.key_to_index
        unseen_vec = word_vectors.get_vector(unseen) if unseen else None
        # Start value of the sum; it keeps the dtype of the word vectors and fixes the result
        # shape even when there is nothing to add.
        zero_vec = np.zeros(word_vectors.vector_size, dtype=word_vectors.vectors.dtype)

        vec = []
        for sentence in sentences:
            if unseen:
                word_vecs = [word_vectors.get_vector(word) if word in vocabulary else unseen_vec
                             for word in sentence]
            else:
                word_vecs = [word_vectors.get_vector(word) for word in sentence if word in vocabulary]
            vec.append(sum(word_vecs, zero_vec))
        return np.array(vec)


class MoleculeTransformer:
    """ Molecule encoder that embeds SMILES strings with a pretrained transformer encoder.

    Based on https://github.com/mpcrlab/MolecularTransformerEmbeddings

    On construction the pretrained checkpoint is requested through ``download_from_github``,
    loaded into a ``Transformer`` on CPU, and its encoder is kept for inference.
    """

    PRETRAINED_TRANSFORMER = 'examples/molecule_search/data/pretrained_models/pretrained.ckpt'
    GITHUB_URL = 'https://github.com/mpcrlab/MolecularTransformerEmbeddings/releases/download/' \
                 'checkpoints/pretrained.ckpt'

    def __init__(self, embedding_size: int = 512, num_layers: int = 6, max_length: int = 256):
        """
        Args:
            embedding_size: embedding size passed to ``Transformer``.
            num_layers: number of layers passed to ``Transformer``.
            max_length: maximum number of tokens (start token included) kept from a SMILES string.
        """
        self.log = default_log(self)

        self.file_path = os.path.join(project_root(), MoleculeTransformer.PRETRAINED_TRANSFORMER)
        # presumably a no-op when the file is already present -- confirm in ``utils.download_from_github``
        download_from_github(self.file_path,
                             MoleculeTransformer.GITHUB_URL,
                             message="Downloading pretrained model for molecules encoding...")
        self.model = self._model_setup(embedding_size, num_layers)
        self.encoder = self.model.encoder.cpu()
        self.max_length = max_length

    def _model_setup(self, embedding_size: int, num_layers: int):
        """ Create the transformer in eval mode, load the pretrained weights and return it on CPU. """
        model = Transformer(ALPHABET_SIZE, embedding_size, num_layers).eval()
        # The state dict is loaded through a DataParallel wrapper and the wrapped module is
        # returned afterwards; presumably the checkpoint keys carry the wrapper's "module."
        # prefix -- TODO confirm against the checkpoint.
        model = torch.nn.DataParallel(model)
        # NOTE(review): ``torch.load`` unpickles the checkpoint, and unpickling can execute
        # arbitrary code. The file comes from ``GITHUB_URL``; keep that source trusted.
        checkpoint = torch.load(self.file_path, map_location=torch.device("cpu"))
        model.load_state_dict(checkpoint['state_dict'])
        return model.module.cpu()

    @adapter_method_to_molgraph
    def __call__(self, obs: Any):
        """ Return the embedding of the molecule behind ``obs`` as a numpy array.

        ``obs`` is the object handed over by the adapter decorator; it must expose ``get_smiles``.
        The encoder output for the single encoded SMILES is averaged over its first axis
        (presumably the token positions -- confirm against the encoder's output layout).
        """
        smiles = obs.get_smiles()
        with torch.no_grad():
            encoded = self.encode_smiles(smiles)
            mask = create_masks(encoded)
            # ``encoded`` holds exactly one sequence, so index 0 selects its output.
            embedding = self.encoder(encoded, mask)[0].numpy()
            embedding = embedding.mean(axis=0)
        return embedding

    @staticmethod
    def encode_char(c):
        """ Map a character to its token index: the code point shifted down by 32. """
        return ord(c) - 32

    def encode_smiles(self, string: str, start_char=EXTRA_CHARS['seq_start']):
        """ Convert a SMILES string into a ``(1, length)`` tensor of token indices.

        The sequence starts with the code point of ``start_char`` (not shifted by 32, unlike
        the other characters), is truncated to ``self.max_length`` tokens and gets a leading
        dimension of size one.
        """
        return torch.tensor([ord(start_char)] +
                            [self.encode_char(c) for c in string], dtype=torch.long)[:self.max_length].unsqueeze(0)
13 changes: 3 additions & 10 deletions examples/molecule_search/mol_metrics.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import os
import pickle
import sys
from typing import Dict

import requests
from rdkit import RDConfig, Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.QED import qed
from rdkit.Chem.rdchem import RWMol
from typing import Dict

from examples.molecule_search.constants import ZINC_LOGP_MEAN, ZINC_LOGP_STD, ZINC_SA_MEAN, ZINC_SA_STD, \
ZINC_CYCLE_MEAN, ZINC_CYCLE_STD, MIN_LONG_CYCLE_SIZE
from examples.molecule_search.mol_graph import MolGraph
from examples.molecule_search.utils import largest_ring_size
from examples.molecule_search.utils import largest_ring_size, download_from_github
from golem.core.paths import project_root

sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
Expand Down Expand Up @@ -160,13 +159,7 @@ def __call__(self, mol_graph: MolGraph) -> float:
return -avg_score

def load_shingles(self) -> Dict:
save_dir = os.path.dirname(self.file_path)
os.makedirs(save_dir, exist_ok=True)

if not os.path.exists(self.file_path):
response = requests.get(self.github_url)
with open(self.file_path, "wb") as new_file:
new_file.write(response.content)
download_from_github(self.file_path, self.github_url)

with open(self.file_path, "rb") as pyc:
db_shingles = pickle.load(pyc)
Expand Down
21 changes: 21 additions & 0 deletions examples/molecule_search/mol_transformer/LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Machine Perception & Cognitive Robotics Laboratory

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Empty file.
Loading