From 9a253f896fba757778370c8ad6d40daa3b4cdad0 Mon Sep 17 00:00:00 2001
From: Navan Chauhan
Date: Fri, 31 Jul 2020 22:19:38 +0530
Subject: added Curie-Generate BETA

---
 lstm_chem/utils/config.py                          | 26 ++++++++
 lstm_chem/utils/dirs.py                            | 12 ++++
 lstm_chem/utils/smiles_tokenizer.py                | 72 ++++++++++++++++++++++
 ...okenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py | 72 ++++++++++++++++++++++
 4 files changed, 182 insertions(+)
 create mode 100755 lstm_chem/utils/config.py
 create mode 100755 lstm_chem/utils/dirs.py
 create mode 100755 lstm_chem/utils/smiles_tokenizer.py
 create mode 100755 lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py

diff --git a/lstm_chem/utils/config.py b/lstm_chem/utils/config.py
new file mode 100755
index 0000000..fff7359
--- /dev/null
+++ b/lstm_chem/utils/config.py
@@ -0,0 +1,26 @@
+import os
+import time
+import json
+from bunch import Bunch
+
+
+def get_config_from_json(json_file):
+    with open(json_file, 'r') as config_file:
+        config_dict = json.load(config_file)
+    config = Bunch(config_dict)
+    return config
+
+
+def process_config(json_file):
+    config = get_config_from_json(json_file)
+    config.config_file = json_file
+    config.exp_dir = os.path.join(
+        'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
+        config.exp_name)
+    config.tensorboard_log_dir = os.path.join(
+        'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
+        config.exp_name, 'logs/')
+    config.checkpoint_dir = os.path.join(
+        'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
+        config.exp_name, 'checkpoints/')
+    return config
diff --git a/lstm_chem/utils/dirs.py b/lstm_chem/utils/dirs.py
new file mode 100755
index 0000000..bcd2a49
--- /dev/null
+++ b/lstm_chem/utils/dirs.py
@@ -0,0 +1,12 @@
+import os
+import sys
+
+
+def create_dirs(dirs):
+    try:
+        for dir_ in dirs:
+            if not os.path.exists(dir_):
+                os.makedirs(dir_)
+    except Exception as err:
+        print(f'Creating directories error: {err}')
+        sys.exit()
diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py
new file mode 100755
index 0000000..d15d625
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer.py
@@ -0,0 +1,72 @@
+import copy
+import numpy as np
+
+import time
+
+
+class SmilesTokenizer(object):
+    def __init__(self):
+        atoms = [
+            'Li',
+            'Na',
+            'Al',
+            'Si',
+            'Cl',
+            'Sc',
+            'Zn',
+            'As',
+            'Se',
+            'Br',
+            'Sn',
+            'Te',
+            'Cn',
+            'H',
+            'B',
+            'C',
+            'N',
+            'O',
+            'F',
+            'P',
+            'S',
+            'K',
+            'V',
+            'I',
+        ]
+        special = [
+            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
+            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
+        ]
+        padding = ['G', 'A', 'E']
+
+        self.table = sorted(atoms, key=len, reverse=True) + special + padding
+        self.table_len = len(self.table)
+
+        self.one_hot_dict = {}
+        for i, symbol in enumerate(self.table):
+            vec = np.zeros(self.table_len, dtype=np.float32)
+            vec[i] = 1
+            self.one_hot_dict[symbol] = vec
+
+    def tokenize(self, smiles):
+        N = len(smiles)
+        i = 0
+        token = []
+
+        timeout = time.time() + 5  # 5 seconds from now
+        while (i < N):
+            for j in range(self.table_len):
+                symbol = self.table[j]
+                if symbol == smiles[i:i + len(symbol)]:
+                    token.append(symbol)
+                    i += len(symbol)
+                    break
+            if time.time() > timeout:
+                break
+        return token
+
+    def one_hot_encode(self, tokenized_smiles):
+        result = np.array(
+            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
+            dtype=np.float32)
+        result = result.reshape(1, result.shape[0], result.shape[1])
+        return result
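The tokenizer above does greedy longest-match tokenization: self.table lists the two-character atoms first (sorted by descending length), so 'Cl' and 'Br' are matched before 'C' and 'B'. The 5-second timeout in tokenize() guards against the infinite loop that would otherwise occur when a character is not in self.table; in that case a partial token list is returned silently. A minimal usage sketch, not part of the commit (the aspirin SMILES string is an arbitrary example, and the vocabulary size of 52 is derived from the atom/special/padding lists in __init__):

    # Usage sketch (assumption: run from the repository root so that
    # lstm_chem is importable).
    from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer

    st = SmilesTokenizer()

    # Two-character symbols such as 'Cl' and 'Br' are tried before their
    # one-character prefixes because self.table sorts atoms by length.
    tokens = st.tokenize('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
    print(tokens)         # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', ...]

    # one_hot_encode returns a (1, sequence_length, vocabulary_size) array,
    # i.e. a single-sample batch ready for a Keras LSTM.
    encoded = st.one_hot_encode(tokens)
    print(encoded.shape)  # (1, 21, 52)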
diff --git a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py b/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
new file mode 100755
index 0000000..d15d625
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
@@ -0,0 +1,72 @@
+import copy
+import numpy as np
+
+import time
+
+
+class SmilesTokenizer(object):
+    def __init__(self):
+        atoms = [
+            'Li',
+            'Na',
+            'Al',
+            'Si',
+            'Cl',
+            'Sc',
+            'Zn',
+            'As',
+            'Se',
+            'Br',
+            'Sn',
+            'Te',
+            'Cn',
+            'H',
+            'B',
+            'C',
+            'N',
+            'O',
+            'F',
+            'P',
+            'S',
+            'K',
+            'V',
+            'I',
+        ]
+        special = [
+            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
+            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
+        ]
+        padding = ['G', 'A', 'E']
+
+        self.table = sorted(atoms, key=len, reverse=True) + special + padding
+        self.table_len = len(self.table)
+
+        self.one_hot_dict = {}
+        for i, symbol in enumerate(self.table):
+            vec = np.zeros(self.table_len, dtype=np.float32)
+            vec[i] = 1
+            self.one_hot_dict[symbol] = vec
+
+    def tokenize(self, smiles):
+        N = len(smiles)
+        i = 0
+        token = []
+
+        timeout = time.time() + 5  # 5 seconds from now
+        while (i < N):
+            for j in range(self.table_len):
+                symbol = self.table[j]
+                if symbol == smiles[i:i + len(symbol)]:
+                    token.append(symbol)
+                    i += len(symbol)
+                    break
+            if time.time() > timeout:
+                break
+        return token
+
+    def one_hot_encode(self, tokenized_smiles):
+        result = np.array(
+            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
+            dtype=np.float32)
+        result = result.reshape(1, result.shape[0], result.shape[1])
+        return result
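Taken together, config.py and dirs.py set up a per-day experiment tree under experiments/<YYYY-MM-DD>/<exp_name>/, with Bunch exposing the JSON keys as attributes. A minimal sketch of how a training script might wire them together, not part of the commit (the config path 'experiments/base_config.json' is hypothetical, and the 'exp_name' key is inferred from the attribute accesses in process_config):

    # Usage sketch (assumptions: the config file exists and contains at
    # least {"exp_name": "LSTM_Chem"}).
    from lstm_chem.utils.config import process_config
    from lstm_chem.utils.dirs import create_dirs

    config = process_config('experiments/base_config.json')

    # Create experiments/<date>/<exp_name>/{logs/,checkpoints/} if missing;
    # note that create_dirs exits the whole process on failure.
    create_dirs(
        [config.exp_dir, config.tensorboard_log_dir, config.checkpoint_dir])

    print(config.checkpoint_dir)
    # e.g. experiments/2020-07-31/LSTM_Chem/checkpoints/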