diff options
Diffstat (limited to 'lstm_chem/utils')
-rw-r--r-- | lstm_chem/utils/config.py | 26 | ||||
-rw-r--r-- | lstm_chem/utils/dirs.py | 12 | ||||
-rw-r--r-- | lstm_chem/utils/smiles_tokenizer.py | 65 | ||||
-rw-r--r-- | lstm_chem/utils/smiles_tokenizer2.py | 56 |
4 files changed, 159 insertions, 0 deletions
"""Utilities for LSTM_Chem: config loading, directory creation, SMILES tokenizers.

NOTE(review): SOURCE is a cgit-rendered diff adding four files
(config.py, dirs.py, smiles_tokenizer.py, smiles_tokenizer2.py); this
block reconstructs them as one properly formatted module.  Because both
tokenizer files declare a class named ``SmilesTokenizer``, the second
(from smiles_tokenizer2.py) is renamed ``SmilesTokenizer2`` here — in
the real package they live in separate modules.
"""
import json
import os
import sys
import time

import numpy as np

# `bunch` is an unmaintained third-party package; fall back to a minimal
# dict-with-attribute-access stand-in so the module imports without it.
try:
    from bunch import Bunch
except ImportError:
    class Bunch(dict):
        """Dict whose keys are also readable/writable as attributes."""

        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError as err:
                raise AttributeError(name) from err

        def __setattr__(self, name, value):
            self[name] = value


def get_config_from_json(json_file):
    """Load *json_file* and return its top-level JSON object as a Bunch."""
    with open(json_file, 'r') as config_file:
        config_dict = json.load(config_file)
    return Bunch(config_dict)


def process_config(json_file):
    """Load a config file and attach experiment/log/checkpoint paths.

    Fix: the date stamp is computed once, so the three derived paths can
    never disagree (the original called ``time.strftime`` three separate
    times, which could straddle midnight).
    """
    config = get_config_from_json(json_file)
    config.config_file = json_file
    # Layout: experiments/YYYY-MM-DD/<exp_name>/{logs/,checkpoints/}
    base = os.path.join('experiments',
                        time.strftime('%Y-%m-%d/', time.localtime()),
                        config.exp_name)
    config.exp_dir = base
    config.tensorboard_log_dir = os.path.join(base, 'logs/')
    config.checkpoint_dir = os.path.join(base, 'checkpoints/')
    return config


def create_dirs(dirs):
    """Create every directory in *dirs* that does not already exist.

    Aborts the process on failure like the original, but with exit
    status 1 (the original called ``sys.exit()`` — status 0 — signalling
    success on error).  ``exist_ok=True`` replaces the racy
    exists-then-create check.
    """
    try:
        for dir_ in dirs:
            os.makedirs(dir_, exist_ok=True)
    except OSError as err:
        print(f'Creating directories error: {err}')
        sys.exit(1)


class SmilesTokenizer(object):
    """Greedy longest-match SMILES tokenizer (full-table scan variant)."""

    def __init__(self):
        atoms = [
            'Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br',
            'Sn', 'Te', 'Cn', 'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S',
            'K', 'V', 'I',
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        # G/E look like begin/end sentinels and A like padding for the
        # LSTM sequences — TODO confirm against the training code.
        padding = ['G', 'A', 'E']

        # Two-character symbols must come first so the greedy scan
        # prefers 'Cl' over 'C' (stable sort keeps original order within
        # each length group).
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        # symbol -> one-hot float32 vector of length table_len
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Split *smiles* into table symbols, longest match first.

        Bug fix: the original looped forever on any character absent
        from the table (the inner ``for`` found no match and ``i`` never
        advanced).  Unknown characters are now skipped, matching
        ``SmilesTokenizer2``'s behavior.
        """
        tokens = []
        i = 0
        n = len(smiles)
        while i < n:
            for symbol in self.table:
                if smiles.startswith(symbol, i):
                    tokens.append(symbol)
                    i += len(symbol)
                    break
            else:
                i += 1  # unknown character: skip instead of spinning forever
        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """Return a (1, len(tokens), table_len) float32 one-hot array."""
        result = np.array(
            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
            dtype=np.float32)
        return result.reshape(1, result.shape[0], result.shape[1])


class SmilesTokenizer2(object):
    """SMILES tokenizer with precomputed 1-/2-char symbol sets.

    NOTE(review): declared as ``SmilesTokenizer`` in its own file
    (smiles_tokenizer2.py); renamed here only because this flat
    reconstruction holds both classes.
    """

    def __init__(self):
        atoms = [
            'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N',
            'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te'
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        # G/E look like begin/end sentinels and A like padding for the
        # LSTM sequences — TODO confirm against the training code.
        padding = ['G', 'A', 'E']

        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        table_len = len(self.table)

        # Split by symbol length so tokenize() does two cheap membership
        # tests per position instead of scanning the whole table.
        self.table_2_chars = [s for s in self.table if len(s) == 2]
        self.table_1_chars = [s for s in self.table if len(s) == 1]

        # symbol -> one-hot float32 vector of length table_len
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Greedy 2-char-then-1-char scan; unknown characters are skipped."""
        smiles = smiles + ' '  # sentinel so smiles[i:i + 2] is safe at the end
        tokens = []
        i = 0
        n = len(smiles)
        while i < n:
            pair = smiles[i:i + 2]
            if pair in self.table_2_chars:
                tokens.append(pair)
                i += 2
                continue
            ch = smiles[i]
            if ch in self.table_1_chars:
                tokens.append(ch)
            i += 1  # advance past a matched 1-char symbol or an unknown char
        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """Return a (1, len(tokens), table_len) float32 one-hot array."""
        result = np.array(
            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
            dtype=np.float32)
        return result.reshape(1, result.shape[0], result.shape[1])