Diffstat (limited to 'lstm_chem/utils')
-rw-r--r--  lstm_chem/utils/config.py             26
-rw-r--r--  lstm_chem/utils/dirs.py               12
-rw-r--r--  lstm_chem/utils/smiles_tokenizer.py   68
-rw-r--r--  lstm_chem/utils/smiles_tokenizer2.py  56
4 files changed, 162 insertions, 0 deletions
diff --git a/lstm_chem/utils/config.py b/lstm_chem/utils/config.py
new file mode 100644
index 0000000..fff7359
--- /dev/null
+++ b/lstm_chem/utils/config.py
@@ -0,0 +1,26 @@
+import os
+import time
+import json
+from bunch import Bunch
+
+
+def get_config_from_json(json_file):
+ with open(json_file, 'r') as config_file:
+ config_dict = json.load(config_file)
+ config = Bunch(config_dict)
+ return config
+
+
+def process_config(json_file):
+ config = get_config_from_json(json_file)
+ config.config_file = json_file
+ config.exp_dir = os.path.join(
+ 'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
+ config.exp_name)
+ config.tensorboard_log_dir = os.path.join(
+ 'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
+ config.exp_name, 'logs/')
+ config.checkpoint_dir = os.path.join(
+ 'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
+ config.exp_name, 'checkpoints/')
+ return config
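
For context, a minimal sketch of how process_config is meant to be consumed; the config.json file name and its exp_name value are hypothetical, but the function does require an exp_name key to build the paths:

    # config.json (hypothetical): {"exp_name": "lstm_chem_demo"}
    from lstm_chem.utils.config import process_config

    config = process_config('config.json')
    # All artifacts land under a per-day experiments/ tree:
    #   experiments/<YYYY-MM-DD>/lstm_chem_demo
    #   experiments/<YYYY-MM-DD>/lstm_chem_demo/logs/
    #   experiments/<YYYY-MM-DD>/lstm_chem_demo/checkpoints/
    print(config.exp_dir, config.tensorboard_log_dir, config.checkpoint_dir)

Since Bunch exposes dict keys as attributes, every key in the JSON file is also reachable as config.<key>.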
diff --git a/lstm_chem/utils/dirs.py b/lstm_chem/utils/dirs.py
new file mode 100644
index 0000000..bcd2a49
--- /dev/null
+++ b/lstm_chem/utils/dirs.py
@@ -0,0 +1,12 @@
+import os
+import sys
+
+
+def create_dirs(dirs):
+ try:
+ for dir_ in dirs:
+ if not os.path.exists(dir_):
+ os.makedirs(dir_)
+ except Exception as err:
+        print(f'Error creating directories: {err}')
+        sys.exit(1)
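
A sketch of the intended pairing with process_config above (same hypothetical config.json); create_dirs is typically called once before training so that later writes cannot fail on missing paths:

    from lstm_chem.utils.config import process_config
    from lstm_chem.utils.dirs import create_dirs

    config = process_config('config.json')  # hypothetical file, as above
    # Existing directories are left untouched; any failure prints the
    # error and exits with a non-zero status.
    create_dirs([config.tensorboard_log_dir, config.checkpoint_dir])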
diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py
new file mode 100644
index 0000000..f998c4a
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer.py
@@ -0,0 +1,68 @@
+import numpy as np
+
+
+class SmilesTokenizer(object):
+ def __init__(self):
+ atoms = [
+ 'Li',
+ 'Na',
+ 'Al',
+ 'Si',
+ 'Cl',
+ 'Sc',
+ 'Zn',
+ 'As',
+ 'Se',
+ 'Br',
+ 'Sn',
+ 'Te',
+ 'Cn',
+ 'H',
+ 'B',
+ 'C',
+ 'N',
+ 'O',
+ 'F',
+ 'P',
+ 'S',
+ 'K',
+ 'V',
+ 'I'
+ ]
+ special = [
+ '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
+ '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
+ ]
+ padding = ['G', 'A', 'E']
+
+ self.table = sorted(atoms, key=len, reverse=True) + special + padding
+ self.table_len = len(self.table)
+
+ self.one_hot_dict = {}
+ for i, symbol in enumerate(self.table):
+ vec = np.zeros(self.table_len, dtype=np.float32)
+ vec[i] = 1
+ self.one_hot_dict[symbol] = vec
+
+    def tokenize(self, smiles):
+        N = len(smiles)
+        i = 0
+        token = []
+        while i < N:
+            for j in range(self.table_len):
+                symbol = self.table[j]
+                if symbol == smiles[i:i + len(symbol)]:
+                    token.append(symbol)
+                    i += len(symbol)
+                    break
+            else:
+                # No symbol matched: skip the character to avoid an infinite loop.
+                i += 1
+        return token
+
+ def one_hot_encode(self, tokenized_smiles):
+ result = np.array(
+ [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
+ dtype=np.float32)
+ result = result.reshape(1, result.shape[0], result.shape[1])
+ return result
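
A quick sketch of the matching behaviour: sorted(atoms, key=len, reverse=True) puts two-character elements ahead of single characters in the table, so the scan in tokenize prefers 'Cl' over a bare 'C'. The expected values below follow from the code above:

    from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer

    st = SmilesTokenizer()
    tokens = st.tokenize('CCl')  # chloromethane
    print(tokens)                # ['C', 'Cl']
    # 24 atoms + 25 specials + 3 padding symbols = a 52-wide one-hot vector
    print(st.one_hot_encode(tokens).shape)  # (1, 2, 52)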
diff --git a/lstm_chem/utils/smiles_tokenizer2.py b/lstm_chem/utils/smiles_tokenizer2.py
new file mode 100644
index 0000000..29575ba
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer2.py
@@ -0,0 +1,56 @@
+import numpy as np
+
+
+class SmilesTokenizer(object):
+ def __init__(self):
+ atoms = [
+ 'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N',
+ 'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te'
+ ]
+ special = [
+ '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
+ '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
+ ]
+ padding = ['G', 'A', 'E']
+
+ self.table = sorted(atoms, key=len, reverse=True) + special + padding
+ table_len = len(self.table)
+
+ self.table_2_chars = list(filter(lambda x: len(x) == 2, self.table))
+ self.table_1_chars = list(filter(lambda x: len(x) == 1, self.table))
+
+ self.one_hot_dict = {}
+ for i, symbol in enumerate(self.table):
+ vec = np.zeros(table_len, dtype=np.float32)
+ vec[i] = 1
+ self.one_hot_dict[symbol] = vec
+
+ def tokenize(self, smiles):
+ smiles = smiles + ' '
+ N = len(smiles)
+ token = []
+ i = 0
+        while i < N:
+ c1 = smiles[i]
+ c2 = smiles[i:i + 2]
+
+ if c2 in self.table_2_chars:
+ token.append(c2)
+ i += 2
+ continue
+
+ if c1 in self.table_1_chars:
+ token.append(c1)
+ i += 1
+ continue
+
+ i += 1
+
+ return token
+
+ def one_hot_encode(self, tokenized_smiles):
+ result = np.array(
+ [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
+ dtype=np.float32)
+ result = result.reshape(1, result.shape[0], result.shape[1])
+ return result
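
Compared with the full-table scan in smiles_tokenizer.py, this variant does an explicit two-characters-first lookup, which avoids walking the whole table at every position. A small sketch; note that both files define a class named SmilesTokenizer, so the import path selects the variant. The expected values follow from the code above:

    from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer

    st = SmilesTokenizer()
    tokens = st.tokenize('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
    print(len(tokens))  # 21 -- aromatic ring atoms tokenize as lower-case 'c'
    # 19 atoms + 25 specials + 3 padding symbols = a 47-wide one-hot vector
    print(st.one_hot_encode(tokens).shape)  # (1, 21, 47)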