aboutsummaryrefslogtreecommitdiff
path: root/lstm_chem/utils
diff options
context:
space:
mode:
Diffstat (limited to 'lstm_chem/utils')
-rwxr-xr-x lstm_chem/utils/config.py 26
-rwxr-xr-x lstm_chem/utils/dirs.py 12
-rwxr-xr-x lstm_chem/utils/smiles_tokenizer.py 72
-rwxr-xr-x lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py 72
4 files changed, 182 insertions, 0 deletions
diff --git a/lstm_chem/utils/config.py b/lstm_chem/utils/config.py
new file mode 100755
index 0000000..fff7359
--- /dev/null
+++ b/lstm_chem/utils/config.py
@@ -0,0 +1,26 @@
+import os
+import time
+import json
+from bunch import Bunch
+
+
def get_config_from_json(json_file):
    """Load a JSON configuration file into a ``Bunch``.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A ``Bunch`` built from the decoded top-level JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not depend on whatever
    # default encoding the current locale happens to select.
    with open(json_file, 'r', encoding='utf-8') as config_file:
        config_dict = json.load(config_file)
    return Bunch(config_dict)
+
+
def process_config(json_file):
    """Load a configuration file and attach the experiment directories.

    The following attributes are added to the loaded configuration:

    * ``config_file``: the path that was passed in.
    * ``exp_dir``: ``experiments/<YYYY-MM-DD>/<exp_name>``.
    * ``tensorboard_log_dir``: ``<exp_dir>/logs/``.
    * ``checkpoint_dir``: ``<exp_dir>/checkpoints/``.

    Args:
        json_file: Path of the JSON configuration file. The file must
            define ``exp_name``.

    Returns:
        The configuration object with the attributes above set.
    """
    config = get_config_from_json(json_file)
    config.config_file = json_file

    # Format the date exactly once. Calling time.localtime() separately for
    # each path could yield different dates when the call straddles
    # midnight, which would put logs/checkpoints outside of exp_dir.
    date_dir = time.strftime('%Y-%m-%d/', time.localtime())
    exp_dir = os.path.join('experiments', date_dir, config.exp_name)

    config.exp_dir = exp_dir
    config.tensorboard_log_dir = os.path.join(exp_dir, 'logs/')
    config.checkpoint_dir = os.path.join(exp_dir, 'checkpoints/')
    return config
diff --git a/lstm_chem/utils/dirs.py b/lstm_chem/utils/dirs.py
new file mode 100755
index 0000000..bcd2a49
--- /dev/null
+++ b/lstm_chem/utils/dirs.py
@@ -0,0 +1,12 @@
+import os
+import sys
+
+
def create_dirs(dirs):
    """Create every directory in ``dirs``, including missing parents.

    Directories that already exist are left untouched. A path that exists
    but is not a directory is treated as a failure. On any failure the
    error is printed and the process exits with a non-zero status.

    Args:
        dirs: Iterable of directory paths to create.

    Raises:
        SystemExit: With status 1 when a directory could not be created.
    """
    try:
        for dir_ in dirs:
            # exist_ok=True removes the check-then-create race of a separate
            # os.path.exists() test when several processes start together.
            os.makedirs(dir_, exist_ok=True)
    except Exception as err:
        print(f'Creating directories error: {err}')
        # A bare sys.exit() exits with status 0, i.e. reports success to
        # the calling shell even though directory creation failed.
        sys.exit(1)
diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py
new file mode 100755
index 0000000..d15d625
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer.py
@@ -0,0 +1,72 @@
+import copy
+import numpy as np
+
+import time
+
+
class SmilesTokenizer(object):
    """Greedy tokenizer and one-hot encoder for SMILES strings.

    The symbol table is fixed at construction time; the position of a
    symbol in the table is the index of its one-hot component.
    """

    def __init__(self):
        """Build the symbol table and the per-symbol one-hot vectors."""
        atoms = [
            'Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br',
            'Sn', 'Te', 'Cn',
            'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'K', 'V', 'I',
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        padding = ['G', 'A', 'E']

        # Two-letter atoms are placed first so that the first-match scan in
        # tokenize() picks e.g. 'Cl' before 'C'. sorted() is stable, so the
        # listed order is kept within each length.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        # Number of symbols, which is also the length of each one-hot vector.
        self.table_len = len(self.table)

        # Maps each symbol to its own float32 one-hot vector.
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Split ``smiles`` into symbols from the table.

        At every position the first table entry that matches is consumed.

        Args:
            smiles: String to tokenize.

        Returns:
            List of matched symbols in order. If some position matches no
            table entry, tokenization stops there and the tokens found so
            far are returned.
        """
        length = len(smiles)
        pos = 0
        tokens = []

        while pos < length:
            for symbol in self.table:
                if smiles.startswith(symbol, pos):
                    tokens.append(symbol)
                    pos += len(symbol)
                    break
            else:
                # Nothing in the table matches here, so the position can
                # never advance. Stop immediately instead of spinning until
                # a wall-clock timeout expires; the partial result returned
                # is the same.
                break
        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """One-hot encode a sequence of symbols.

        Args:
            tokenized_smiles: Iterable of symbols from the table.

        Returns:
            float32 array of shape ``(1, number_of_symbols, table_len)``.
            An empty input gives shape ``(1, 0, table_len)``.

        Raises:
            KeyError: If a symbol is not in the table.
        """
        vectors = [self.one_hot_dict[symbol] for symbol in tokenized_smiles]
        result = np.array(vectors, dtype=np.float32)
        # Use explicit dimensions: with no symbols the array is 1-D and has
        # no second axis to read the vector length from.
        return result.reshape(1, len(vectors), self.table_len)
diff --git a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py b/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
new file mode 100755
index 0000000..d15d625
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
@@ -0,0 +1,72 @@
+import copy
+import numpy as np
+
+import time
+
+
class SmilesTokenizer(object):
    """Greedy tokenizer and one-hot encoder for SMILES strings.

    The symbol table is fixed at construction time; the position of a
    symbol in the table is the index of its one-hot component.
    """

    def __init__(self):
        """Build the symbol table and the per-symbol one-hot vectors."""
        atoms = [
            'Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br',
            'Sn', 'Te', 'Cn',
            'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'K', 'V', 'I',
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        padding = ['G', 'A', 'E']

        # Two-letter atoms are placed first so that the first-match scan in
        # tokenize() picks e.g. 'Cl' before 'C'. sorted() is stable, so the
        # listed order is kept within each length.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        # Number of symbols, which is also the length of each one-hot vector.
        self.table_len = len(self.table)

        # Maps each symbol to its own float32 one-hot vector.
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Split ``smiles`` into symbols from the table.

        At every position the first table entry that matches is consumed.

        Args:
            smiles: String to tokenize.

        Returns:
            List of matched symbols in order. If some position matches no
            table entry, tokenization stops there and the tokens found so
            far are returned.
        """
        length = len(smiles)
        pos = 0
        tokens = []

        while pos < length:
            for symbol in self.table:
                if smiles.startswith(symbol, pos):
                    tokens.append(symbol)
                    pos += len(symbol)
                    break
            else:
                # Nothing in the table matches here, so the position can
                # never advance. Stop immediately instead of spinning until
                # a wall-clock timeout expires; the partial result returned
                # is the same.
                break
        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """One-hot encode a sequence of symbols.

        Args:
            tokenized_smiles: Iterable of symbols from the table.

        Returns:
            float32 array of shape ``(1, number_of_symbols, table_len)``.
            An empty input gives shape ``(1, 0, table_len)``.

        Raises:
            KeyError: If a symbol is not in the table.
        """
        vectors = [self.one_hot_dict[symbol] for symbol in tokenized_smiles]
        result = np.array(vectors, dtype=np.float32)
        # Use explicit dimensions: with no symbols the array is 1-D and has
        # no second axis to read the vector length from.
        return result.reshape(1, len(vectors), self.table_len)