From e80badd278f54e8cde64243aa56687f8e6e32cc2 Mon Sep 17 00:00:00 2001
From: Navan Chauhan
Date: Sat, 1 Aug 2020 15:34:22 +0530
Subject: added alpha model

---
 lstm_chem/__init__.py                              |  0
 lstm_chem/data_loader.py                           |  2 +-
 lstm_chem/finetuner.py                             |  5 +-
 lstm_chem/generator.py                             |  2 +-
 lstm_chem/model.py                                 |  2 +-
 lstm_chem/trainer.py                               |  3 +-
 lstm_chem/utils/config.py                          |  0
 lstm_chem/utils/dirs.py                            |  0
 lstm_chem/utils/smiles_tokenizer.py                |  9 +--
 ...okenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py | 72 ----------------------
 lstm_chem/utils/smiles_tokenizer2.py               | 56 +++++++++++++++++
 11 files changed, 65 insertions(+), 86 deletions(-)
 mode change 100755 => 100644 lstm_chem/__init__.py
 mode change 100755 => 100644 lstm_chem/data_loader.py
 mode change 100755 => 100644 lstm_chem/finetuner.py
 mode change 100755 => 100644 lstm_chem/generator.py
 mode change 100755 => 100644 lstm_chem/model.py
 mode change 100755 => 100644 lstm_chem/trainer.py
 mode change 100755 => 100644 lstm_chem/utils/config.py
 mode change 100755 => 100644 lstm_chem/utils/dirs.py
 mode change 100755 => 100644 lstm_chem/utils/smiles_tokenizer.py
 delete mode 100755 lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
 create mode 100644 lstm_chem/utils/smiles_tokenizer2.py

(limited to 'lstm_chem')

diff --git a/lstm_chem/__init__.py b/lstm_chem/__init__.py
old mode 100755
new mode 100644
diff --git a/lstm_chem/data_loader.py b/lstm_chem/data_loader.py
old mode 100755
new mode 100644
index 86ddbba..243f7e6
--- a/lstm_chem/data_loader.py
+++ b/lstm_chem/data_loader.py
@@ -3,7 +3,7 @@ import os
 import numpy as np
 from tqdm import tqdm
 from tensorflow.keras.utils import Sequence
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 
 
 class DataLoader(Sequence):
diff --git a/lstm_chem/finetuner.py b/lstm_chem/finetuner.py
old mode 100755
new mode 100644
index 904958b..24f26ce
--- a/lstm_chem/finetuner.py
+++ b/lstm_chem/finetuner.py
@@ -1,4 +1,4 @@
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 from lstm_chem.generator import LSTMChemGenerator
 
 
@@ -14,7 +14,8 @@ class LSTMChemFinetuner(LSTMChemGenerator):
         self.model.compile(optimizer=self.config.optimizer,
                            loss='categorical_crossentropy')
 
-        history = self.model.fit_generator(
+#        history = self.model.fit_generator(
+        history = self.model.fit(
             self.finetune_data_loader,
             steps_per_epoch=self.finetune_data_loader.__len__(),
             epochs=self.config.finetune_epochs,
diff --git a/lstm_chem/generator.py b/lstm_chem/generator.py
old mode 100755
new mode 100644
index 498f864..4f80e9f
--- a/lstm_chem/generator.py
+++ b/lstm_chem/generator.py
@@ -1,6 +1,6 @@
 from tqdm import tqdm
 import numpy as np
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 
 
 class LSTMChemGenerator(object):
diff --git a/lstm_chem/model.py b/lstm_chem/model.py
old mode 100755
new mode 100644
index 079589a..368a834
--- a/lstm_chem/model.py
+++ b/lstm_chem/model.py
@@ -4,7 +4,7 @@ from tensorflow.keras import Sequential
 from tensorflow.keras.models import model_from_json
 from tensorflow.keras.layers import LSTM, Dense
 from tensorflow.keras.initializers import RandomNormal
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 
 
 class LSTMChem(object):
diff --git a/lstm_chem/trainer.py b/lstm_chem/trainer.py
old mode 100755
new mode 100644
index 4e8057e..0ed608a
--- a/lstm_chem/trainer.py
+++ b/lstm_chem/trainer.py
@@ -32,7 +32,8 @@ class LSTMChemTrainer(object):
         ))
 
     def train(self):
-        history = self.model.fit_generator(
+#        history = self.model.fit_generator(
+        history = self.model.fit(
             self.train_data_loader,
             steps_per_epoch=self.train_data_loader.__len__(),
             epochs=self.config.num_epochs,
diff --git a/lstm_chem/utils/config.py b/lstm_chem/utils/config.py
old mode 100755
new mode 100644
diff --git a/lstm_chem/utils/dirs.py b/lstm_chem/utils/dirs.py
old mode 100755
new mode 100644
diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py
old mode 100755
new mode 100644
index d15d625..f998c4a
--- a/lstm_chem/utils/smiles_tokenizer.py
+++ b/lstm_chem/utils/smiles_tokenizer.py
@@ -1,8 +1,5 @@
-import copy
 import numpy as np
 
-import time
-
 
 class SmilesTokenizer(object):
     def __init__(self):
@@ -30,7 +27,7 @@ class SmilesTokenizer(object):
             'S',
             'K',
             'V',
-            'I',
+            'I'
         ]
         special = [
             '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
             '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
         ]
         padding = ['G', 'A', 'E']
@@ -51,8 +48,6 @@
         N = len(smiles)
         i = 0
         token = []
-
-        timeout = time.time() + 5  # 5 seconds from now
         while (i < N):
             for j in range(self.table_len):
                 symbol = self.table[j]
                 if symbol == smiles[i:i + len(symbol)]:
                     token.append(symbol)
                     i += len(symbol)
                     break
-            if time.time() > timeout:
-                break
         return token
 
     def one_hot_encode(self, tokenized_smiles):
diff --git a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py b/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
deleted file mode 100755
index d15d625..0000000
--- a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import copy
-import numpy as np
-
-import time
-
-
-class SmilesTokenizer(object):
-    def __init__(self):
-        atoms = [
-            'Li',
-            'Na',
-            'Al',
-            'Si',
-            'Cl',
-            'Sc',
-            'Zn',
-            'As',
-            'Se',
-            'Br',
-            'Sn',
-            'Te',
-            'Cn',
-            'H',
-            'B',
-            'C',
-            'N',
-            'O',
-            'F',
-            'P',
-            'S',
-            'K',
-            'V',
-            'I',
-        ]
-        special = [
-            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
-            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
-        ]
-        padding = ['G', 'A', 'E']
-
-        self.table = sorted(atoms, key=len, reverse=True) + special + padding
-        self.table_len = len(self.table)
-
-        self.one_hot_dict = {}
-        for i, symbol in enumerate(self.table):
-            vec = np.zeros(self.table_len, dtype=np.float32)
-            vec[i] = 1
-            self.one_hot_dict[symbol] = vec
-
-    def tokenize(self, smiles):
-        N = len(smiles)
-        i = 0
-        token = []
-
-        timeout = time.time() + 5  # 5 seconds from now
-        while (i < N):
-            for j in range(self.table_len):
-                symbol = self.table[j]
-                if symbol == smiles[i:i + len(symbol)]:
-                    token.append(symbol)
-                    i += len(symbol)
-                    break
-            if time.time() > timeout:
-                break
-        return token
-
-    def one_hot_encode(self, tokenized_smiles):
-        result = np.array(
-            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
-            dtype=np.float32)
-        result = result.reshape(1, result.shape[0], result.shape[1])
-        return result
diff --git a/lstm_chem/utils/smiles_tokenizer2.py b/lstm_chem/utils/smiles_tokenizer2.py
new file mode 100644
index 0000000..29575ba
--- /dev/null
+++ b/lstm_chem/utils/smiles_tokenizer2.py
@@ -0,0 +1,56 @@
+import numpy as np
+
+
+class SmilesTokenizer(object):
+    def __init__(self):
+        atoms = [
+            'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N',
+            'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te'
+        ]
+        special = [
+            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
+            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
+        ]
+        padding = ['G', 'A', 'E']
+
+        self.table = sorted(atoms, key=len, reverse=True) + special + padding
+        table_len = len(self.table)
+
+        self.table_2_chars = list(filter(lambda x: len(x) == 2, self.table))
+        self.table_1_chars = list(filter(lambda x: len(x) == 1, self.table))
+
+        self.one_hot_dict = {}
+        for i, symbol in enumerate(self.table):
+            vec = np.zeros(table_len, dtype=np.float32)
+            vec[i] = 1
+            self.one_hot_dict[symbol] = vec
+
+    def tokenize(self, smiles):
+        smiles = smiles + ' '
+        N = len(smiles)
+        token = []
+        i = 0
+        while (i < N):
+            c1 = smiles[i]
+            c2 = smiles[i:i + 2]
+
+            if c2 in self.table_2_chars:
+                token.append(c2)
+                i += 2
+                continue
+
+            if c1 in self.table_1_chars:
+                token.append(c1)
+                i += 1
+                continue
+
+            i += 1
+
+        return token
+
+    def one_hot_encode(self, tokenized_smiles):
+        result = np.array(
+            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
+            dtype=np.float32)
+        result = result.reshape(1, result.shape[0], result.shape[1])
+        return result
--
cgit v1.2.3
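
A quick usage sketch of the new tokenizer, for reference only (not part of
the patch). It assumes the lstm_chem package above is on the import path;
the SMILES string is an arbitrary example (bromobenzene):

    from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer

    st = SmilesTokenizer()

    # tokenize() pads the input with a trailing space so the two-character
    # slice at the last position is safe, then checks the 2-char table
    # ('Br', 'Cl', 'Li', ...) before the 1-char table, so 'Br' stays one token.
    tokens = st.tokenize('Brc1ccccc1')
    print(tokens)          # ['Br', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1']

    # one_hot_encode() returns shape (1, n_tokens, table size); the table
    # here holds 47 symbols (19 atoms + 25 special characters + 3 padding).
    encoded = st.one_hot_encode(tokens)
    print(encoded.shape)   # (1, 9, 47)

Unlike the old tokenizer, which re-scanned the whole sorted symbol table at
every position and needed a 5-second timeout to avoid hanging on unknown
characters, the rewrite does at most two membership checks per position and
simply skips characters it does not recognize.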