Diffstat (limited to 'lstm_chem')
-rw-r--r--[-rwxr-xr-x] | lstm_chem/__init__.py                | 0
-rw-r--r--[-rwxr-xr-x] | lstm_chem/data_loader.py             | 2
-rw-r--r--[-rwxr-xr-x] | lstm_chem/finetuner.py               | 5
-rw-r--r--[-rwxr-xr-x] | lstm_chem/generator.py               | 2
-rw-r--r--[-rwxr-xr-x] | lstm_chem/model.py                   | 2
-rw-r--r--[-rwxr-xr-x] | lstm_chem/trainer.py                 | 3
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/config.py            | 0
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/dirs.py              | 0
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/smiles_tokenizer.py  | 9
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/smiles_tokenizer2.py (renamed from lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py) | 64
10 files changed, 33 insertions, 54 deletions
diff --git a/lstm_chem/__init__.py b/lstm_chem/__init__.py
index 8b13789..8b13789 100755..100644
--- a/lstm_chem/__init__.py
+++ b/lstm_chem/__init__.py
diff --git a/lstm_chem/data_loader.py b/lstm_chem/data_loader.py
index 86ddbba..243f7e6 100755..100644
--- a/lstm_chem/data_loader.py
+++ b/lstm_chem/data_loader.py
@@ -3,7 +3,7 @@ import os
 import numpy as np
 from tqdm import tqdm
 from tensorflow.keras.utils import Sequence
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 
 
 class DataLoader(Sequence):
diff --git a/lstm_chem/finetuner.py b/lstm_chem/finetuner.py
index 904958b..24f26ce 100755..100644
--- a/lstm_chem/finetuner.py
+++ b/lstm_chem/finetuner.py
@@ -1,4 +1,4 @@
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 from lstm_chem.generator import LSTMChemGenerator
 
 
@@ -14,7 +14,8 @@ class LSTMChemFinetuner(LSTMChemGenerator):
         self.model.compile(optimizer=self.config.optimizer,
                            loss='categorical_crossentropy')
 
-        history = self.model.fit_generator(
+#        history = self.model.fit_generator(
+        history = self.model.fit(
             self.finetune_data_loader,
             steps_per_epoch=self.finetune_data_loader.__len__(),
             epochs=self.config.finetune_epochs,
diff --git a/lstm_chem/generator.py b/lstm_chem/generator.py
index 498f864..4f80e9f 100755..100644
--- a/lstm_chem/generator.py
+++ b/lstm_chem/generator.py
@@ -1,6 +1,6 @@
 from tqdm import tqdm
 import numpy as np
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 
 
 class LSTMChemGenerator(object):
diff --git a/lstm_chem/model.py b/lstm_chem/model.py
index 079589a..368a834 100755..100644
--- a/lstm_chem/model.py
+++ b/lstm_chem/model.py
@@ -4,7 +4,7 @@ from tensorflow.keras import Sequential
 from tensorflow.keras.models import model_from_json
 from tensorflow.keras.layers import LSTM, Dense
 from tensorflow.keras.initializers import RandomNormal
-from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
+from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer
 
 
 class LSTMChem(object):
diff --git a/lstm_chem/trainer.py b/lstm_chem/trainer.py
index 4e8057e..0ed608a 100755..100644
--- a/lstm_chem/trainer.py
+++ b/lstm_chem/trainer.py
@@ -32,7 +32,8 @@ class LSTMChemTrainer(object):
         ))
 
     def train(self):
-        history = self.model.fit_generator(
+#        history = self.model.fit_generator(
+        history = self.model.fit(
             self.train_data_loader,
             steps_per_epoch=self.train_data_loader.__len__(),
             epochs=self.config.num_epochs,
diff --git a/lstm_chem/utils/config.py b/lstm_chem/utils/config.py
index fff7359..fff7359 100755..100644
--- a/lstm_chem/utils/config.py
+++ b/lstm_chem/utils/config.py
diff --git a/lstm_chem/utils/dirs.py b/lstm_chem/utils/dirs.py
index bcd2a49..bcd2a49 100755..100644
--- a/lstm_chem/utils/dirs.py
+++ b/lstm_chem/utils/dirs.py
diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py
index d15d625..f998c4a 100755..100644
--- a/lstm_chem/utils/smiles_tokenizer.py
+++ b/lstm_chem/utils/smiles_tokenizer.py
@@ -1,8 +1,5 @@
-import copy
 import numpy as np
-import time
-
 
 
 class SmilesTokenizer(object):
     def __init__(self):
@@ -30,7 +27,7 @@ class SmilesTokenizer(object):
             'S',
             'K',
             'V',
-            'I',
+            'I'
         ]
         special = [
             '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
@@ -51,8 +48,6 @@
         N = len(smiles)
         i = 0
         token = []
-
-        timeout = time.time() + 5  # 5 seconds from now
         while (i < N):
             for j in range(self.table_len):
                 symbol = self.table[j]
@@ -60,8 +55,6 @@
                     token.append(symbol)
                     i += len(symbol)
                     break
-            if time.time() > timeout:
-                break
         return token
 
     def one_hot_encode(self, tokenized_smiles):
diff --git a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py b/lstm_chem/utils/smiles_tokenizer2.py
index d15d625..29575ba 100755..100644
--- a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
+++ b/lstm_chem/utils/smiles_tokenizer2.py
@@ -1,36 +1,11 @@
-import copy
 import numpy as np
-import time
-
 
 
 class SmilesTokenizer(object):
     def __init__(self):
         atoms = [
-            'Li',
-            'Na',
-            'Al',
-            'Si',
-            'Cl',
-            'Sc',
-            'Zn',
-            'As',
-            'Se',
-            'Br',
-            'Sn',
-            'Te',
-            'Cn',
-            'H',
-            'B',
-            'C',
-            'N',
-            'O',
-            'F',
-            'P',
-            'S',
-            'K',
-            'V',
-            'I',
+            'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N',
+            'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te'
         ]
         special = [
             '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
@@ -39,29 +14,38 @@ class SmilesTokenizer(object):
         padding = ['G', 'A', 'E']
 
         self.table = sorted(atoms, key=len, reverse=True) + special + padding
-        self.table_len = len(self.table)
+        table_len = len(self.table)
+
+        self.table_2_chars = list(filter(lambda x: len(x) == 2, self.table))
+        self.table_1_chars = list(filter(lambda x: len(x) == 1, self.table))
 
         self.one_hot_dict = {}
         for i, symbol in enumerate(self.table):
-            vec = np.zeros(self.table_len, dtype=np.float32)
+            vec = np.zeros(table_len, dtype=np.float32)
             vec[i] = 1
             self.one_hot_dict[symbol] = vec
 
     def tokenize(self, smiles):
+        smiles = smiles + ' '
         N = len(smiles)
-        i = 0
         token = []
-
-        timeout = time.time() + 5  # 5 seconds from now
+        i = 0
         while (i < N):
-            for j in range(self.table_len):
-                symbol = self.table[j]
-                if symbol == smiles[i:i + len(symbol)]:
-                    token.append(symbol)
-                    i += len(symbol)
-                    break
-            if time.time() > timeout:
-                break
+            c1 = smiles[i]
+            c2 = smiles[i:i + 2]
+
+            if c2 in self.table_2_chars:
+                token.append(c2)
+                i += 2
+                continue
+
+            if c1 in self.table_1_chars:
+                token.append(c1)
+                i += 1
+                continue
+
+            i += 1
+
         return token
 
     def one_hot_encode(self, tokenized_smiles):
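The rewritten tokenizer in smiles_tokenizer2.py replaces the per-symbol linear scan (and its 5-second timeout guard) with two lookup tables split by symbol length, so two-character atoms such as 'Br' and 'Cl' are matched before single characters. A minimal usage sketch, assuming the repository root is on PYTHONPATH; the SMILES string is an arbitrary example, not from the repo:

    # Sketch only: exercises the new lookup-based tokenize() from smiles_tokenizer2.
    from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer

    st = SmilesTokenizer()
    # Two-character symbols ('Br', 'Cl') are tried before single characters.
    print(st.tokenize('BrC(=O)Cl'))   # ['Br', 'C', '(', '=', 'O', ')', 'Cl']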
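The trainer.py and finetuner.py hunks swap the deprecated fit_generator for fit, which in TensorFlow 2.x accepts a keras.utils.Sequence (such as the repo's DataLoader) directly. A self-contained sketch of the same pattern with placeholder model and data, not the repository's objects:

    import numpy as np
    import tensorflow as tf

    # Stand-in for lstm_chem's DataLoader: any keras.utils.Sequence works with fit().
    class ToyLoader(tf.keras.utils.Sequence):
        def __len__(self):
            return 4                                   # batches per epoch
        def __getitem__(self, idx):
            x = np.random.rand(8, 10).astype(np.float32)
            y = np.random.rand(8, 1).astype(np.float32)
            return x, y

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(10,))])
    model.compile(optimizer='adam', loss='mse')
    loader = ToyLoader()
    # fit() replaces the deprecated fit_generator() call shown in the diff.
    history = model.fit(loader, steps_per_epoch=len(loader), epochs=1)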