diff options
Diffstat (limited to 'lstm_chem/utils')
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/config.py | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/dirs.py | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/smiles_tokenizer.py | 9 | ||||
-rw-r--r--[-rwxr-xr-x] | lstm_chem/utils/smiles_tokenizer2.py (renamed from lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py) | 64 |
4 files changed, 25 insertions, 48 deletions
diff --git a/lstm_chem/utils/config.py b/lstm_chem/utils/config.py index fff7359..fff7359 100755..100644 --- a/lstm_chem/utils/config.py +++ b/lstm_chem/utils/config.py diff --git a/lstm_chem/utils/dirs.py b/lstm_chem/utils/dirs.py index bcd2a49..bcd2a49 100755..100644 --- a/lstm_chem/utils/dirs.py +++ b/lstm_chem/utils/dirs.py diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py index d15d625..f998c4a 100755..100644 --- a/lstm_chem/utils/smiles_tokenizer.py +++ b/lstm_chem/utils/smiles_tokenizer.py @@ -1,8 +1,5 @@ -import copy import numpy as np -import time - class SmilesTokenizer(object): def __init__(self): @@ -30,7 +27,7 @@ class SmilesTokenizer(object): 'S', 'K', 'V', - 'I', + 'I' ] special = [ '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5', @@ -51,8 +48,6 @@ class SmilesTokenizer(object): N = len(smiles) i = 0 token = [] - - timeout = time.time() + 5 # 5 seconds from now while (i < N): for j in range(self.table_len): symbol = self.table[j] @@ -60,8 +55,6 @@ class SmilesTokenizer(object): token.append(symbol) i += len(symbol) break - if time.time() > timeout: - break return token def one_hot_encode(self, tokenized_smiles): diff --git a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py b/lstm_chem/utils/smiles_tokenizer2.py index d15d625..29575ba 100755..100644 --- a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py +++ b/lstm_chem/utils/smiles_tokenizer2.py @@ -1,36 +1,11 @@ -import copy import numpy as np -import time - class SmilesTokenizer(object): def __init__(self): atoms = [ - 'Li', - 'Na', - 'Al', - 'Si', - 'Cl', - 'Sc', - 'Zn', - 'As', - 'Se', - 'Br', - 'Sn', - 'Te', - 'Cn', - 'H', - 'B', - 'C', - 'N', - 'O', - 'F', - 'P', - 'S', - 'K', - 'V', - 'I', + 'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N', + 'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te' ] special = [ '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5', @@ -39,29 +14,38 @@ class SmilesTokenizer(object): padding = ['G', 'A', 'E'] self.table = sorted(atoms, key=len, reverse=True) + special + padding - self.table_len = len(self.table) + table_len = len(self.table) + + self.table_2_chars = list(filter(lambda x: len(x) == 2, self.table)) + self.table_1_chars = list(filter(lambda x: len(x) == 1, self.table)) self.one_hot_dict = {} for i, symbol in enumerate(self.table): - vec = np.zeros(self.table_len, dtype=np.float32) + vec = np.zeros(table_len, dtype=np.float32) vec[i] = 1 self.one_hot_dict[symbol] = vec def tokenize(self, smiles): + smiles = smiles + ' ' N = len(smiles) - i = 0 token = [] - - timeout = time.time() + 5 # 5 seconds from now + i = 0 while (i < N): - for j in range(self.table_len): - symbol = self.table[j] - if symbol == smiles[i:i + len(symbol)]: - token.append(symbol) - i += len(symbol) - break - if time.time() > timeout: - break + c1 = smiles[i] + c2 = smiles[i:i + 2] + + if c2 in self.table_2_chars: + token.append(c2) + i += 2 + continue + + if c1 in self.table_1_chars: + token.append(c1) + i += 1 + continue + + i += 1 + return token def one_hot_encode(self, tokenized_smiles): |