diff options
Diffstat (limited to 'lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py')
-rwxr-xr-x | lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py | 72 |
1 files changed, 0 insertions, 72 deletions
import copy
import numpy as np

import time


class SmilesTokenizer(object):
    """Split SMILES strings into symbols and one-hot encode them.

    The vocabulary (``self.table``) is an ordered list: two-letter atoms
    first, then one-letter atoms, then bracket/bond/digit/aromatic symbols,
    and finally the three padding symbols. ``tokenize`` takes the first
    table entry that matches at the current position, so the order of the
    table defines how ambiguous prefixes are resolved.
    """

    def __init__(self):
        """Build the symbol table and the per-symbol one-hot vectors."""
        atoms = [
            'Li',
            'Na',
            'Al',
            'Si',
            'Cl',
            'Sc',
            'Zn',
            'As',
            'Se',
            'Br',
            'Sn',
            'Te',
            'Cn',
            'H',
            'B',
            'C',
            'N',
            'O',
            'F',
            'P',
            'S',
            'K',
            'V',
            'I',
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        # NOTE(review): presumably start / pad / end markers used by the
        # model pipeline -- confirm against the callers.
        padding = ['G', 'A', 'E']

        # Longest atoms first so that e.g. 'Cl' is matched before 'C'.
        # NOTE: this also means 'Cn' and 'Sc' win over 'C'+'n' and 'S'+'c'.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        # One independent float32 unit vector per symbol, keyed by symbol.
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Return the list of table symbols that spell out *smiles*.

        Scanning is left to right; at each position the first entry of
        ``self.table`` that matches is consumed. If no entry matches at
        some position, scanning stops there and the tokens collected so
        far are returned (best-effort, no exception is raised).

        Args:
            smiles: SMILES string to split.

        Returns:
            List of symbols from ``self.table``; may cover only a prefix
            of *smiles* if an unrecognised character is encountered.
        """
        length = len(smiles)
        pos = 0
        token = []

        while pos < length:
            for symbol in self.table:
                if smiles.startswith(symbol, pos):
                    token.append(symbol)
                    pos += len(symbol)
                    break
            else:
                # Nothing in the table matches here, so `pos` can never
                # advance. Stop right away instead of spinning until a
                # wall-clock timeout; the partial result is what gets
                # returned either way.
                break
        return token

    def one_hot_encode(self, tokenized_smiles):
        """One-hot encode a sequence of symbols.

        Args:
            tokenized_smiles: Iterable of symbols, each of which must be a
                key of ``self.one_hot_dict``.

        Returns:
            ``numpy.ndarray`` of dtype float32 with shape
            ``(1, number_of_symbols, self.table_len)``. An empty input
            yields shape ``(1, 0, self.table_len)``.

        Raises:
            KeyError: If a symbol is not in the table.
        """
        symbols = list(tokenized_smiles)
        # Allocate the final shape up front so an empty sequence still
        # produces a well-formed 3-D array instead of failing on reshape.
        result = np.zeros((1, len(symbols), self.table_len),
                          dtype=np.float32)
        for row, symbol in enumerate(symbols):
            result[0, row] = self.one_hot_dict[symbol]
        return result