diff options
author | Navan Chauhan <navanchauhan@gmail.com> | 2020-07-31 22:19:38 +0530 |
---|---|---|
committer | Navan Chauhan <navanchauhan@gmail.com> | 2020-07-31 22:19:38 +0530 |
commit | 9a253f896fba757778370c8ad6d40daa3b4cdad0 (patch) | |
tree | 2257187cdbc4c2085fb14df8bbb2e6ae6679c3e4 /lstm_chem/utils/smiles_tokenizer.py | |
parent | 61ce4e7b089d68395be2221f64d89040c0b14a34 (diff) |
added Curie-Generate BETA
Diffstat (limited to 'lstm_chem/utils/smiles_tokenizer.py')
-rwxr-xr-x | lstm_chem/utils/smiles_tokenizer.py | 72 |
1 files changed, 72 insertions, 0 deletions
diff --git a/lstm_chem/utils/smiles_tokenizer.py b/lstm_chem/utils/smiles_tokenizer.py new file mode 100755 index 0000000..d15d625 --- /dev/null +++ b/lstm_chem/utils/smiles_tokenizer.py @@ -0,0 +1,72 @@ +import copy +import numpy as np + +import time + + +class SmilesTokenizer(object): + def __init__(self): + atoms = [ + 'Li', + 'Na', + 'Al', + 'Si', + 'Cl', + 'Sc', + 'Zn', + 'As', + 'Se', + 'Br', + 'Sn', + 'Te', + 'Cn', + 'H', + 'B', + 'C', + 'N', + 'O', + 'F', + 'P', + 'S', + 'K', + 'V', + 'I', + ] + special = [ + '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5', + '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's' + ] + padding = ['G', 'A', 'E'] + + self.table = sorted(atoms, key=len, reverse=True) + special + padding + self.table_len = len(self.table) + + self.one_hot_dict = {} + for i, symbol in enumerate(self.table): + vec = np.zeros(self.table_len, dtype=np.float32) + vec[i] = 1 + self.one_hot_dict[symbol] = vec + + def tokenize(self, smiles): + N = len(smiles) + i = 0 + token = [] + + timeout = time.time() + 5 # 5 seconds from now + while (i < N): + for j in range(self.table_len): + symbol = self.table[j] + if symbol == smiles[i:i + len(symbol)]: + token.append(symbol) + i += len(symbol) + break + if time.time() > timeout: + break + return token + + def one_hot_encode(self, tokenized_smiles): + result = np.array( + [self.one_hot_dict[symbol] for symbol in tokenized_smiles], + dtype=np.float32) + result = result.reshape(1, result.shape[0], result.shape[1]) + return result |