# Recovered from a git patch (cgit export) that had been flattened onto one line:
#   commit  61ce4e7b089d68395be2221f64d89040c0b14a34
#   author  Navan Chauhan
#   date    Fri, 31 Jul 2020 21:23:03 +0530
#   subject added AI model
#   path    app/lstm_chem/utils/smiles_tokenizer.py (new file, mode 100755)
import copy
import time

import numpy as np


class SmilesTokenizer(object):
    """Split a SMILES string into table symbols and one-hot encode them.

    Attributes:
        table: ordered list of every recognised symbol. Atom symbols come
            first (two-letter ones before one-letter ones), then the
            bracket/bond/digit/charge/lower-case symbols, then the three
            symbols the constructor lists as ``padding``.
        table_len: ``len(table)``; also the width of every one-hot vector.
        one_hot_dict: maps each symbol to a float32 vector of length
            ``table_len`` holding a single 1 at the symbol's table index.
    """

    def __init__(self):
        """Build the symbol table and the per-symbol one-hot vectors."""
        atoms = [
            'Li',
            'Na',
            'Al',
            'Si',
            'Cl',
            'Sc',
            'Zn',
            'As',
            'Se',
            'Br',
            'Sn',
            'Te',
            'Cn',
            'H',
            'B',
            'C',
            'N',
            'O',
            'F',
            'P',
            'S',
            'K',
            'V',
            'I',
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        # NOTE(review): presumably start / pad / end markers used by the
        # model pipeline -- confirm against the code that consumes them.
        padding = ['G', 'A', 'E']

        # tokenize() takes the first table entry that matches, so longer
        # atom symbols must precede their one-letter prefixes ('Cl' before
        # 'C'). The same holds inside `special` ('se' before 's').
        # Consequence: the text 'Cn' is always emitted as the single symbol
        # 'Cn', never as 'C' followed by 'n'.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        self.one_hot_dict = {}
        for index, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[index] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Return the list of table symbols that make up ``smiles``.

        The string is scanned left to right; at each position the first
        entry of ``self.table`` that matches is consumed.

        Args:
            smiles: the string to split.

        Returns:
            A list of symbols. If some position matches no table entry,
            scanning stops there and the symbols found so far are returned
            (the remainder of the string is ignored).
        """
        tokens = []
        pos = 0
        end = len(smiles)

        while pos < end:
            for symbol in self.table:
                if smiles[pos:pos + len(symbol)] == symbol:
                    tokens.append(symbol)
                    pos += len(symbol)
                    break
            else:
                # Nothing in the table matches here, so `pos` can never
                # advance. Stop immediately with the partial result rather
                # than spinning until a wall-clock deadline expires.
                break
        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """Stack the one-hot vectors of ``tokenized_smiles`` into one batch.

        Args:
            tokenized_smiles: iterable of symbols, e.g. the output of
                ``tokenize``.

        Returns:
            A float32 array of shape ``(1, n_symbols, table_len)``. An empty
            input yields shape ``(1, 0, table_len)``.

        Raises:
            KeyError: if a symbol is not in ``self.one_hot_dict``.
        """
        vectors = [self.one_hot_dict[symbol] for symbol in tokenized_smiles]
        result = np.array(vectors, dtype=np.float32)
        # Explicit dimensions (instead of result.shape[1]) keep this valid
        # when `vectors` is empty and np.array() returns a 1-D array.
        return result.reshape(1, len(vectors), self.table_len)