!pip install pandas==1.3.5 sklearn==0.0.post1 tensorflow==2.9.2 dlomix==0.0.3 numpy==1.21.6 matplotlib==3.2.2 requests==2.23.0 --quietDLOmix embedding of Prosit model on ProteomeTools data
# Import and normalize/standarize data
import pandas as pd
import numpy as np
# Import and normalize the data
data = pd.read_csv('https://github.com/ProteomicsML/ProteomicsML/blob/main/datasets/retentiontime/ProteomeTools/small.zip?raw=true', compression='zip')
# shuffle and split dataset into internal (80%) and external (20%) datasets
data = data.sample(frac=1)
test_data = data[int(len(data)*0.8):]
data = data[:int(len(data)*0.8)]# Split the internal dataset into training and validation
# We have to split the data based on Sequences, to make sure we dont have cross-over sequences in the training and validation splits.
unique_sequences = list(set(data['sequence']))
# Shuffle the data to ensure unbiased data splitting
from random import shuffle
shuffle(unique_sequences)
# Split sequence 80-10-10 training, validation and testing split
train = unique_sequences[0:int(len(unique_sequences) * 0.8)]
validation = unique_sequences[int(len(unique_sequences) * 0.8):]
# Transfer the sequence split into data split
train = data[data['sequence'].isin(train)]
validation = data[data['sequence'].isin(validation)]
print('Training data points:', len(train),' Validation data points:', len(validation),' Testing data points:', len(test_data))
# Here we use test as an external dataset unlike the one used for training.Training data points: 63955 Validation data points: 16045 Testing data points: 20000
normalize = True
if normalize:
# Normalize
train_val_min, train_val_max = min(train['retention time'].min(), validation['retention time'].min()), max(train['retention time'].max(), validation['retention time'].max())
train['retention time'] = list((train['retention time'] - train_val_min) / (train_val_max - train_val_min))
validation['retention time'] = list((validation['retention time'] - train_val_min) / (train_val_max - train_val_min))
test_data['retention time'] = list((test_data['retention time'] - test_data['retention time'].min()) / (test_data['retention time'].max() - test_data['retention time'].min()))
else:
# Standardize
train_val_mean, train_val_std = np.mean(list(train['retention time']) + list(validation['retention time'])), np.std(list(train['retention time']) + list(validation['retention time']))
train['retention time'] = (train['retention time'] - train_val_mean) / train_val_std
validation['retention time'] = (validation['retention time'] - train_val_mean) / train_val_std
test_data['retention time'] = (test_data['retention time'] - np.mean(test_data['retention time'])) / np.std(test_data['retention time'])/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Setup parameters
sequence_length = 30
batch_size = 64
epochs=5# Setup data
from dlomix.data import RetentionTimeDataset
train_input = RetentionTimeDataset(data_source=tuple([np.array(train['sequence']), np.array(train['retention time'])]),
seq_length=sequence_length, batch_size=batch_size, test=False).train_data
val_input = RetentionTimeDataset(data_source=tuple([np.array(validation['sequence']), np.array(validation['retention time'])]),
seq_length=sequence_length, batch_size=batch_size, test=False).train_data
test_input = RetentionTimeDataset(data_source=tuple([np.array(test_data['sequence']), np.array(test_data['retention time'])]),
seq_length=sequence_length, batch_size=batch_size, test=False).train_data
# Setup PROSIT model from DLOmix
from dlomix.models.prosit import PrositRetentionTimePredictor
model = PrositRetentionTimePredictor(seq_length=sequence_length)
model.build((None, sequence_length))
model.summary()Model: "prosit_retention_time_predictor_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
string_lookup_2 (StringLook multiple 0
up)
embedding_2 (Embedding) multiple 352
sequential_4 (Sequential) (None, 30, 512) 1996800
attention_layer_2 (Attentio multiple 542
nLayer)
sequential_5 (Sequential) (None, 512) 262656
dense_5 (Dense) multiple 513
=================================================================
Total params: 2,260,863
Trainable params: 2,260,863
Non-trainable params: 0
_________________________________________________________________
from dlomix.eval.rt_eval import TimeDeltaMetric
import tensorflow as tf
# Compiling the keras model with loss function, metrics and optimizer
model.compile(loss='mse', metrics=['mae', TimeDeltaMetric()], optimizer=tf.keras.optimizers.Adam(learning_rate=0.005))
# Train the model
history = model.fit(x=train_input, epochs=epochs, batch_size=batch_size, validation_data=val_input)Epoch 1/5
998/998 [==============================] - 26s 22ms/step - loss: 0.6175 - mae: 0.1161 - timedelta: 0.1140 - val_loss: 0.0040 - val_mae: 0.0427 - val_timedelta: 0.0484
Epoch 2/5
998/998 [==============================] - 21s 21ms/step - loss: 0.0055 - mae: 0.0526 - timedelta: 0.0522 - val_loss: 0.0038 - val_mae: 0.0428 - val_timedelta: 0.0467
Epoch 3/5
998/998 [==============================] - 21s 21ms/step - loss: 0.0047 - mae: 0.0474 - timedelta: 0.0464 - val_loss: 0.0039 - val_mae: 0.0459 - val_timedelta: 0.0480
Epoch 4/5
998/998 [==============================] - 21s 21ms/step - loss: 0.6041 - mae: 0.2064 - timedelta: 0.1935 - val_loss: 0.0537 - val_mae: 0.1940 - val_timedelta: 0.1972
Epoch 5/5
998/998 [==============================] - 21s 21ms/step - loss: 0.0544 - mae: 0.1961 - timedelta: 0.1900 - val_loss: 0.0536 - val_mae: 0.1943 - val_timedelta: 0.1967
from dlomix.reports import RetentionTimeReport
report = RetentionTimeReport(output_path="./output", history=history)report.plot_keras_metric("loss")
report.plot_keras_metric("timedelta")
y_real = np.concatenate([y for x, y in val_input], axis=0)
y_pred = model.predict(validation['sequence'][:len(y_real)])
report.plot_residuals(y_real, y_pred, xrange=(-1, 1))501/501 [==============================] - 3s 3ms/step

history = model.fit(x=test_input, epochs=epochs, batch_size=batch_size)
import matplotlib.pyplot as plt
plt.plot(range(epochs), history.history['loss'], '-', color='r', label='Training loss')
plt.title(f'Training and validation loss of the refined model')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()Epoch 1/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0560 - mae: 0.1987 - timedelta: 0.1993
Epoch 2/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1986 - timedelta: 0.1987
Epoch 3/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1985 - timedelta: 0.1988
Epoch 4/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1985 - timedelta: 0.1991
Epoch 5/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1985 - timedelta: 0.1982
