!pip install pandas==1.3.5 sklearn==0.0.post1 tensorflow==2.9.2 dlomix==0.0.3 numpy==1.21.6 matplotlib==3.2.2 requests==2.23.0 --quiet
DLOmix embedding of Prosit model on ProteomeTools data
# Import and normalize/standardize data
import pandas as pd
import numpy as np
# Load the ProteomeTools retention-time subset. pandas downloads the zipped
# CSV from the ProteomicsML GitHub repository and decompresses it on the fly.
data = pd.read_csv(
    'https://github.com/ProteomicsML/ProteomicsML/blob/main/datasets/retentiontime/ProteomeTools/small.zip?raw=true',
    compression='zip',
)
# Shuffle the rows, then split into an internal dataset (first 80%) and an
# external test dataset (last 20%).
data = data.sample(frac=1)
# Compute the boundary once, before `data` is rebound to the internal part.
split_index = int(len(data) * 0.8)
test_data = data[split_index:]
data = data[:split_index]
# Split the internal dataset into training and validation.
# The split is made on unique sequences rather than on rows, so that no
# sequence appears in both the training and the validation split.
unique_sequences = list(set(data['sequence']))

# Shuffle the sequences to ensure unbiased data splitting
from random import shuffle
shuffle(unique_sequences)

# 80% of the unique sequences go to training, the remaining 20% to validation
train_cutoff = int(len(unique_sequences) * 0.8)
train = unique_sequences[:train_cutoff]
validation = unique_sequences[train_cutoff:]

# Transfer the sequence split into a data split: `train` and `validation` are
# rebound from lists of sequences to the matching rows of `data`.
train = data[data['sequence'].isin(train)]
validation = data[data['sequence'].isin(validation)]

print('Training data points:', len(train),' Validation data points:', len(validation),' Testing data points:', len(test_data))
# Here we use test as an external dataset unlike the one used for training.
Training data points: 63955 Validation data points: 16045 Testing data points: 20000
# Rescale the 'retention time' column of all three splits.
# normalize=True  -> min-max scaling
# normalize=False -> standardization (subtract mean, divide by std)
normalize = True

# The three frames were obtained by indexing/slicing `data`. Take independent
# copies so the column assignments below write to real DataFrames instead of
# raising pandas' SettingWithCopyWarning.
train = train.copy()
validation = validation.copy()
test_data = test_data.copy()

if normalize:
    # Normalize: the min and max are taken over training and validation together
    train_val_min = min(train['retention time'].min(), validation['retention time'].min())
    train_val_max = max(train['retention time'].max(), validation['retention time'].max())
    train_val_range = train_val_max - train_val_min
    train['retention time'] = (train['retention time'] - train_val_min) / train_val_range
    validation['retention time'] = (validation['retention time'] - train_val_min) / train_val_range
    # NOTE(review): the external test set is scaled with its OWN min/max, not
    # the training statistics -- presumably intentional because it is treated
    # as an independent dataset; confirm.
    test_min = test_data['retention time'].min()
    test_max = test_data['retention time'].max()
    test_data['retention time'] = (test_data['retention time'] - test_min) / (test_max - test_min)
else:
    # Standardize: the mean and std are taken over training and validation together
    train_val_rt = list(train['retention time']) + list(validation['retention time'])
    train_val_mean, train_val_std = np.mean(train_val_rt), np.std(train_val_rt)
    train['retention time'] = (train['retention time'] - train_val_mean) / train_val_std
    validation['retention time'] = (validation['retention time'] - train_val_mean) / train_val_std
    # NOTE(review): as above, the test set uses its own mean/std -- confirm.
    test_data['retention time'] = (test_data['retention time'] - np.mean(test_data['retention time'])) / np.std(test_data['retention time'])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Setup parameters
sequence_length = 30  # handed to the datasets and the model as seq_length
batch_size = 64       # batch size handed to the datasets
epochs = 5            # number of epochs for each model.fit call
# Setup data
from dlomix.data import RetentionTimeDataset

# Each dataset is built from an in-memory (sequences, retention times) tuple
# and exposed through its `.train_data` attribute.
train_input = RetentionTimeDataset(
    data_source=(np.array(train['sequence']), np.array(train['retention time'])),
    seq_length=sequence_length, batch_size=batch_size, test=False,
).train_data

val_input = RetentionTimeDataset(
    data_source=(np.array(validation['sequence']), np.array(validation['retention time'])),
    seq_length=sequence_length, batch_size=batch_size, test=False,
).train_data

# The external test set is also built with test=False / `.train_data`, because
# it is later passed to model.fit to refine the model.
test_input = RetentionTimeDataset(
    data_source=(np.array(test_data['sequence']), np.array(test_data['retention time'])),
    seq_length=sequence_length, batch_size=batch_size, test=False,
).train_data
# Setup PROSIT model from DLOmix
from dlomix.models.prosit import PrositRetentionTimePredictor

model = PrositRetentionTimePredictor(seq_length=sequence_length)
# Build with an unspecified batch dimension so that summary() can report the
# layers and parameter counts before any training has happened.
model.build((None, sequence_length))
model.summary()
Model: "prosit_retention_time_predictor_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
string_lookup_2 (StringLook multiple 0
up)
embedding_2 (Embedding) multiple 352
sequential_4 (Sequential) (None, 30, 512) 1996800
attention_layer_2 (Attentio multiple 542
nLayer)
sequential_5 (Sequential) (None, 512) 262656
dense_5 (Dense) multiple 513
=================================================================
Total params: 2,260,863
Trainable params: 2,260,863
Non-trainable params: 0
_________________________________________________________________
from dlomix.eval.rt_eval import TimeDeltaMetric
import tensorflow as tf

# Compiling the keras model with loss function, metrics and optimizer
model.compile(
    loss='mse',
    metrics=['mae', TimeDeltaMetric()],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
)

# Train the model.
# `batch_size` is deliberately not passed to fit(): train_input was already
# batched when the dataset was created, and Keras expects the argument to be
# omitted for inputs that generate their own batches.
# NOTE(review): the captured training log shows the loss jumping back up at
# epoch 4; a smaller learning rate may be worth trying.
history = model.fit(x=train_input, epochs=epochs, validation_data=val_input)
Epoch 1/5
998/998 [==============================] - 26s 22ms/step - loss: 0.6175 - mae: 0.1161 - timedelta: 0.1140 - val_loss: 0.0040 - val_mae: 0.0427 - val_timedelta: 0.0484
Epoch 2/5
998/998 [==============================] - 21s 21ms/step - loss: 0.0055 - mae: 0.0526 - timedelta: 0.0522 - val_loss: 0.0038 - val_mae: 0.0428 - val_timedelta: 0.0467
Epoch 3/5
998/998 [==============================] - 21s 21ms/step - loss: 0.0047 - mae: 0.0474 - timedelta: 0.0464 - val_loss: 0.0039 - val_mae: 0.0459 - val_timedelta: 0.0480
Epoch 4/5
998/998 [==============================] - 21s 21ms/step - loss: 0.6041 - mae: 0.2064 - timedelta: 0.1935 - val_loss: 0.0537 - val_mae: 0.1940 - val_timedelta: 0.1972
Epoch 5/5
998/998 [==============================] - 21s 21ms/step - loss: 0.0544 - mae: 0.1961 - timedelta: 0.1900 - val_loss: 0.0536 - val_mae: 0.1943 - val_timedelta: 0.1967
from dlomix.reports import RetentionTimeReport

# Plot the training curves recorded in `history`
report = RetentionTimeReport(output_path="./output", history=history)
report.plot_keras_metric("loss")
report.plot_keras_metric("timedelta")

# Collect targets and predictions in a single pass over val_input so that each
# prediction is paired with the target of the very same sample.
# val_input yields fewer samples than `validation` has rows (presumably dlomix
# drops sequences longer than seq_length -- confirm), so predicting on
# validation['sequence'][:len(y_real)] does not guarantee matching rows.
y_real_batches, y_pred_batches = [], []
for x_batch, y_batch in val_input:
    y_real_batches.append(y_batch)
    y_pred_batches.append(model.predict_on_batch(x_batch))
y_real = np.concatenate(y_real_batches, axis=0)
y_pred = np.concatenate(y_pred_batches, axis=0)

report.plot_residuals(y_real, y_pred, xrange=(-1, 1))
501/501 [==============================] - 3s 3ms/step
# Refine the already trained model on the external dataset.
# `batch_size` is not passed because test_input was batched when it was built.
history = model.fit(x=test_input, epochs=epochs)

import matplotlib.pyplot as plt

# Only the training loss is available here: this fit call has no
# validation_data, so the title refers to the training loss alone.
plt.plot(range(epochs), history.history['loss'], '-', color='r', label='Training loss')
plt.title('Training loss of the refined model')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
Epoch 1/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0560 - mae: 0.1987 - timedelta: 0.1993
Epoch 2/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1986 - timedelta: 0.1987
Epoch 3/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1985 - timedelta: 0.1988
Epoch 4/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1985 - timedelta: 0.1991
Epoch 5/5
312/312 [==============================] - 6s 19ms/step - loss: 0.0559 - mae: 0.1985 - timedelta: 0.1982