Prosit-style GRU with pre-annotated ProteomeTools data

Author
Affiliation

Siegfried Gessulat

MSAID GmbH, Berlin, Germany

Published

September 28, 2022

Download and prepare training data

# Load ProteomeTools data from Figshare
!wget https://figshare.com/ndownloader/files/12506534
!mv 12506534 prosit_2018_holdout.hdf5
--2022-09-27 22:45:17--  https://figshare.com/ndownloader/files/12506534
Resolving figshare.com (figshare.com)... 54.170.191.210, 54.194.117.36, 2a05:d018:1f4:d003:88c8:2c66:47e7:4f07, ...
Connecting to figshare.com (figshare.com)|54.170.191.210|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/12506534/holdout_hcd.hdf5?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220927/eu-west-1/s3/aws4_request&X-Amz-Date=20220927T224517Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=3cae745acb61cdbeed831bae33ea71451bf3becf7404b7423a9fb848f55b3e3c [following]
--2022-09-27 22:45:18--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/12506534/holdout_hcd.hdf5?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220927/eu-west-1/s3/aws4_request&X-Amz-Date=20220927T224517Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=3cae745acb61cdbeed831bae33ea71451bf3becf7404b7423a9fb848f55b3e3c
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.120.88
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.120.88|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 262781348 (251M) [application/octet-stream]
Saving to: ‘12506534’

12506534            100%[===================>] 250.61M  13.7MB/s    in 20s     

2022-09-27 22:45:39 (12.4 MB/s) - ‘12506534’ saved [262781348/262781348]
# Import packages
import pandas as pd
import numpy as np
import h5py as h5

# Using the alphabet as defined in Prosit:
# https://github.com/kusterlab/prosit/blob/master/prosit/constants.py#L21-L43
# Integer 0 is reserved for sequence padding; 21 ("M(ox)") is oxidized methionine.
PROSIT_ALPHABET = {
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "M(ox)": 21,
}
# Backward-compatible alias: the rest of this file references the original,
# misspelled constant name.
PROSIT_ALHABET = PROSIT_ALPHABET
# Inverse mapping: integer code -> amino-acid token.
PROSIT_INDEXED_ALPHABET = {i: c for c, i in PROSIT_ALHABET.items()}
# Read the downloaded HDF5 holdout file into a pandas dataframe.
# Array-valued keys become object columns (one numpy array per row);
# scalar keys become plain numeric columns.
KEY_ARRAY = ["sequence_integer", "precursor_charge_onehot", "intensities_raw"]
KEY_SCALAR = ["collision_energy_aligned_normed", "collision_energy"]
with h5.File('prosit_2018_holdout.hdf5', 'r') as hdf:
  df = pd.DataFrame({key: list(hdf[key][...]) for key in KEY_ARRAY})
  for key in KEY_SCALAR:
    df[key] = hdf[key][...]

# Derived convenience columns.
df['precursor_charge'] = df.precursor_charge_onehot.map(lambda onehot: onehot.argmax() + 1)
df['sequence_maxquant'] = df.sequence_integer.map(
    lambda seq: "".join(PROSIT_INDEXED_ALPHABET[code] for code in seq if code != 0))
df['sequence_length'] = df.sequence_integer.map(np.count_nonzero)

Inspecting data distributions

# Distribution of precursor charge states (argmax of the one-hot vector, plus 1).
df['precursor_charge'].hist()
<matplotlib.axes._subplots.AxesSubplot>

# Distribution of raw (unnormalized) collision energies.
df['collision_energy'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot>

# 23 bins — presumably one per length in the 7..30 range (Prosit pads to 30); confirm
df['sequence_length'].hist(bins=30-7)
<matplotlib.axes._subplots.AxesSubplot>

Dataset preparation

# Split the data into training, validation, and test

from random import shuffle

def split_dataframe(df,
                    unique_column,
                    ratio_training=0.8,
                    ratio_validation=0.1,
                    ratio_test=0.1):
  """
  Split `df` into (training, validation, test) dataframes such that the values
  of `unique_column` are unique to each of the splits. This is helpful if, for
  example, you have non-unique sequences in `unique_column` but want to ensure
  that a given sequence value is unique to one of the splits.

  Args:
    df: input pandas DataFrame.
    unique_column: column whose values must not be shared across splits.
    ratio_training, ratio_validation, ratio_test: split fractions; must sum
      to 1 (up to floating-point tolerance).

  Returns:
    (df_train, df_validation, df_test): a row-disjoint partition of `df`.

  Raises:
    ValueError: if the three ratios do not sum to 1.
  """
  import math

  # Exact `== 1` on a sum of floats is fragile (e.g. 0.1 + 0.2 != 0.3);
  # compare with a tolerance, and raise instead of assert so the check
  # survives `python -O`.
  if not math.isclose(ratio_training + ratio_validation + ratio_test, 1.0):
    raise ValueError(
        "ratio_training + ratio_validation + ratio_test must sum to 1")

  unique = list(set(df[unique_column]))
  n_unique = len(unique)
  shuffle(unique)  # in-place random permutation -> random split membership

  train_split = int(n_unique * ratio_training)
  val_split = int(n_unique * (ratio_training + ratio_validation))

  # Sets give O(1) membership tests inside `isin`.
  unique_train = set(unique[:train_split])
  unique_validation = set(unique[train_split:val_split])
  unique_test = set(unique[val_split:])

  df_train = df[df[unique_column].isin(unique_train)]
  df_validation = df[df[unique_column].isin(unique_validation)]
  df_test = df[df[unique_column].isin(unique_test)]

  # The three splits must partition the rows exactly.
  assert len(df_train) + len(df_validation) + len(df_test) == len(df)

  return df_train, df_validation, df_test

# Split the data, then stack each split's per-row arrays into model-ready
# numpy matrices.
df_train, df_validation, df_test = split_dataframe(df, unique_column='sequence_maxquant')

INPUT_COLUMNS = ('sequence_integer', 'precursor_charge_onehot', 'collision_energy_aligned_normed')
OUTPUT_COLUMN = 'intensities_raw'

def _stack_split(frame):
  """Return ([stacked input arrays], stacked target matrix) for one split."""
  inputs = [np.vstack(frame[column]) for column in INPUT_COLUMNS]
  targets = np.vstack(frame[OUTPUT_COLUMN])
  return inputs, targets

x_train, y_train = _stack_split(df_train)
x_validation, y_validation = _stack_split(df_validation)
x_test, y_test = _stack_split(df_test)

Model setup and training

# Setup model and training parameters
DIM_LATENT = 124  # latent width shared by the GRU and the charge/energy projections
DIM_EMBEDDING_IN = max(PROSIT_ALHABET.values()) + 1  # max value + zero for padding
DIM_EMBEDDING_OUT = 32  # per-residue embedding size
EPOCHS = 5
BATCH_SIZE = 256
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras import backend as k

# Build the model: embed + GRU the sequence, project precursor charge and
# collision energy into the same latent space, fuse the three multiplicatively,
# and predict the fragment-intensity vector with a final dense layer.
in_sequence = Input(shape=[x_train[0].shape[1]], name="in_sequence")
in_precursor_charge = Input(shape=[x_train[1].shape[1]], name="in_precursor_charge")
in_collision_energy = Input(shape=[x_train[2].shape[1]], name="in_collision_energy")

embedded = Embedding(input_dim=DIM_EMBEDDING_IN, output_dim=DIM_EMBEDDING_OUT)(in_sequence)
latent_sequence = GRU(DIM_LATENT)(embedded)
latent_charge = Dense(DIM_LATENT)(in_precursor_charge)
latent_energy = Dense(DIM_LATENT)(in_collision_energy)
fused = Multiply()([latent_sequence, latent_charge, latent_energy])
out_intensities = Dense(y_train.shape[1])(fused)

model = Model([in_sequence, in_precursor_charge, in_collision_energy], out_intensities)
model.summary()
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
 in_sequence (InputLayer)       [(None, 30)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 30, 32)       704         ['in_sequence[0][0]']            
                                                                                                  
 in_precursor_charge (InputLaye  [(None, 6)]         0           []                               
 r)                                                                                               
                                                                                                  
 in_collision_energy (InputLaye  [(None, 1)]         0           []                               
 r)                                                                                               
                                                                                                  
 gru (GRU)                      (None, 124)          58776       ['embedding[0][0]']              
                                                                                                  
 dense (Dense)                  (None, 124)          868         ['in_precursor_charge[0][0]']    
                                                                                                  
 dense_1 (Dense)                (None, 124)          248         ['in_collision_energy[0][0]']    
                                                                                                  
 multiply (Multiply)            (None, 124)          0           ['gru[0][0]',                    
                                                                  'dense[0][0]',                  
                                                                  'dense_1[0][0]']                
                                                                                                  
 dense_2 (Dense)                (None, 174)          21750       ['multiply[0][0]']               
                                                                                                  
==================================================================================================
Total params: 82,346
Trainable params: 82,346
Non-trainable params: 0
__________________________________________________________________________________________________
def masked_spectral_distance(true, pred):
    """Normalized spectral-angle loss between true and predicted intensities.

    This is the spectral angle implementation as used in Prosit, see
    https://github.com/kusterlab/prosit/blob/master/prosit/losses.py#L4-L16
    Note, fragment ions that cannot exist (i.e. y20 for a 7mer) must have the
    value -1 in `true`; the `(true + 1)` mask zeroes those entries out of
    both tensors. Returns values in [0, 1] (0 = identical spectra).
    """
    # Uses the file-level `from tensorflow.keras import backend as k`;
    # the original re-imported `keras.backend` locally, which can shadow
    # the tf.keras backend with the standalone-keras one.
    epsilon = k.epsilon()  # avoids 0/0 where true == -1
    # Zero out impossible fragments in both tensors.
    true_masked = ((true + 1) * true) / (true + 1 + epsilon)
    pred_masked = ((true + 1) * pred) / (true + 1 + epsilon)
    # The original swapped these two assignments (pred_norm from true_masked
    # and vice versa); harmless only because the dot product is symmetric.
    true_norm = k.l2_normalize(true_masked, axis=-1)
    pred_norm = k.l2_normalize(pred_masked, axis=-1)
    # Cosine of the spectral angle; axis=-1 (last axis) matches the
    # normalization axis and generalizes beyond rank-2 inputs.
    product = k.sum(true_norm * pred_norm, axis=-1)
    arccos = tf.acos(product)
    return 2 * arccos / np.pi

# Compile with the Prosit spectral-angle loss and train, validating each
# epoch on the held-out validation split.
model.compile(optimizer='Adam', loss=masked_spectral_distance)
history = model.fit(x=x_train, y=y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(x_validation, y_validation))
Epoch 1/5
2362/2362 [==============================] - 30s 10ms/step - loss: 0.5062 - val_loss: 0.4790
Epoch 2/5
2362/2362 [==============================] - 17s 7ms/step - loss: 0.4308 - val_loss: 0.4086
Epoch 3/5
2362/2362 [==============================] - 17s 7ms/step - loss: 0.3789 - val_loss: 0.3665
Epoch 4/5
2362/2362 [==============================] - 17s 7ms/step - loss: 0.3429 - val_loss: 0.3404
Epoch 5/5
2362/2362 [==============================] - 17s 7ms/step - loss: 0.3123 - val_loss: 0.3086

Model evaluation

# Plotting the training history

import matplotlib.pyplot as plt

# Solid red line: training loss; dashed red line: validation loss.
plt.plot(range(EPOCHS), history.history['loss'], '-', color='r', label='Training loss')
plt.plot(range(EPOCHS), history.history['val_loss'], '--', color='r', label='Validation loss')
# Plain string: the original f-string had no placeholders (ruff F541).
plt.title('Training and validation loss across epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Evaluate on the test split; the reported loss is the normalized spectral
# angle (lower is better).
test_spectral_angle = model.evaluate(x_test, y_test)
test_spectral_angle
2341/2341 [==============================] - 7s 3ms/step - loss: 0.3031
0.3031081557273865