```python
# Installing required python packages
! pip install rich~=12.5 numpy~=1.21 pandas~=1.3 matplotlib~=3.5 seaborn~=0.11 scikit-learn~=1.0 pyarrow~=15.0 hyperopt~=0.2 --quiet
```
NIST (part 2): Traditional ML: Gradient boosting
1 Introduction
This is the second part of a three-part tutorial. We recommend you start with the first part, where the NIST spectral library is parsed and prepared for use in the second and third parts.
In this tutorial, you will learn how to build a fragmentation intensity predictor similar to MS²PIP v3 (Gabriels, Martens, and Degroeve 2019) with traditional machine learning (ML) feature engineering and Gradient Boosting (Friedman 2002).
2 Data preparation
We will use the spectral library that was already parsed in part 1 of this tutorial series.
```python
import pandas as pd

train_val_spectra = pd.read_feather("http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomicsml/fragmentation/nist-humanhcd20160503-parsed-trainval.feather")
test_spectra = pd.read_feather("http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomicsml/fragmentation/nist-humanhcd20160503-parsed-test.feather")
```
2.1 Feature engineering
In traditional ML, the input features for the algorithm usually require some engineering. Fragmentation intensity prediction is no different. Following the MS²PIP methods, we will calculate the distributions of several amino acid properties across the peptide and fragment ion sequences.
Using the distribution of these properties, instead of the actual properties per amino acid, allows MS²PIP to build a fixed-length feature matrix for input peptides of varying lengths.
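To see why this yields a fixed-length representation, consider the following minimal sketch (not part of the original tutorial): two peptides of different lengths are both summarized into exactly five quantile values per property. The basicity values are taken from the property table below.

```python
import numpy as np

# Basicity values (see the property table below) for a few amino acids
basicity = {"A": 37, "L": 81, "R": 343, "S": 49, "K": 191}

short_peptide = [basicity[aa] for aa in "ALR"]       # 3 residues
long_peptide = [basicity[aa] for aa in "ALRSKALRS"]  # 9 residues

# Both peptides are reduced to the same five features: min, Q1, median, Q3, max
print(np.quantile(short_peptide, [0, 0.25, 0.5, 0.75, 1]))
print(np.quantile(long_peptide, [0, 0.25, 0.5, 0.75, 1]))
```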
```python
import numpy as np
import pandas as pd
from rich import progress

amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
properties = np.array([
    [37,35,59,129,94,0,210,81,191,81,106,101,117,115,343,49,90,60,134,104],  # basicity
    [68,23,33,29,70,58,41,73,32,73,66,38,0,40,39,44,53,71,51,55],  # helicity
    [51,75,25,35,100,16,3,94,0,94,82,12,0,22,22,21,39,80,98,70],  # hydrophobicity
    [32,23,0,4,27,32,48,32,69,32,29,26,35,28,79,29,28,31,31,28],  # pI
])
pd.DataFrame(properties, columns=amino_acids, index=["basicity", "helicity", "hydrophobicity", "pI"])
```
| | A | C | D | E | F | G | H | I | K | L | M | N | P | Q | R | S | T | V | W | Y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
basicity | 37 | 35 | 59 | 129 | 94 | 0 | 210 | 81 | 191 | 81 | 106 | 101 | 117 | 115 | 343 | 49 | 90 | 60 | 134 | 104 |
helicity | 68 | 23 | 33 | 29 | 70 | 58 | 41 | 73 | 32 | 73 | 66 | 38 | 0 | 40 | 39 | 44 | 53 | 71 | 51 | 55 |
hydrophobicity | 51 | 75 | 25 | 35 | 100 | 16 | 3 | 94 | 0 | 94 | 82 | 12 | 0 | 22 | 22 | 21 | 39 | 80 | 98 | 70 |
pI | 32 | 23 | 0 | 4 | 27 | 32 | 48 | 32 | 69 | 32 | 29 | 26 | 35 | 28 | 79 | 29 | 28 | 31 | 31 | 28 |
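As a quick sanity check (not part of the original tutorial), the property matrix can be indexed by row (property) and column (amino acid); for example, arginine (R) has the highest basicity value:

```python
# Look up the basicity of arginine: row 0 is basicity — should print 343
print(properties[0, amino_acids.index("R")])
```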
```python
def encode_peptide(sequence, charge):
    # 4 properties * 5 quantiles * 3 ion types + 4 properties * 4 sites + 2 global
    n_features = 78
    quantiles = [0, 0.25, 0.5, 0.75, 1]
    n_ions = len(sequence) - 1

    # Encode amino acids as integers to index amino acid properties for peptide sequence
    aa_indices = {aa: i for i, aa in enumerate("ACDEFGHIKLMNPQRSTVWY")}
    aa_to_index = np.vectorize(lambda aa: aa_indices[aa])
    peptide_indexed = aa_to_index(np.array(list(sequence)))
    peptide_properties = properties[:, peptide_indexed]

    # Empty peptide_features array
    peptide_features = np.full((n_ions, n_features), np.nan)

    for b_ion_number in range(1, n_ions + 1):
        # Calculate quantiles of features across peptide, b-ion, and y-ion
        peptide_quantiles = np.hstack(
            np.quantile(peptide_properties, quantiles, axis=1).transpose()
        )
        b_ion_quantiles = np.hstack(
            np.quantile(peptide_properties[:, :b_ion_number], quantiles, axis=1).transpose()
        )
        y_ion_quantiles = np.hstack(
            np.quantile(peptide_properties[:, b_ion_number:], quantiles, axis=1).transpose()
        )

        # Properties on specific sites: nterm, frag-1, frag+1, cterm
        specific_site_indexes = np.array([0, b_ion_number - 1, b_ion_number, -1])
        specific_site_properties = np.hstack(peptide_properties[:, specific_site_indexes].transpose())

        # Global features: Length and charge
        global_features = np.array([len(sequence), int(charge)])

        # Assign to peptide_features array
        peptide_features[b_ion_number - 1, 0:20] = peptide_quantiles
        peptide_features[b_ion_number - 1, 20:40] = b_ion_quantiles
        peptide_features[b_ion_number - 1, 40:60] = y_ion_quantiles
        peptide_features[b_ion_number - 1, 60:76] = specific_site_properties
        peptide_features[b_ion_number - 1, 76:78] = global_features

    return peptide_features
```
```python
def generate_feature_names():
    feature_names = []
    for level in ["peptide", "b", "y"]:
        for aa_property in ["basicity", "helicity", "hydrophobicity", "pi"]:
            for quantile in ["min", "q1", "q2", "q3", "max"]:
                feature_names.append("_".join([level, aa_property, quantile]))

    for site in ["nterm", "fragmin1", "fragplus1", "cterm"]:
        for aa_property in ["basicity", "helicity", "hydrophobicity", "pi"]:
            feature_names.append("_".join([site, aa_property]))

    feature_names.extend(["length", "charge"])
    return feature_names
```
Let’s test it with a single peptide. Feel free to use your own name as a “peptide”, as long as it does not contain any non-amino acid characters.
```python
peptide_features = pd.DataFrame(encode_peptide("RALFGARIELS", 2), columns=generate_feature_names())
peptide_features
```
| | peptide_basicity_min | peptide_basicity_q1 | peptide_basicity_q2 | peptide_basicity_q3 | peptide_basicity_max | peptide_helicity_min | peptide_helicity_q1 | peptide_helicity_q2 | peptide_helicity_q3 | peptide_helicity_max | ... | fragplus1_basicity | fragplus1_helicity | fragplus1_hydrophobicity | fragplus1_pi | cterm_basicity | cterm_helicity | cterm_hydrophobicity | cterm_pi | length | charge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 37.0 | 68.0 | 51.0 | 32.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
1 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 81.0 | 73.0 | 94.0 | 32.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
2 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 94.0 | 70.0 | 100.0 | 27.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
3 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 0.0 | 58.0 | 16.0 | 32.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
4 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 37.0 | 68.0 | 51.0 | 32.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
5 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 343.0 | 39.0 | 22.0 | 79.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
6 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 81.0 | 73.0 | 94.0 | 32.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
7 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 129.0 | 29.0 | 35.0 | 4.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
8 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 81.0 | 73.0 | 94.0 | 32.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
9 | 0.0 | 43.0 | 81.0 | 111.5 | 343.0 | 29.0 | 41.5 | 68.0 | 71.5 | 73.0 | ... | 49.0 | 44.0 | 21.0 | 29.0 | 49.0 | 44.0 | 21.0 | 29.0 | 11.0 | 2.0 |
10 rows × 78 columns
2.2 Getting the target intensities
The target intensities are the observed intensities which the model will learn to predict. Let’s first try with a single spectrum.
```python
test_spectrum = train_val_spectra.iloc[4]

peptide_targets = pd.DataFrame({
    "b_target": test_spectrum["parsed_intensity"]["b"],
    "y_target": test_spectrum["parsed_intensity"]["y"],
})
peptide_targets
```
| | b_target | y_target |
|---|---|---|
0 | 0.000000 | 0.118507 |
1 | 0.229717 | 0.079770 |
2 | 0.294631 | 0.088712 |
3 | 0.234662 | 0.145900 |
4 | 0.185732 | 0.205005 |
5 | 0.134395 | 0.261630 |
6 | 0.081856 | 0.305119 |
7 | 0.043793 | 0.296351 |
8 | 0.000000 | 0.205703 |
9 | 0.000000 | 0.155991 |
10 | 0.000000 | 0.000000 |
These are the intensities for the b- and y-ions, each ordered by ion number (b1 through b11 and y1 through y11 for this 12-residue peptide). In MS²PIP, however, a clever trick is applied to reuse the computed features for each fragment ion pair. Doing so makes perfect sense, as both ions in such a fragment ion pair originated from the same fragmentation event. For this peptide, the fragment ion pairs are b1-y11, b2-y10, b3-y9, etc. To match all of the pairs, we simply have to reverse the y-ion series intensities:
```python
peptide_targets = pd.DataFrame({
    "b_target": test_spectrum["parsed_intensity"]["b"],
    "y_target": test_spectrum["parsed_intensity"]["y"][::-1],
})
peptide_targets
```
| | b_target | y_target |
|---|---|---|
0 | 0.000000 | 0.000000 |
1 | 0.229717 | 0.155991 |
2 | 0.294631 | 0.205703 |
3 | 0.234662 | 0.296351 |
4 | 0.185732 | 0.305119 |
5 | 0.134395 | 0.261630 |
6 | 0.081856 | 0.205005 |
7 | 0.043793 | 0.145900 |
8 | 0.000000 | 0.088712 |
9 | 0.000000 | 0.079770 |
10 | 0.000000 | 0.118507 |
2.3 Bringing it all together
```python
features = encode_peptide(test_spectrum["sequence"], test_spectrum["charge"])
targets = np.stack([test_spectrum["parsed_intensity"]["b"], test_spectrum["parsed_intensity"]["y"][::-1]], axis=1)
spectrum_id = np.full(shape=(targets.shape[0], 1), fill_value=test_spectrum["index"])  # Repeat id for all ions

pd.DataFrame(np.hstack([spectrum_id, features, targets]), columns=["spectrum_id"] + generate_feature_names() + ["b_target", "y_target"])
```
| | spectrum_id | peptide_basicity_min | peptide_basicity_q1 | peptide_basicity_q2 | peptide_basicity_q3 | peptide_basicity_max | peptide_helicity_min | peptide_helicity_q1 | peptide_helicity_q2 | peptide_helicity_q3 | ... | fragplus1_hydrophobicity | fragplus1_pi | cterm_basicity | cterm_helicity | cterm_hydrophobicity | cterm_pi | length | charge | b_target | y_target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.000000 | 0.000000 |
1 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.229717 | 0.155991 |
2 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.294631 | 0.205703 |
3 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.234662 | 0.296351 |
4 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.185732 | 0.305119 |
5 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.134395 | 0.261630 |
6 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.081856 | 0.205005 |
7 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.043793 | 0.145900 |
8 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 80.0 | 31.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.000000 | 0.088712 |
9 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 21.0 | 29.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.000000 | 0.079770 |
10 | 5.0 | 37.0 | 37.0 | 37.0 | 40.0 | 343.0 | 39.0 | 68.0 | 68.0 | 68.0 | ... | 22.0 | 79.0 | 343.0 | 39.0 | 22.0 | 79.0 | 12.0 | 2.0 | 0.000000 | 0.118507 |
11 rows × 81 columns
The following function applies these steps over a collection of spectra and returns the full feature/target table:
```python
def generate_ml_input(spectra):
    tables = []
    for spectrum in progress.track(spectra.to_dict(orient="records")):
        features = encode_peptide(spectrum["sequence"], spectrum["charge"])
        targets = np.stack([spectrum["parsed_intensity"]["b"], spectrum["parsed_intensity"]["y"][::-1]], axis=1)
        spectrum_id = np.full(shape=(targets.shape[0], 1), fill_value=spectrum["index"])  # Repeat id for all ions
        table = np.hstack([spectrum_id, features, targets])
        tables.append(table)

    full_table = np.vstack(tables)
    spectra_encoded = pd.DataFrame(full_table, columns=["spectrum_id"] + generate_feature_names() + ["b_target", "y_target"])
    return spectra_encoded
```
Note that this step might take some time, sometimes up to 30 minutes. To skip it, simply download the files with pre-encoded features and targets and load them in the commented-out cell below.
```python
train_val_encoded = generate_ml_input(train_val_spectra)
train_val_encoded.to_feather("fragmentation-nist-humanhcd20160503-parsed-trainval-encoded.feather")

test_encoded = generate_ml_input(test_spectra)
test_encoded.to_feather("fragmentation-nist-humanhcd20160503-parsed-test-encoded.feather")
```
```python
# Uncomment this step to load pre-encoded features from a file:
#train_val_encoded = pd.read_feather("ftp://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomicsml/fragmentation/nist-humanhcd20160503-parsed-trainval-encoded.feather")
#test_encoded = pd.read_feather("ftp://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomicsml/fragmentation/nist-humanhcd20160503-parsed-test-encoded.feather")

train_val_encoded
```
| | spectrum_id | peptide_basicity_min | peptide_basicity_q1 | peptide_basicity_q2 | peptide_basicity_q3 | peptide_basicity_max | peptide_helicity_min | peptide_helicity_q1 | peptide_helicity_q2 | peptide_helicity_q3 | ... | fragplus1_hydrophobicity | fragplus1_pi | cterm_basicity | cterm_helicity | cterm_hydrophobicity | cterm_pi | length | charge | b_target | y_target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 37.0 | 37.0 | 37.0 | 191.0 | 32.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 191.0 | 32.0 | 0.0 | 69.0 | 22.0 | 2.0 | 0.000000 | 0.000000 |
1 | 0.0 | 0.0 | 37.0 | 37.0 | 37.0 | 191.0 | 32.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 191.0 | 32.0 | 0.0 | 69.0 | 22.0 | 2.0 | 0.094060 | 0.000000 |
2 | 0.0 | 0.0 | 37.0 | 37.0 | 37.0 | 191.0 | 32.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 191.0 | 32.0 | 0.0 | 69.0 | 22.0 | 2.0 | 0.180642 | 0.000000 |
3 | 0.0 | 0.0 | 37.0 | 37.0 | 37.0 | 191.0 | 32.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 191.0 | 32.0 | 0.0 | 69.0 | 22.0 | 2.0 | 0.204203 | 0.050476 |
4 | 0.0 | 0.0 | 37.0 | 37.0 | 37.0 | 191.0 | 32.0 | 68.0 | 68.0 | 68.0 | ... | 51.0 | 32.0 | 191.0 | 32.0 | 0.0 | 69.0 | 22.0 | 2.0 | 0.233472 | 0.094835 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3321136 | 398372.0 | 81.0 | 104.0 | 104.0 | 153.0 | 343.0 | 39.0 | 48.5 | 55.0 | 55.0 | ... | 70.0 | 28.0 | 343.0 | 39.0 | 22.0 | 79.0 | 8.0 | 3.0 | 0.000000 | 0.180938 |
3321137 | 398372.0 | 81.0 | 104.0 | 104.0 | 153.0 | 343.0 | 39.0 | 48.5 | 55.0 | 55.0 | ... | 98.0 | 31.0 | 343.0 | 39.0 | 22.0 | 79.0 | 8.0 | 3.0 | 0.000000 | 0.203977 |
3321138 | 398372.0 | 81.0 | 104.0 | 104.0 | 153.0 | 343.0 | 39.0 | 48.5 | 55.0 | 55.0 | ... | 3.0 | 48.0 | 343.0 | 39.0 | 22.0 | 79.0 | 8.0 | 3.0 | 0.000000 | 0.169803 |
3321139 | 398372.0 | 81.0 | 104.0 | 104.0 | 153.0 | 343.0 | 39.0 | 48.5 | 55.0 | 55.0 | ... | 94.0 | 32.0 | 343.0 | 39.0 | 22.0 | 79.0 | 8.0 | 3.0 | 0.000000 | 0.120565 |
3321140 | 398372.0 | 81.0 | 104.0 | 104.0 | 153.0 | 343.0 | 39.0 | 48.5 | 55.0 | 55.0 | ... | 22.0 | 79.0 | 343.0 | 39.0 | 22.0 | 79.0 | 8.0 | 3.0 | 0.000000 | 0.169962 |
3321141 rows × 81 columns
This is the data we will use for training. Note that each spectrum comprises multiple rows: one row per b/y-ion pair.
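As a sanity check (not part of the original tutorial), the number of rows per spectrum should equal the peptide length minus one, since an n-residue peptide yields n - 1 b/y-ion pairs:

```python
# Each spectrum should contribute (peptide length - 1) rows: one per b/y-ion pair
rows_per_spectrum = train_val_encoded.groupby("spectrum_id").size()
peptide_lengths = train_val_encoded.groupby("spectrum_id")["length"].first()
assert (rows_per_spectrum == peptide_lengths - 1).all()
```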
3 Training the model
```python
from sklearn.ensemble import GradientBoostingRegressor
```
Let’s first try to train a simple model on the train set and evaluate its performance on the test set.
```python
reg = GradientBoostingRegressor()

X_train = train_val_encoded.drop(columns=["spectrum_id", "b_target", "y_target"])
y_train = train_val_encoded["y_target"]
X_test = test_encoded.drop(columns=["spectrum_id", "b_target", "y_target"])
y_test = test_encoded["y_target"]

reg.fit(X_train, y_train)
```
GradientBoostingRegressor()
```python
y_test_pred = reg.predict(X_test)
np.corrcoef(y_test, y_test_pred)[0][1]
```
0.7545700864493669
Not terrible. Let’s see if we can do better after hyperparameter optimization. For this, we can use the hyperopt package.
```python
from hyperopt import fmin, hp, tpe, STATUS_OK

def objective(n_estimators):
    # Define algorithm
    reg = GradientBoostingRegressor(n_estimators=n_estimators)

    # Fit model
    reg.fit(X_train, y_train)

    # Test model
    y_test_pred = reg.predict(X_test)
    correlation = np.corrcoef(y_test, y_test_pred)[0][1]

    return {'loss': -correlation, 'status': STATUS_OK}

best_params = fmin(
    fn=objective,
    space=10 + hp.randint('n_estimators', 980),
    algo=tpe.suggest,
    max_evals=10,
)
```
100%|██████████| 10/10 [27:32:34<00:00, 9915.41s/trial, best loss: -0.8238390276421177]
```python
best_params
```
{'n_estimators': 912}
Initially, the default value of 100 estimators was used. According to this hyperopt run, a much larger number of estimators results in a more performant model. Note that `fmin` returns the raw value sampled by `hp.randint`; because the search space adds an offset of 10, the actual number of estimators evaluated was `best_params['n_estimators'] + 10`.
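To map the raw sample back to the actual point in the search space, hyperopt provides `space_eval`. A minimal sketch (not part of the original tutorial; it assumes the search space is redefined here, since it was passed inline to `fmin` above):

```python
from hyperopt import space_eval

# Re-create the search space and evaluate it at the returned raw sample
space = 10 + hp.randint('n_estimators', 980)
n_estimators_best = space_eval(space, best_params)
print(n_estimators_best)  # best_params['n_estimators'] + 10
```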
Now we can train the model again with this new hyperparameter value:
```python
reg = GradientBoostingRegressor(n_estimators=946)

X_train = train_val_encoded.drop(columns=["spectrum_id", "b_target", "y_target"])
y_train = train_val_encoded["y_target"]
X_test = test_encoded.drop(columns=["spectrum_id", "b_target", "y_target"])
y_test = test_encoded["y_target"]

reg.fit(X_train, y_train)
```
GradientBoostingRegressor(n_estimators=946)
```python
y_test_pred = reg.predict(X_test)
np.corrcoef(y_test, y_test_pred)[0][1]
```
0.8245284609614351
Much better already. To get a more accurate view of the model performance, we should calculate the correlation per spectrum, instead of across the full dataset:
```python
prediction_df_y = pd.DataFrame({
    "spectrum_id": test_encoded["spectrum_id"],
    "target_y": y_test,
    "prediction_y": y_test_pred,
})
prediction_df_y
```
| | spectrum_id | target_y | prediction_y |
|---|---|---|---|
0 | 9.0 | 0.000000 | -0.001765 |
1 | 9.0 | 0.000000 | -0.001524 |
2 | 9.0 | 0.000000 | -0.000848 |
3 | 9.0 | 0.000000 | 0.000110 |
4 | 9.0 | 0.000000 | 0.004115 |
... | ... | ... | ... |
367683 | 398369.0 | 0.000000 | 0.157859 |
367684 | 398369.0 | 0.224074 | 0.195466 |
367685 | 398369.0 | 0.283664 | 0.160815 |
367686 | 398369.0 | 0.185094 | 0.138235 |
367687 | 398369.0 | 0.192657 | 0.223232 |
367688 rows × 3 columns
```python
corr_y = prediction_df_y.groupby("spectrum_id").corr().iloc[::2]['prediction_y']
corr_y.index = corr_y.index.droplevel(1)
corr_y = corr_y.reset_index().rename(columns={"prediction_y": "correlation"})
corr_y
```
| | spectrum_id | correlation |
|---|---|---|
0 | 9.0 | 0.839347 |
1 | 16.0 | 0.806933 |
2 | 39.0 | 0.113225 |
3 | 95.0 | -0.078798 |
4 | 140.0 | 0.744942 |
... | ... | ... |
27031 | 398328.0 | 0.856521 |
27032 | 398341.0 | 0.358528 |
27033 | 398342.0 | 0.839326 |
27034 | 398368.0 | 0.452030 |
27035 | 398369.0 | 0.204123 |
27036 rows × 2 columns
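The `iloc[::2]` slice above picks the target-vs-prediction entries out of each per-spectrum 2×2 correlation matrix. An equivalent, perhaps more readable formulation (a sketch, not the tutorial's code) computes the pairwise correlation directly per group:

```python
# Equivalent per-spectrum correlation, without slicing 2x2 correlation matrices
corr_y_alt = (
    prediction_df_y
    .groupby("spectrum_id")
    .apply(lambda df: df["target_y"].corr(df["prediction_y"]))
    .rename("correlation")
    .reset_index()
)
```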
Median correlation:
"correlation"].median() corr_y[
0.8328406567933057
Correlation distribution:
```python
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

sns.catplot(
    data=corr_y, x="correlation",
    fliersize=1,
    kind="box", aspect=4, height=2,
)
plt.show()
```
Not bad! With some more hyperparameter optimization (optimizing only the number of trees is a bit crude), a lot more performance could be gained. Take a look at the scikit-learn documentation to learn more about the various hyperparameters for the GradientBoostingRegressor. Alternatively, you could switch to the XGBoost algorithm, which is currently used by MS²PIP.
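As a starting point, a drop-in XGBoost variant could look like the following sketch; the xgboost package and the chosen hyperparameter values are assumptions for illustration, not part of this tutorial:

```python
from xgboost import XGBRegressor

# Hypothetical drop-in replacement; hyperparameter values are illustrative only
xgb_reg = XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=-1)
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
np.corrcoef(y_test, y_test_pred_xgb)[0][1]
```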
And of course, this model can only predict y-ion intensities. You can repeat the training and optimization steps to train a model for b-ion intensities.
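For instance, reusing the feature matrices from above, a b-ion model only requires swapping the target column. A minimal sketch following the same steps:

```python
# Same features, different target: train a separate model for b-ion intensities
y_train_b = train_val_encoded["b_target"]
y_test_b = test_encoded["b_target"]

reg_b = GradientBoostingRegressor(n_estimators=946)
reg_b.fit(X_train, y_train_b)
y_test_pred_b = reg_b.predict(X_test)
np.corrcoef(y_test_b, y_test_pred_b)[0][1]
```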
Good luck!