"""
This module aims to standardize the training and evaluation procedure.
"""
import numpy as np
import pandas as pd
import xarray as xr
from os.path import join, exists
from ninolearn.utils import print_header, small_print_header
from ninolearn.pathes import modeldir, processeddir
# Start years of the evaluation periods. Each period runs from January of one
# entry up to December of the year before the next entry; the last entry only
# marks the end of the final period.
decades = [1963, 1972, 1982, 1992, 2002, 2012, 2018]
decades_elninolike = []
n_decades = len(decades)

# Lead times for the evaluation: 0 to 15 in steps of 3.
lead_times = list(range(0, 16, 3))
n_lead = len(lead_times)

# One color and one label per evaluation period (n_decades - 1 entries each).
decade_color = [
    'orange',
    'violet',
    'limegreen',
    'darkgoldenrod',
    'red',
    'royalblue',
]
decade_name = [
    f'{first_year}-{next_first_year - 1}'
    for first_year, next_first_year in zip(decades[:-1], decades[1:])
]
def cross_training(model, pipeline, n_iter, **kwargs):
    """
    Train one model per lead time and held-out evaluation period.

    For every lead time in ``lead_times`` and every period delimited by two
    consecutive entries of ``decades``, a fresh model is fitted on all samples
    that lie outside of that period and is then saved below ``modeldir``.
    A combination whose save directory already exists is skipped.

    :param model: A model class. It is instantiated as ``model(**kwargs)``;
        the instance must provide ``hyperparameters['name']`` as well as the
        methods ``fit_RandomizedSearch()`` and ``save()``.
    :param pipeline: A function that is called as
        ``pipeline(lead_time, return_persistance=False)`` and returns the
        features, the labels and the time axis of the labels.
    :param n_iter: Forwarded as ``n_iter`` to ``fit_RandomizedSearch()``.
    :param **kwargs: Keyword arguments that are forwarded to the constructor
        of the provided model.
    """
    # Iterate the module-level lead times (instead of a private copy of the
    # list) so that training always produces exactly the models that
    # cross_hindcast() tries to load.
    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)
        print_header(f'Lead time: {lead_time} month')

        for start_year, next_start_year in zip(decades[:-1], decades[1:]):
            # The model has to be built before the existence check because
            # the directory name contains the model's own name.
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{start_year}_lead{lead_time}"

            if exists(join(modeldir, dir_name)):
                print(f'{dir_name} already exists')
            else:
                first_day = f'{start_year}-01-01'
                last_day = f'{next_start_year - 1}-12-01'
                small_print_header(f'Test period: {first_day} till {last_day}')

                # Everything outside of the held-out period is training data.
                # NOTE(review): comparing timey with date strings presumably
                # relies on it being a pandas DatetimeIndex -- confirm.
                test_indeces = (timey >= first_day) & (timey <= last_day)
                train_indeces = np.invert(test_indeces)
                trainX, trainy = X[train_indeces, :], y[train_indeces]

                m.fit_RandomizedSearch(trainX, trainy, n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            # Drop the reference before the next model is constructed.
            del m
def cross_hindcast(model, pipeline, model_name):
    """
    Generate a hindcast over all evaluation periods and save it as netCDF.

    For every lead time in ``lead_times`` and every period delimited by two
    consecutive entries of ``decades``, the model stored below ``modeldir``
    in the directory ``{model_name}_decade{start year}_lead{lead time}`` is
    loaded and used to predict that period. The predictions of all periods
    are joined along the time axis and finally written to
    ``{model_name}_forecasts.nc`` in ``processeddir``, with one data variable
    per model output and the dimensions ``target_season`` and ``lead``.

    :param model: The considered model class. It is instantiated without
        arguments; the instance must provide ``load()``, ``predict()``,
        ``n_outputs`` and ``output_names``.
    :param pipeline: The data pipeline that already was used before in
        .cross_training(). It is called as
        ``pipeline(lead_time, return_persistance=True)`` and must return four
        values of which the first (features) and the third (time axis) are
        used here.
    :param model_name: The name prefix of the saved model directories and of
        the output file.
    :raises Exception: If, for any lead time, the first predicted date is not
        the first of January of ``decades[0]``.
    """
    # Derived from the evaluation periods instead of being spelled out a
    # second time, so the check follows any change to ``decades``.
    expected_first_date = f'{decades[0]}-01-01'

    # Allocated during the first lead time, once the length of the full time
    # series and the number of model outputs are known.
    pred_save = None

    for i_lead, lead_time in enumerate(lead_times):
        print_header(f'Lead time: {lead_time} months')

        # Labels and persistence are returned by the pipeline but not needed
        # for generating the hindcast.
        X, _, timey, _ = pipeline(lead_time, return_persistance=True)

        timeytrue = pd.DatetimeIndex([])
        pred_parts = []

        for start_year, next_start_year in zip(decades[:-1], decades[1:]):
            first_day = f'{start_year}-01-01'
            last_day = f'{next_start_year - 1}-12-01'
            small_print_header(f'Predict: {first_day} till {last_day}')

            # test indices
            test_indeces = (timey >= first_day) & (timey <= last_day)
            testX, testtimey = X[test_indeces, :], timey[test_indeces]

            m = model()
            m.load(location=modeldir,
                   dir_name=f'{model_name}_decade{start_year}_lead{lead_time}')

            # Variables for which a model must be loaded are taken from the
            # first model of each lead time.
            if not pred_parts:
                n_outputs = m.n_outputs
                output_names = m.output_names

            # Assigning into a pre-shaped array enforces (and broadcasts to)
            # the layout (n_outputs, n_samples).
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            pred_parts.append(pred)
            timeytrue = timeytrue.append(testtimey)
            del m

        if timeytrue[0] != pd.to_datetime(expected_first_date):
            got_first_date = timeytrue[0].isoformat()[:10]
            raise Exception(
                f"The first predicted date for lead time {lead_time} "
                f"is {got_first_date} but expected {expected_first_date}"
            )

        # make the full time series
        pred_full = np.concatenate(pred_parts, axis=1)

        if pred_save is None:
            pred_save = np.zeros((n_outputs, len(timeytrue), n_lead))
        pred_save[:, :, i_lead] = pred_full

    # Save data to a netcdf file
    save_dict = {
        output_names[k]: (['target_season', 'lead'], pred_save[k, :, :])
        for k in range(n_outputs)
    }
    ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
                                       'lead': lead_times})
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))