# Source code for ninolearn.learn.fit

"""
This module aims to standardize the training and evaluation procedure.
"""
import numpy as np
import pandas as pd
import xarray as xr

from os.path import join, exists

from ninolearn.utils import print_header, small_print_header
from ninolearn.pathes import modeldir, processeddir

# First year of each evaluation period. The last entry is not the start of
# a period of its own; it only closes the final period (2012-2017).
decades = [1963, 1972, 1982, 1992, 2002, 2012, 2018]
decades_elninolike = []

n_decades = len(decades)

# Lead times for the evaluation: 0, 3, ..., 15 (step of 3).
lead_times = list(range(0, 16, 3))
n_lead = len(lead_times)

# One color and one label per evaluation period (n_decades - 1 entries).
decade_color = [
    'orange',
    'violet',
    'limegreen',
    'darkgoldenrod',
    'red',
    'royalblue',
]

# Labels "<first year>-<last year>"; the last year of a period is the year
# before the next period starts.
decade_name = [
    f'{first_year}-{next_first_year - 1}'
    for first_year, next_first_year in zip(decades[:-1], decades[1:])
]

def cross_training(model, pipeline, n_iter, **kwargs):
    """
    Train the model on different training sets, each time sparing one of the
    evaluation periods defined by the module-level ``decades`` list.

    For every lead time in the module-level ``lead_times`` list and every
    evaluation period, one model is fitted on all samples that lie outside
    of that period. The fitted model is saved in ``modeldir`` under the
    directory name ``<name>_decade<first year>_lead<lead time>``, where
    ``<name>`` is read from ``hyperparameters['name']`` of the model
    instance. If that directory already exists, the training for this
    combination is skipped.

    :param model: A model class (or factory). It is called as
        ``model(**kwargs)`` and the returned object must provide a
        ``hyperparameters`` dictionary with a ``'name'`` entry as well as
        the methods ``.fit_RandomizedSearch()`` and ``.save()``.

    :param pipeline: A function that is called as
        ``pipeline(lead_time, return_persistance=False)`` and returns the
        corresponding feature, label and time.

    :param n_iter: Passed on as ``n_iter`` to the
        ``.fit_RandomizedSearch()`` method of the model.

    :param **kwargs: Keyword arguments that are passed to ``model`` when it
        is instantiated.
    """

    # use the shared module-level lead times so that training and hindcast
    # (cross_hindcast) always iterate over the same set
    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} month')

        # consecutive entries of `decades` are the first year of a test
        # period and the first year of the following one
        for start_year, next_start_year in zip(decades, decades[1:]):
            # the model must be instantiated before the existence check
            # because the directory name depends on its hyperparameters
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{start_year}_lead{lead_time}"

            if exists(join(modeldir, dir_name)):
                print(f'{dir_name} already exists')
            else:
                test_start = f'{start_year}-01-01'
                test_end = f'{next_start_year - 1}-12-01'
                small_print_header(f'Test period: {test_start} till {test_end}')

                # everything outside the test period is used for training
                test_indeces = (timey >= test_start) & (timey <= test_end)
                train_indeces = np.invert(test_indeces)

                trainX, trainy = X[train_indeces, :], y[train_indeces]

                m.fit_RandomizedSearch(trainX, trainy, n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            # drop the reference before the next model is created
            del m

def cross_hindcast(model, pipeline, model_name):
    """
    Generate a hindcast for the evaluation periods defined by the
    module-level ``decades`` list using the models which were trained by the
    .cross_training() method.

    For every lead time in the module-level ``lead_times`` list and every
    evaluation period, the model saved in ``modeldir`` under
    ``<model_name>_decade<first year>_lead<lead time>`` is loaded and used
    to predict the samples of that period. The predictions of all periods
    are concatenated along time and written to the netCDF file
    ``<model_name>_forecasts.nc`` in ``processeddir``. The file contains one
    variable per model output (named after ``output_names`` of the model)
    with the dimensions ``('target_season', 'lead')``.

    :param model: The considered model class (or factory). It is called
        without arguments and the returned object must provide ``.load()``,
        ``.predict()``, ``n_outputs`` and ``output_names``.

    :param pipeline: The data pipeline that already was used before in
        .cross_training(). It is called as
        ``pipeline(lead_time, return_persistance=True)`` and must return
        four values of which the first (features) and third (time) are used.

    :param model_name: The name prefix of the saved models; also used as
        prefix of the name of the netCDF file that is written.

    :raises Exception: If, for some lead time, no sample falls into the
        evaluation periods or the first predicted date is not the first of
        January of ``decades[0]``.
    """

    expected_first_date = f'{decades[0]}-01-01'

    # allocated after the first lead time, once the number of outputs and
    # the length of the full time series are known
    pred_save = None

    for i, lead_time in enumerate(lead_times):
        print_header(f'Lead time: {lead_time} months')

        # label and persistance are returned by the pipeline but are not
        # needed to generate the hindcast
        X, _, timey, _ = pipeline(lead_time, return_persistance=True)

        pred_chunks = []
        timeytrue = pd.DatetimeIndex([])

        for j in range(n_decades-1):
            small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

            # test indices
            test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01')
            testX, testtimey = X[test_indeces,:], timey[test_indeces]

            m = model()
            m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

            # variables for which the model must be loaded; taken from the
            # model of the first evaluation period
            if j == 0:
                n_outputs = m.n_outputs
                output_names = m.output_names

            # assigning into a preallocated array forces the prediction into
            # the shape (n_outputs, n_samples)
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:,:] = m.predict(testX)

            pred_chunks.append(pred)
            timeytrue = timeytrue.append(testtimey)
            del m

        # make the full time series (joined once instead of growing an array
        # inside the loop)
        pred_full = np.concatenate(pred_chunks, axis=1)

        if len(timeytrue) == 0:
            raise Exception(f"No samples fall into the evaluation periods "
                            f"for lead time {lead_time}")

        if timeytrue[0] != pd.to_datetime(expected_first_date):
            got_first_date = timeytrue[0].isoformat()[:10]

            # adjacent string literals instead of a backslash continuation,
            # which would embed the source indentation in the message
            raise Exception(f"The first predicted date for lead time {lead_time} "
                            f"is {got_first_date} but expected {expected_first_date}")

        if pred_save is None:
            pred_save = np.zeros((n_outputs, len(timeytrue), n_lead))

        pred_save[:,:,i] = pred_full

    # Save data to a netcdf file
    save_dict = {
        output_names[k]: (['target_season', 'lead'], pred_save[k,:,:])
        for k in range(n_outputs)
    }

    ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
                                       'lead': lead_times})
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))