The meaning of the windows_batch parameter in NeuralForecast models


flyfish
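This post pins down what windows_batch actually contains: first we dump every attribute of a fitted model to locate the window-related hyperparameters, then we trace how the training windows are built, oversampled to windows_batch_size, and finally handed to the model.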

```python
import pandas as pd
import numpy as np

AirPassengers = np.array(
    [112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0],
    dtype=np.float32,
)

AirPassengersDF = pd.DataFrame(
    {
        "unique_id": np.ones(len(AirPassengers)),
        "ds": pd.date_range(
            start="1949-01-01", periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
        ),
        "y": AirPassengers,
    }
)

Y_df = AirPassengersDF
Y_df = Y_df.reset_index(drop=True)
Y_df.head()
# Model Training

from neuralforecast.core import NeuralForecast
from neuralforecast.models import VanillaTransformer

horizon = 3
models = [VanillaTransformer(input_size=2 * horizon, h=horizon, max_steps=2)]

nf = NeuralForecast(models=models, freq='M')

for model in nf.models:
    print(f'Model: {model.__class__.__name__}')
    for param, value in model.__dict__.items():
        print(f'  {param}: {value}')
        
        
nf.fit(df=Y_df)

```

Output

```text
Seed set to 1
Model: VanillaTransformer
  training: True
  _parameters: OrderedDict()
  _buffers: OrderedDict()
  _non_persistent_buffers_set: set()
  _backward_pre_hooks: OrderedDict()
  _backward_hooks: OrderedDict()
  _is_full_backward_hook: None
  _forward_hooks: OrderedDict()
  _forward_hooks_with_kwargs: OrderedDict()
  _forward_hooks_always_called: OrderedDict()
  _forward_pre_hooks: OrderedDict()
  _forward_pre_hooks_with_kwargs: OrderedDict()
  _state_dict_hooks: OrderedDict()
  _state_dict_pre_hooks: OrderedDict()
  _load_state_dict_pre_hooks: OrderedDict()
  _load_state_dict_post_hooks: OrderedDict()
  _modules: OrderedDict([('loss', MAE()), ('valid_loss', MAE()), ('padder_train', ConstantPad1d(padding=(0, 3), value=0)), ('scaler', TemporalNorm()), ('enc_embedding', DataEmbedding(
  (value_embedding): TokenEmbedding(
    (tokenConv): Conv1d(1, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
  )
  (position_embedding): PositionalEmbedding()
  (dropout): Dropout(p=0.05, inplace=False)
)), ('dec_embedding', DataEmbedding(
  (value_embedding): TokenEmbedding(
    (tokenConv): Conv1d(1, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
  )
  (position_embedding): PositionalEmbedding()
  (dropout): Dropout(p=0.05, inplace=False)
)), ('encoder', TransEncoder(
  (attn_layers): ModuleList(
    (0-1): 2 x TransEncoderLayer(
      (attention): AttentionLayer(
        (inner_attention): FullAttention(
          (dropout): Dropout(p=0.05, inplace=False)
        )
        (query_projection): Linear(in_features=128, out_features=128, bias=True)
        (key_projection): Linear(in_features=128, out_features=128, bias=True)
        (value_projection): Linear(in_features=128, out_features=128, bias=True)
        (out_projection): Linear(in_features=128, out_features=128, bias=True)
      )
      (conv1): Conv1d(128, 32, kernel_size=(1,), stride=(1,))
      (conv2): Conv1d(32, 128, kernel_size=(1,), stride=(1,))
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.05, inplace=False)
    )
  )
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
)), ('decoder', TransDecoder(
  (layers): ModuleList(
    (0): TransDecoderLayer(
      (self_attention): AttentionLayer(
        (inner_attention): FullAttention(
          (dropout): Dropout(p=0.05, inplace=False)
        )
        (query_projection): Linear(in_features=128, out_features=128, bias=True)
        (key_projection): Linear(in_features=128, out_features=128, bias=True)
        (value_projection): Linear(in_features=128, out_features=128, bias=True)
        (out_projection): Linear(in_features=128, out_features=128, bias=True)
      )
      (cross_attention): AttentionLayer(
        (inner_attention): FullAttention(
          (dropout): Dropout(p=0.05, inplace=False)
        )
        (query_projection): Linear(in_features=128, out_features=128, bias=True)
        (key_projection): Linear(in_features=128, out_features=128, bias=True)
        (value_projection): Linear(in_features=128, out_features=128, bias=True)
        (out_projection): Linear(in_features=128, out_features=128, bias=True)
      )
      (conv1): Conv1d(128, 32, kernel_size=(1,), stride=(1,))
      (conv2): Conv1d(32, 128, kernel_size=(1,), stride=(1,))
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.05, inplace=False)
    )
  )
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (projection): Linear(in_features=128, out_features=1, bias=True)
))])
  prepare_data_per_node: True
  allow_zero_length_dataloader_with_multiple_devices: False
  _log_hyperparams: True
  _dtype: torch.float32
  _device: cpu
  _trainer: None
  _example_input_array: None
  _automatic_optimization: True
  _strict_loading: None
  _current_fx_name: None
  _param_requires_grad_state: {}
  _metric_attributes: None
  _compiler_ctx: None
  _fabric: None
  _fabric_optimizers: []
  _hparams_name: kwargs
  _hparams: "activation":                    gelu
"alias":                         None
"batch_size":                    32
"conv_hidden_size":              32
"decoder_input_size_multiplier": 0.5
"decoder_layers":                1
"drop_last_loader":              False
"dropout":                       0.05
"early_stop_patience_steps":     -1
"encoder_layers":                2
"exclude_insample_y":            False
"futr_exog_list":                None
"h":                             3
"hidden_size":                   128
"hist_exog_list":                None
"inference_windows_batch_size":  1024
"input_size":                    6
"learning_rate":                 0.0001
"loss":                          MAE()
"lr_scheduler":                  None
"lr_scheduler_kwargs":           None
"max_steps":                     2
"n_head":                        4
"num_lr_decays":                 -1
"num_workers_loader":            0
"optimizer":                     None
"optimizer_kwargs":              None
"random_seed":                   1
"scaler_type":                   identity
"start_padding_enabled":         False
"stat_exog_list":                None
"step_size":                     1
"val_check_steps":               100
"valid_batch_size":              None
"valid_loss":                    None
"windows_batch_size":            1024
  _hparams_initial: "activation":                    gelu
"alias":                         None
"batch_size":                    32
"conv_hidden_size":              32
"decoder_input_size_multiplier": 0.5
"decoder_layers":                1
"drop_last_loader":              False
"dropout":                       0.05
"early_stop_patience_steps":     -1
"encoder_layers":                2
"exclude_insample_y":            False
"futr_exog_list":                None
"h":                             3
"hidden_size":                   128
"hist_exog_list":                None
"inference_windows_batch_size":  1024
"input_size":                    6
"learning_rate":                 0.0001
"loss":                          MAE()
"lr_scheduler":                  None
"lr_scheduler_kwargs":           None
"max_steps":                     2
"n_head":                        4
"num_lr_decays":                 -1
"num_workers_loader":            0
"optimizer":                     None
"optimizer_kwargs":              None
"random_seed":                   1
"scaler_type":                   identity
"start_padding_enabled":         False
"stat_exog_list":                None
"step_size":                     1
"val_check_steps":               100
"valid_batch_size":              None
"valid_loss":                    None
"windows_batch_size":            1024
  random_seed: 1
  train_trajectories: []
  valid_trajectories: []
  optimizer: None
  optimizer_kwargs: {}
  lr_scheduler: None
  lr_scheduler_kwargs: {}
  futr_exog_list: []
  hist_exog_list: []
  stat_exog_list: []
  futr_exog_size: 0
  hist_exog_size: 0
  stat_exog_size: 0
  trainer_kwargs: {'max_steps': 2, 'enable_checkpointing': False}
  h: 3
  input_size: 6
  windows_batch_size: 1024
  start_padding_enabled: False
  batch_size: 32
  valid_batch_size: 32
  inference_windows_batch_size: 1024
  learning_rate: 0.0001
  max_steps: 2
  num_lr_decays: -1
  lr_decay_steps: 100000000.0
  early_stop_patience_steps: -1
  val_check_steps: 100
  step_size: 1
  exclude_insample_y: False
  val_size: 0
  test_size: 0
  decompose_forecast: False
  num_workers_loader: 0
  drop_last_loader: False
  validation_step_outputs: []
  alias: None
  label_len: 3
  c_out: 1
  output_attention: False
  enc_in: 1
```
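Buried in this dump, the entries that govern windowing are input_size: 6, h: 3, step_size: 1, windows_batch_size: 1024, and inference_windows_batch_size: 1024; the rest of the post shows what they do.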

An example of how the windows are constructed

The same data again, this time with an NBEATS model:

```python

import pandas as pd
import numpy as np

AirPassengers = np.array(
    [112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0],
    dtype=np.float32,
)

AirPassengersDF = pd.DataFrame(
    {
        "unique_id": np.ones(len(AirPassengers)),
        "ds": pd.date_range(
            start="1949-01-01", periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
        ),
        "y": AirPassengers,
    }
)

Y_df = AirPassengersDF
Y_df = Y_df.reset_index(drop=True)
Y_df.head()
# Model Training

from neuralforecast.core import NeuralForecast
from neuralforecast.models import NBEATS

horizon = 3
models = [NBEATS(input_size=2 * horizon, h=horizon, max_steps=2)]

nf = NeuralForecast(models=models, freq='M')
nf.fit(df=Y_df)

```

window_size is the total size of each window; it is determined by input_size and h:

9 = input_size (6) + h (3)

Compare this with the original series: the window slides down one row at a time (step_size = 1). Once the window start reaches 132.0, there are no longer 9 observed rows left, so the remainder is zero-padded (with available_mask set to 0).

The resulting tensor is windows1, with shape torch.Size([4, 9, 2]):

```python
windows1: tensor([
        [[112.,   1.],
         [118.,   1.],
         [132.,   1.],
         [129.,   1.],
         [121.,   1.],
         [135.,   1.],
         [148.,   1.],
         [148.,   1.],
         [136.,   1.]],

        [[118.,   1.],
         [132.,   1.],
         [129.,   1.],
         [121.,   1.],
         [135.,   1.],
         [148.,   1.],
         [148.,   1.],
         [136.,   1.],
         [119.,   1.]],

        [[132.,   1.],
         [129.,   1.],
         [121.,   1.],
         [135.,   1.],
         [148.,   1.],
         [148.,   1.],
         [136.,   1.],
         [119.,   1.],
         [  0.,   0.]],

        [[129.,   1.],
         [121.,   1.],
         [135.,   1.],
         [148.,   1.],
         [148.,   1.],
         [136.,   1.],
         [119.,   1.],
         [  0.,   0.],
         [  0.,   0.]]])
```
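To make this concrete, here is a minimal plain-PyTorch sketch that reproduces windows1. It approximates what the library's windowing does (pad h zeros via padder_train, then unfold with step_size = 1); the final filtering rule is an assumption chosen to match the 4 windows above, not NeuralForecast's exact code.

```python
import torch
import torch.nn.functional as F

# The series plus an availability mask (1 = observed, 0 = padding).
y    = torch.tensor([112., 118., 132., 129., 121., 135., 148., 148., 136., 119.])
mask = torch.ones_like(y)

h, input_size = 3, 6
window_size = input_size + h                 # 9 = input_size (6) + h (3)
temporal = torch.stack([y, mask])            # [2, 10]; rows: y, available_mask

# padder_train = ConstantPad1d((0, 3), 0): append h zeros so the last
# observed points can still open a window, then slide one step at a time.
padded  = F.pad(temporal, (0, h))            # [2, 13]
windows = padded.unfold(-1, window_size, 1)  # [2, 5, 9]
windows = windows.permute(1, 2, 0)           # [5, 9, 2]

# Assumption: a window whose entire horizon is padding has nothing to
# train on, so we drop it; this leaves the 4 windows printed above.
keep = windows[:, input_size:, 1].sum(dim=1) > 0
windows1 = windows[keep]
print("windows1 shape:", windows1.shape)     # torch.Size([4, 9, 2])
```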

windows_batch_size

Finally, windows1 of shape torch.Size([4, 9, 2]) becomes windows2 of shape torch.Size([1024, 9, 2]).

That 1024 is exactly the argument we passed: windows_batch_size = 1024.

Four of the sampled windows are listed below; in reality there are 1024. In other words, 1024 windows are sampled (necessarily with repetition, since only 4 distinct windows exist), each of size 9 with 2 features:

```python
....
    [[118.,   1.],
     [132.,   1.],
     [129.,   1.],
     [121.,   1.],
     [135.,   1.],
     [148.,   1.],
     [148.,   1.],
     [136.,   1.],
     [119.,   1.]],

    [[129.,   1.],
     [121.,   1.],
     [135.,   1.],
     [148.,   1.],
     [148.,   1.],
     [136.,   1.],
     [119.,   1.],
     [  0.,   0.],
     [  0.,   0.]],

    [[118.,   1.],
     [132.,   1.],
     [129.,   1.],
     [121.,   1.],
     [135.,   1.],
     [148.,   1.],
     [148.,   1.],
     [136.,   1.],
     [119.,   1.]],

    [[118.,   1.],
     [132.,   1.],
     [129.,   1.],
     [121.,   1.],
     [135.,   1.],
     [148.,   1.],
     [148.,   1.],
     [136.,   1.],
     [119.,   1.]],
```
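The jump from 4 windows to 1024 is plain resampling. Continuing from the previous sketch's windows1, and assuming uniform sampling with replacement (which is why identical windows repeat in the dump above):

```python
import torch  # windows1 comes from the previous sketch

# Oversample the 4 available windows up to windows_batch_size.
windows_batch_size = 1024
idx = torch.randint(0, windows1.shape[0], (windows_batch_size,))
windows2 = windows1[idx]                     # [1024, 9, 2]
print("windows2 shape:", windows2.shape)     # torch.Size([1024, 9, 2])
```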

At training time, the batch ultimately returned to the model is:

```python
windows_batch: {'temporal': <the 1024 sampled windows>,  # torch.Size([1024, 9, 2])
                'temporal_cols': Index(['y', 'available_mask'], dtype='object'),
                'static': None,
                'static_cols': None}
```
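Continuing the sketches above, this is roughly how a windows-based model consumes the batch: the first input_size steps are the model input, the last h steps are the target, and available_mask keeps zero-padded positions out of the loss. The variable names are mine; NeuralForecast's actual training_step may differ in detail.

```python
# Split each [9, 2] window into the input segment and the forecast horizon.
temporal = windows2                                  # [1024, 9, 2]
y_idx = 0                                            # column order: ['y', 'available_mask']

insample_y     = temporal[:, :input_size, y_idx]     # [1024, 6] model input
insample_mask  = temporal[:, :input_size, 1]
outsample_y    = temporal[:, input_size:, y_idx]     # [1024, 3] training target
outsample_mask = temporal[:, input_size:, 1]         # masks padded steps in MAE
```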