NeuralForecast 模型的参数 windows_batch 的含义
flyfish
py
# Demo script: build a tiny monthly series, configure a VanillaTransformer,
# print every attribute of the model object, then fit it.
import pandas as pd
import numpy as np

# Ten monthly values; the series is kept tiny on purpose so that every
# training window derived from it can be listed by hand.
AirPassengers = np.array(
    [112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0],
    dtype=np.float32,
)

# Long-format frame with the three columns handed to NeuralForecast:
#   unique_id - series identifier (all ones, i.e. a single series)
#   ds        - month-end timestamps, one per observation, from 1949-01
#   y         - the observed values
AirPassengersDF = pd.DataFrame(
    {
        "unique_id": np.ones(len(AirPassengers)),
        "ds": pd.date_range(
            start="1949-01-01", periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
        ),
        "y": AirPassengers,
    }
)
Y_df = AirPassengersDF
Y_df = Y_df.reset_index(drop=True)
# Preview of the first rows; the result is only displayed when this runs as
# the last expression of a notebook cell / interactive prompt.
Y_df.head()

# Model training
from neuralforecast.core import NeuralForecast
from neuralforecast.models import VanillaTransformer

horizon = 3  # forecast length h
# input_size = 2 * horizon = 6 past steps per window; max_steps=2 keeps the
# demo run very short.
models = [VanillaTransformer(input_size=2 * horizon, h=horizon, max_steps=2)]
nf = NeuralForecast(models=models, freq='M')

# Dump every instance attribute of each configured model so the effective
# hyper-parameters (including windows_batch_size) can be inspected.
for model in nf.models:
    print(f'Model: {model.__class__.__name__}')
    for param, value in model.__dict__.items():
        print(f' {param}: {value}')

nf.fit(df=Y_df)
输出
json
Seed set to 1
Model: VanillaTransformer
training: True
_parameters: OrderedDict()
_buffers: OrderedDict()
_non_persistent_buffers_set: set()
_backward_pre_hooks: OrderedDict()
_backward_hooks: OrderedDict()
_is_full_backward_hook: None
_forward_hooks: OrderedDict()
_forward_hooks_with_kwargs: OrderedDict()
_forward_hooks_always_called: OrderedDict()
_forward_pre_hooks: OrderedDict()
_forward_pre_hooks_with_kwargs: OrderedDict()
_state_dict_hooks: OrderedDict()
_state_dict_pre_hooks: OrderedDict()
_load_state_dict_pre_hooks: OrderedDict()
_load_state_dict_post_hooks: OrderedDict()
_modules: OrderedDict([('loss', MAE()), ('valid_loss', MAE()), ('padder_train', ConstantPad1d(padding=(0, 3), value=0)), ('scaler', TemporalNorm()), ('enc_embedding', DataEmbedding(
(value_embedding): TokenEmbedding(
(tokenConv): Conv1d(1, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
)
(position_embedding): PositionalEmbedding()
(dropout): Dropout(p=0.05, inplace=False)
)), ('dec_embedding', DataEmbedding(
(value_embedding): TokenEmbedding(
(tokenConv): Conv1d(1, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
)
(position_embedding): PositionalEmbedding()
(dropout): Dropout(p=0.05, inplace=False)
)), ('encoder', TransEncoder(
(attn_layers): ModuleList(
(0-1): 2 x TransEncoderLayer(
(attention): AttentionLayer(
(inner_attention): FullAttention(
(dropout): Dropout(p=0.05, inplace=False)
)
(query_projection): Linear(in_features=128, out_features=128, bias=True)
(key_projection): Linear(in_features=128, out_features=128, bias=True)
(value_projection): Linear(in_features=128, out_features=128, bias=True)
(out_projection): Linear(in_features=128, out_features=128, bias=True)
)
(conv1): Conv1d(128, 32, kernel_size=(1,), stride=(1,))
(conv2): Conv1d(32, 128, kernel_size=(1,), stride=(1,))
(norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.05, inplace=False)
)
)
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
)), ('decoder', TransDecoder(
(layers): ModuleList(
(0): TransDecoderLayer(
(self_attention): AttentionLayer(
(inner_attention): FullAttention(
(dropout): Dropout(p=0.05, inplace=False)
)
(query_projection): Linear(in_features=128, out_features=128, bias=True)
(key_projection): Linear(in_features=128, out_features=128, bias=True)
(value_projection): Linear(in_features=128, out_features=128, bias=True)
(out_projection): Linear(in_features=128, out_features=128, bias=True)
)
(cross_attention): AttentionLayer(
(inner_attention): FullAttention(
(dropout): Dropout(p=0.05, inplace=False)
)
(query_projection): Linear(in_features=128, out_features=128, bias=True)
(key_projection): Linear(in_features=128, out_features=128, bias=True)
(value_projection): Linear(in_features=128, out_features=128, bias=True)
(out_projection): Linear(in_features=128, out_features=128, bias=True)
)
(conv1): Conv1d(128, 32, kernel_size=(1,), stride=(1,))
(conv2): Conv1d(32, 128, kernel_size=(1,), stride=(1,))
(norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.05, inplace=False)
)
)
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(projection): Linear(in_features=128, out_features=1, bias=True)
))])
prepare_data_per_node: True
allow_zero_length_dataloader_with_multiple_devices: False
_log_hyperparams: True
_dtype: torch.float32
_device: cpu
_trainer: None
_example_input_array: None
_automatic_optimization: True
_strict_loading: None
_current_fx_name: None
_param_requires_grad_state: {}
_metric_attributes: None
_compiler_ctx: None
_fabric: None
_fabric_optimizers: []
_hparams_name: kwargs
_hparams: "activation": gelu
"alias": None
"batch_size": 32
"conv_hidden_size": 32
"decoder_input_size_multiplier": 0.5
"decoder_layers": 1
"drop_last_loader": False
"dropout": 0.05
"early_stop_patience_steps": -1
"encoder_layers": 2
"exclude_insample_y": False
"futr_exog_list": None
"h": 3
"hidden_size": 128
"hist_exog_list": None
"inference_windows_batch_size": 1024
"input_size": 6
"learning_rate": 0.0001
"loss": MAE()
"lr_scheduler": None
"lr_scheduler_kwargs": None
"max_steps": 2
"n_head": 4
"num_lr_decays": -1
"num_workers_loader": 0
"optimizer": None
"optimizer_kwargs": None
"random_seed": 1
"scaler_type": identity
"start_padding_enabled": False
"stat_exog_list": None
"step_size": 1
"val_check_steps": 100
"valid_batch_size": None
"valid_loss": None
"windows_batch_size": 1024
_hparams_initial: "activation": gelu
"alias": None
"batch_size": 32
"conv_hidden_size": 32
"decoder_input_size_multiplier": 0.5
"decoder_layers": 1
"drop_last_loader": False
"dropout": 0.05
"early_stop_patience_steps": -1
"encoder_layers": 2
"exclude_insample_y": False
"futr_exog_list": None
"h": 3
"hidden_size": 128
"hist_exog_list": None
"inference_windows_batch_size": 1024
"input_size": 6
"learning_rate": 0.0001
"loss": MAE()
"lr_scheduler": None
"lr_scheduler_kwargs": None
"max_steps": 2
"n_head": 4
"num_lr_decays": -1
"num_workers_loader": 0
"optimizer": None
"optimizer_kwargs": None
"random_seed": 1
"scaler_type": identity
"start_padding_enabled": False
"stat_exog_list": None
"step_size": 1
"val_check_steps": 100
"valid_batch_size": None
"valid_loss": None
"windows_batch_size": 1024
random_seed: 1
train_trajectories: []
valid_trajectories: []
optimizer: None
optimizer_kwargs: {}
lr_scheduler: None
lr_scheduler_kwargs: {}
futr_exog_list: []
hist_exog_list: []
stat_exog_list: []
futr_exog_size: 0
hist_exog_size: 0
stat_exog_size: 0
trainer_kwargs: {'max_steps': 2, 'enable_checkpointing': False}
h: 3
input_size: 6
windows_batch_size: 1024
start_padding_enabled: False
batch_size: 32
valid_batch_size: 32
inference_windows_batch_size: 1024
learning_rate: 0.0001
max_steps: 2
num_lr_decays: -1
lr_decay_steps: 100000000.0
early_stop_patience_steps: -1
val_check_steps: 100
step_size: 1
exclude_insample_y: False
val_size: 0
test_size: 0
decompose_forecast: False
num_workers_loader: 0
drop_last_loader: False
validation_step_outputs: []
alias: None
label_len: 3
c_out: 1
output_attention: False
enc_in: 1
举例说明:如何构建 windows(窗口)
# Worked example: build a 10-point toy series and fit an NBEATS model on it.
# The surrounding text uses this run to show how training windows are built.
import pandas as pd
import numpy as np
# Ten monthly values; kept tiny on purpose so that every window can be
# listed in full.
AirPassengers = np.array(
[112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0],
dtype=np.float32,
)
# Long-format frame with the columns handed to NeuralForecast:
#   unique_id - series identifier (all ones, i.e. a single series)
#   ds        - month-end timestamps, one per observation, from 1949-01
#   y         - the observed values
AirPassengersDF = pd.DataFrame(
{
"unique_id": np.ones(len(AirPassengers)),
"ds": pd.date_range(
start="1949-01-01", periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
),
"y": AirPassengers,
}
)
Y_df = AirPassengersDF
Y_df = Y_df.reset_index(drop=True)
# Preview of the first rows; only displayed when run interactively
# (e.g. as the last expression of a notebook cell).
Y_df.head()
# Model training
from neuralforecast.core import NeuralForecast
from neuralforecast.models import NBEATS
# Forecast length h = 3; input_size = 2 * horizon = 6, so each training
# window spans input_size + h = 9 rows.
horizon = 3
# max_steps=2 keeps the demo run very short.
models = [NBEATS(input_size=2 * horizon, h=horizon, max_steps=2)]
nf = NeuralForecast(models=models, freq='M')
nf.fit(df=Y_df)
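window_size 是每个窗口的总长度,由 input_size 和 h 相加得到:
window_size = input_size(6) + h(3) = 9
与原始数据集对比可以看到,窗口每次向后滑动一行(step_size = 1)。
当窗口的起点移动到 132.0 时,剩余数据已不足 9 行,缺少的行用 0 填充(第二列 available_mask 也为 0,表示该行不是真实数据)。
最终得到 4 个窗口,形状为 windows1 shape: torch.Size([4, 9, 2]),即 [窗口数, window_size, 特征数(y 和 available_mask)]。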
py
window1: tensor([
[[112., 1.],
[118., 1.],
[132., 1.],
[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.]],
[[118., 1.],
[132., 1.],
[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.]],
[[132., 1.],
[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.],
[ 0., 0.]],
[[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.],
[ 0., 0.],
[ 0., 0.]]])
windows_batch_size
最后,窗口由 windows1 shape: torch.Size([4, 9, 2]) 变成了 windows2 shape: torch.Size([1024, 9, 2])
其中的 1024 就是参数 windows_batch_size 的值(示例中没有显式传入,使用的是默认值 1024)。
下面只列出其中 4 个窗口,实际共有 1024 个。
这表示从上面的 4 个窗口中采样了 1024 个窗口(可用窗口只有 4 个,因此结果中会出现重复),每个窗口长度为 9,包含 2 个特征(y 和 available_mask)。
py
....
[[118., 1.],
[132., 1.],
[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.]],
[[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.],
[ 0., 0.],
[ 0., 0.]],
[[118., 1.],
[132., 1.],
[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.]],
[[118., 1.],
[132., 1.],
[129., 1.],
[121., 1.],
[135., 1.],
[148., 1.],
[148., 1.],
[136., 1.],
[119., 1.]],
最终训练时返回的 windows_batch 数据如下:
py
windows_batch: {'temporal': 1024 个窗口数据,
'temporal_cols': Index(['y', 'available_mask'], dtype='object'),
'static': None,
'static_cols': None}