Table of Contents
- Deep Learning Hyperparameter Tuning
- Hyperparameter Tuning Frameworks
  - Optuna: Deep Learning Hyperparameter Optimization Framework
  - NVIDIA NeMo: Hyperparameter Optimization Framework for Large Models

Tuning theory: Black-Box Optimization: A Summary of Recent Advances in Hyperparameter Optimization Algorithms
- All of the content below is reposted; contact me for removal in case of infringement.
Deep Learning Hyperparameter Tuning
Reference articles:
- PyTorch grid search for optimal LSTM parameters / Python grid search for parameter optimization
- Keras deep learning hyperparameter optimization: official guide
- Keras deep learning hyperparameter optimization guide (CSDN blog version)
- Hyperparameter search not efficient enough? Get to know these key strategies
- Hyperparameter optimization for deep neural networks with Bayesian optimization
Grid Search
Example 1: grid-searching the hyperparameters of a regression model
```python
# grid search CNN for the airline passengers dataset
from math import sqrt
from numpy import array, mean
from pandas import DataFrame, concat, read_csv
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPooling1D

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

# transform a list into supervised learning format
def series_to_supervised(data, n_in=1, n_out=1):
    df = DataFrame(data)
    cols = list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    # put it all together
    agg = concat(cols, axis=1)
    # drop rows with NaN values
    agg.dropna(inplace=True)
    return agg.values

# root mean squared error, or RMSE
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

# difference the dataset
def difference(data, order):
    return [data[i] - data[i - order] for i in range(order, len(data))]

# fit a model
def model_fit(train, config):
    # unpack config
    n_input, n_filters, n_kernel, n_epochs, n_batch, n_diff = config
    # prepare data
    if n_diff > 0:
        train = difference(train, n_diff)
    # transform series into supervised format
    data = series_to_supervised(train, n_in=n_input)
    # separate inputs and outputs
    train_x, train_y = data[:, :-1], data[:, -1]
    # reshape input data into [samples, timesteps, features]
    n_features = 1
    train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], n_features))
    # define model
    model = Sequential()
    model.add(Conv1D(filters=n_filters, kernel_size=n_kernel, activation='relu', input_shape=(n_input, n_features)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')
    # fit
    model.fit(train_x, train_y, epochs=n_epochs, batch_size=n_batch, verbose=0)
    return model

# forecast with the fit model
def model_predict(model, history, config):
    # unpack config
    n_input, _, _, _, _, n_diff = config
    # prepare data
    correction = 0.0
    if n_diff > 0:
        correction = history[-n_diff]
        history = difference(history, n_diff)
    x_input = array(history[-n_input:]).reshape((1, n_input, 1))
    # forecast
    yhat = model.predict(x_input, verbose=0)
    return correction + yhat[0]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # fit model
    model = model_fit(train, cfg)
    # seed history with the training dataset
    history = [x for x in train]
    # step over each time step in the test set
    for i in range(len(test)):
        # use the fit model to make a forecast for history
        yhat = model_predict(model, history, cfg)
        # store forecast in the list of predictions
        predictions.append(yhat)
        # add the actual observation to history for the next loop
        history.append(test[i])
    # estimate prediction error
    error = measure_rmse(test, predictions)
    print(' > %.3f' % error)
    return error

# score a config by repeated evaluation
def repeat_evaluate(data, config, n_test, n_repeats=10):
    # convert config to a key
    key = str(config)
    # fit and evaluate the model n times
    scores = [walk_forward_validation(data, n_test, config) for _ in range(n_repeats)]
    # summarize score
    result = mean(scores)
    print('> Model[%s] %.3f' % (key, result))
    return (key, result)

# grid search configs
def grid_search(data, cfg_list, n_test):
    # evaluate configs
    scores = [repeat_evaluate(data, cfg, n_test) for cfg in cfg_list]
    # sort configs by error, ascending
    scores.sort(key=lambda tup: tup[1])
    return scores

# create a list of configs to try
def model_configs():
    # define the scope of configs
    n_input = [12]
    n_filters = [64]
    n_kernels = [3, 5]
    n_epochs = [100]
    n_batch = [1, 150]
    n_diff = [0, 12]
    # create configs
    configs = list()
    for a in n_input:
        for b in n_filters:
            for c in n_kernels:
                for d in n_epochs:
                    for e in n_batch:
                        for f in n_diff:
                            cfg = [a, b, c, d, e, f]
                            configs.append(cfg)
    print('Total configs: %d' % len(configs))
    return configs

# define dataset
# download: https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv
series = read_csv('airline-passengers.csv', header=0, index_col=0)
data = series.values
# data split
n_test = 12
# model configs
cfg_list = model_configs()
# grid search
scores = grid_search(data, cfg_list, n_test)
print('done')
# list the top 3 configs
for cfg, error in scores[:3]:
    print(cfg, error)
```
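Evaluating every config ten times makes the grid slow on a single machine. As a hedged sketch (not part of the reposted code), the independent `repeat_evaluate` calls can be dispatched in parallel with joblib; `n_jobs=-1` and the `multiprocessing` backend are assumptions to adapt to your machine:

```python
# a minimal sketch: run the per-config evaluations in parallel with joblib
# (assumes the functions above are already defined in the same script)
from joblib import Parallel, delayed

def grid_search_parallel(data, cfg_list, n_test):
    # each config evaluation is independent, so they parallelize trivially
    tasks = (delayed(repeat_evaluate)(data, cfg, n_test) for cfg in cfg_list)
    scores = Parallel(n_jobs=-1, backend='multiprocessing')(tasks)
    scores.sort(key=lambda tup: tup[1])
    return scores
```

TensorFlow models do not always fork cleanly across worker processes; if the workers hang, fall back to the sequential `grid_search` above.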
Example 2: grid search with Keras and scikit-learn
```python
"""
Tuning the batch size and the number of epochs
"""
# Use scikit-learn to grid search the batch size and epochs
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

# Function to create the model, required for KerasClassifier
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(12, input_shape=(8,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)
# load dataset
dataset = np.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
X = dataset[:, 0:8]
Y = dataset[:, 8]
# create model
model = KerasClassifier(model=create_model, verbose=0)
# define the grid search parameters
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

"""
More examples: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
"""
```
Random Search
```python
# Template (not directly runnable): random search with RandomizedSearchCV.
# The placeholders hparams1 ... hparamsn and dvalue stand in for your own hyperparameters.
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from scikeras.wrappers import KerasClassifier

# Load the dataset
X, Y = load_dataset()

# Create model for KerasClassifier
def create_model(hparams1=dvalue,
                 hparams2=dvalue,
                 ...
                 hparamsn=dvalue):
    # Model definition
    ...

model = KerasClassifier(model=create_model)

# Specify parameters and distributions to sample from
hparams1 = randint(1, 100)
hparams2 = ['elu', 'relu', ...]
...
hparamsn = uniform(0, 1)

# Prepare the dict for the search
param_dist = dict(hparams1=hparams1,
                  hparams2=hparams2,
                  ...
                  hparamsn=hparamsn)

# Search in action!
n_iter_search = 16  # number of parameter settings that are sampled
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   n_jobs=-1,  # example values; set for your machine
                                   cv=3,
                                   verbose=1)
random_search.fit(X, Y)

# Show the results
print("Best: %f using %s" % (random_search.best_score_, random_search.best_params_))
means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
params = random_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
```
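To make the template concrete, here is a minimal sketch instantiating it on the same pima-indians-diabetes setup as Example 2; the searched distributions (hidden units, dropout rate) are illustrative assumptions, not values from the original post:

```python
# a minimal concrete instance of the template above, reusing the pima dataset;
# the distributions below are illustrative assumptions
import numpy as np
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def create_model(neurons=12, dropout=0.0):
    model = Sequential()
    model.add(Dense(neurons, input_shape=(8,), activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

dataset = np.loadtxt("pima-indians-diabetes.csv", delimiter=",")
X, Y = dataset[:, 0:8], dataset[:, 8]

model = KerasClassifier(model=create_model, epochs=50, batch_size=10, verbose=0)
param_dist = dict(model__neurons=randint(4, 32),     # sample integers in [4, 32)
                  model__dropout=uniform(0.0, 0.5))  # sample floats in [0.0, 0.5]
random_search = RandomizedSearchCV(model, param_dist, n_iter=16, cv=3, n_jobs=-1)
result = random_search.fit(X, Y)
print("Best: %f using %s" % (result.best_score_, result.best_params_))
```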
Bayesian Search (Keras Tuner)
```python
"""
Prepare the data
"""
import tensorflow as tf
import keras_tuner as kt  # pip3 install keras-tuner
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# split into train, validation and test sets
train_x, val_x, train_y, val_y = train_test_split(train_images, train_labels, stratify=train_labels, random_state=48, test_size=0.05)
(test_x, test_y) = (test_images, test_labels)
# normalize pixels to range 0-1
train_x = train_x / 255.0
val_x = val_x / 255.0
test_x = test_x / 255.0
# one-hot encode the target variable
train_y = to_categorical(train_y)
val_y = to_categorical(val_y)
test_y = to_categorical(test_y)
# added: flattened views for the MLP, channel-last views for the CNN
train_x_mlp, val_x_mlp, test_x_mlp = train_x.reshape(-1, 784), val_x.reshape(-1, 784), test_x.reshape(-1, 784)
train_x_cnn, val_x_cnn, test_x_cnn = train_x.reshape(-1, 28, 28, 1), val_x.reshape(-1, 28, 28, 1), test_x.reshape(-1, 28, 28, 1)
# added: shared seed and early-stopping callback used by the tuners below
random_seed = 42
callback = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]
"""
调整获取最优参数(MLP版)
"""
model = Sequential()
model.add(Dense(units = hp.Int('dense-bot', min_value=50, max_value=350, step=50), input_shape=(784,), activation='relu'))
for i in range(hp.Int('num_dense_layers', 1, 2)):
model.add(Dense(units=hp.Int('dense_' + str(i), min_value=50, max_value=100, step=25), activation='relu'))
model.add(Dropout(hp.Choice('dropout_'+ str(i), values=[0.0, 0.1, 0.2])))
model.add(Dense(10,activation="softmax"))
hp_optimizer=hp.Choice('Optimizer', values=['Adam', 'SGD'])
if hp_optimizer == 'Adam':
hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])
elif hp_optimizer == 'SGD':
hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])
nesterov=True
momentum=0.9
model.compile(optimizer = hp_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
tuner_mlp = kt.tuners.BayesianOptimization(
model,
seed=random_seed,
objective='val_loss',
max_trials=30,
directory='.',
project_name='tuning-mlp')
tuner_mlp.search(train_x, train_y, epochs=50, batch_size=32, validation_data=(dev_x, dev_y), callbacks=callback)
best_mlp_hyperparameters = tuner_mlp.get_best_hyperparameters(1)[0]
print("Best Hyper-parameters")
# best_mlp_hyperparameters.values
"""
使用最优参数来训练模型
"""
model_mlp = Sequential()
model_mlp.add(Dense(best_mlp_hyperparameters['dense-bot'], input_shape=(784,), activation='relu'))
for i in range(best_mlp_hyperparameters['num_dense_layers']):
model_mlp.add(Dense(units=best_mlp_hyperparameters['dense_' +str(i)], activation='relu'))
model_mlp.add(Dropout(rate=best_mlp_hyperparameters['dropout_' +str(i)]))
model_mlp.add(Dense(10,activation="softmax"))
model_mlp.compile(optimizer=best_mlp_hyperparameters['Optimizer'], loss='categorical_crossentropy',metrics=['accuracy'])
history_mlp= model_mlp.fit(train_x, train_y, epochs=100, batch_size=32, validation_data=(dev_x, dev_y), callbacks=callback)
# model_mlp=tuner_mlp.hypermodel.build(best_mlp_hyperparameters)
# history_mlp=model_mlp.fit(train_x, train_y, epochs=100, batch_size=32, validation_data=(dev_x, dev_y), callbacks=callback)
"""
效果测试
"""
mlp_test_loss, mlp_test_acc = model_mlp.evaluate(test_x, test_y, verbose=2)
print('\nTest accuracy:', mlp_test_acc)
# Test accuracy: 0.8823
"""
CNN版
"""
"""
基线模型
"""
model_cnn = Sequential()
model_cnn.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model_cnn.add(MaxPooling2D((2, 2)))
model_cnn.add(Flatten())
model_cnn.add(Dense(100, activation='relu'))
model_cnn.add(Dense(10, activation='softmax'))
model_cnn.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
"""
贝叶斯搜索超参数
"""
"""
Bayesian search over the CNN hyperparameters
"""
def build_cnn_model(hp):  # same pattern as the MLP: a build function for the tuner
    model = Sequential()
    model.add(Input(shape=(28, 28, 1)))
    for i in range(hp.Int('num_blocks', 1, 2)):
        hp_padding = hp.Choice('padding_' + str(i), values=['valid', 'same'])
        hp_filters = hp.Choice('filters_' + str(i), values=[32, 64])
        model.add(Conv2D(hp_filters, (3, 3), padding=hp_padding, activation='relu', kernel_initializer='he_uniform'))
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(hp.Choice('dropout_' + str(i), values=[0.0, 0.1, 0.2])))
    model.add(Flatten())
    hp_units = hp.Int('units', min_value=25, max_value=150, step=25)
    model.add(Dense(hp_units, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation="softmax"))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])
    hp_optimizer = hp.Choice('Optimizer', values=['Adam', 'SGD'])
    if hp_optimizer == 'Adam':
        optimizer = Adam(learning_rate=hp_learning_rate)
    else:  # SGD
        optimizer = SGD(learning_rate=hp_learning_rate, nesterov=True, momentum=0.9)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

tuner_cnn = kt.tuners.BayesianOptimization(
    build_cnn_model,
    objective='val_loss',
    max_trials=100,
    directory='.',
    project_name='tuning-cnn')
# run the search and fetch the best hyperparameters (missing from the reposted snippet)
tuner_cnn.search(train_x_cnn, train_y, epochs=50, batch_size=32, validation_data=(val_x_cnn, val_y), callbacks=callback)
best_cnn_hyperparameters = tuner_cnn.get_best_hyperparameters(1)[0]
"""
采用最佳超参数训练模型
"""
model_cnn = Sequential()
model_cnn.add(Input(shape=(28, 28, 1)))
for i in range(best_cnn_hyperparameters['num_blocks']):
hp_padding=best_cnn_hyperparameters['padding_'+ str(i)]
hp_filters=best_cnn_hyperparameters['filters_'+ str(i)]
model_cnn.add(Conv2D(hp_filters, (3, 3), padding=hp_padding, activation='relu', kernel_initializer='he_uniform', input_shape=(28, 28, 1)))
model_cnn.add(MaxPooling2D((2, 2)))
model_cnn.add(Dropout(best_cnn_hyperparameters['dropout_'+ str(i)]))
model_cnn.add(Flatten())
model_cnn.add(Dense(best_cnn_hyperparameters['units'], activation='relu', kernel_initializer='he_uniform'))
model_cnn.add(Dense(10,activation="softmax"))
model_cnn.compile(optimizer=best_cnn_hyperparameters['Optimizer'],
loss='categorical_crossentropy',
metrics=['accuracy'])
print(model_cnn.summary())
history_cnn= model_cnn.fit(train_x, train_y, epochs=50, batch_size=32, validation_data=(dev_x, dev_y), callbacks=callback)
cnn_test_loss, cnn_test_acc = model_cnn.evaluate(test_x, test_y, verbose=2)
print('\nTest accuracy:', cnn_test_acc)
# Test accuracy: 0.92
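Keras Tuner is not limited to Bayesian optimization: swapping the tuner class is usually a one-line change. A minimal sketch driving the same `build_cnn_model` with the Hyperband tuner, where `max_epochs` and `factor` are illustrative values, not from the original post:

```python
# a minimal sketch: the same build function with Keras Tuner's Hyperband tuner
tuner_hb = kt.Hyperband(
    build_cnn_model,
    objective='val_loss',
    max_epochs=30,   # upper bound on epochs per trial
    factor=3,        # successive-halving reduction factor
    directory='.',
    project_name='tuning-cnn-hyperband')
# Hyperband schedules the epoch budget itself, so no epochs argument here
tuner_hb.search(train_x_cnn, train_y, validation_data=(val_x_cnn, val_y), callbacks=callback)
```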
Hyperparameter Tuning Frameworks
Optuna: Deep Learning Hyperparameter Optimization Framework
```python
import os
import optuna
import plotly
import sklearn.datasets
import sklearn.linear_model
import sklearn.model_selection
from optuna.trial import TrialState
# the torch imports support the PyTorch sketch at the end of this section
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_parallel_coordinate

# The code below defines the search space for SGDClassifier: the alpha and
# max_iter ranges, and the set of candidate loss functions.
def objective(trial):
    iris = sklearn.datasets.load_iris()
    classes = list(set(iris.target))
    train_x, valid_x, train_y, valid_y = sklearn.model_selection.train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)
    # define the search space
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    max_iter = trial.suggest_int('max_iter', 64, 192, step=64)
    loss = trial.suggest_categorical('loss', ['hinge', 'log', 'perceptron'])
    clf = sklearn.linear_model.SGDClassifier(loss=loss, alpha=alpha, max_iter=max_iter)
    # added so the snippet forms a complete objective: return the validation error (lower is better)
    clf.fit(train_x, train_y)
    return 1.0 - clf.score(valid_x, valid_y)
# The code below defines the search space for the learning rate, the optimizer,
# and the number of units n_unit.
def objective(trial):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'optimizer': trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]),
        'n_unit': trial.suggest_int("n_unit", 4, 18)
    }
    # build_model and train_and_evaluate are user-supplied functions
    model = build_model(params)
    accuracy = train_and_evaluate(params, model)
    return accuracy
# Report intermediate values during training so that unpromising trials can be pruned
def objective(trial):
    iris = sklearn.datasets.load_iris()
    classes = list(set(iris.target))
    train_x, valid_x, train_y, valid_y = sklearn.model_selection.train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    max_iter = trial.suggest_int('max_iter', 64, 192, step=64)
    loss = trial.suggest_categorical('loss', ['hinge', 'log', 'perceptron'])
    clf = sklearn.linear_model.SGDClassifier(loss=loss, alpha=alpha, max_iter=max_iter)
    for step in range(100):
        clf.partial_fit(train_x, train_y, classes=classes)
        # report the intermediate validation error
        intermediate_value = 1.0 - clf.score(valid_x, valid_y)
        trial.report(intermediate_value, step)
        # prune the trial if it is not promising
        if trial.should_prune():
            raise optuna.TrialPruned()
    return 1.0 - clf.score(valid_x, valid_y)
# Create the study and run the optimization; the objective is the pruning
# version defined above.
study = optuna.create_study(storage='sqlite:///optuna.db',  # storage expects a DB URL; adjust the path
                            study_name='first',
                            pruner=optuna.pruners.MedianPruner())
# study = optuna.study.load_study('first', 'sqlite:///optuna.db')
study.optimize(objective, n_trials=20)
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
# Visualize the search results
vis_path = r'result-vis/'  # output directory for the HTML files below

optuna.visualization.plot_contour(study)
# if inline rendering does not work, write the plot to an HTML file instead:
graph_cout = optuna.visualization.plot_contour(study, params=['alpha', 'max_iter'])
plotly.offline.plot(graph_cout, filename=vis_path + 'graph_cout.html')

plot_optimization_history(study)
# if inline rendering does not work:
history = plot_optimization_history(study)
plotly.offline.plot(history, filename=vis_path + 'history.html')

plot_intermediate_values(study)
# if inline rendering does not work:
intermed = plot_intermediate_values(study)
plotly.offline.plot(intermed, filename=vis_path + 'intermed.html')

plot_slice(study, params=['alpha', 'max_iter', 'loss'])
# if inline rendering does not work:
slices = plot_slice(study)
plotly.offline.plot(slices, filename=vis_path + 'slices.html')

plot_parallel_coordinate(study, params=['alpha', 'max_iter', 'loss'])
# if inline rendering does not work:
paraller = plot_parallel_coordinate(study)
plotly.offline.plot(paraller, filename=vis_path + 'paraller.html')
```
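The torch/torchvision imports at the top of this block are never exercised by the scikit-learn snippets above. Here is a minimal sketch of the same suggest/report/prune pattern applied to a PyTorch classifier on FashionMNIST; the network size, epoch count, batch size, and trial count are illustrative assumptions:

```python
# a minimal sketch of an Optuna objective for a PyTorch classifier on FashionMNIST;
# relies on the torch/optuna imports at the top of this section
def objective(trial):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # search space: number of hidden units, learning rate, optimizer
    n_units = trial.suggest_int("n_units", 32, 256)
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])

    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(28 * 28, n_units),
        nn.ReLU(),
        nn.Linear(n_units, 10),
    ).to(device)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    transform = transforms.ToTensor()
    train_ds = datasets.FashionMNIST(".", train=True, download=True, transform=transform)
    valid_ds = datasets.FashionMNIST(".", train=False, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=128)

    for epoch in range(5):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            F.cross_entropy(model(x), y).backward()
            optimizer.step()
        # report validation accuracy as the intermediate value
        model.eval()
        correct = 0
        with torch.no_grad():
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                correct += (model(x).argmax(dim=1) == y).sum().item()
        accuracy = correct / len(valid_ds)
        trial.report(accuracy, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return accuracy

study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=10)
```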
NVIDIA NeMo: Hyperparameter Optimization Framework for Large Models
- User guide: NVIDIA NeMo user guide