PyTorch and weight decay (L2 norm)
2022-07-19 10:50:00 【phac123】
Theory
Recall the linear regression loss over $n$ examples:

$$l(w_1, w_2, b)=\frac{1}{n}\sum_{i=1}^n\frac{1}{2}\left(x_1^{(i)}w_1+x_2^{(i)}w_2+b-y^{(i)}\right)^2$$
With the penalty term added, the loss is as follows:
$$l(w_1, w_2, b)+\frac{\lambda}{2n}\|w\|^2$$
Here the hyperparameter $\lambda>0$. When the weight parameters are all 0, the penalty is at its minimum. When $\lambda$ is large, the penalty term takes up a large proportion of the loss function, which usually drives the elements of the learned weight parameters closer to 0. When $\lambda$ is set to 0, the penalty term has no effect at all. In the formula above, the squared $L_2$ norm $\|w\|^2$ expands to $w_1^2+w_2^2$. With the $L_2$ norm penalty term, in minibatch stochastic gradient descent the update rule for the weights $w_1, w_2$ from the linear regression section becomes (with learning rate $\eta$ and minibatch $\mathcal{B}$):

$$w_1 \leftarrow \left(1-\frac{\eta\lambda}{|\mathcal{B}|}\right)w_1-\frac{\eta}{|\mathcal{B}|}\sum_{i\in\mathcal{B}}x_1^{(i)}\left(x_1^{(i)}w_1+x_2^{(i)}w_2+b-y^{(i)}\right)$$

$$w_2 \leftarrow \left(1-\frac{\eta\lambda}{|\mathcal{B}|}\right)w_2-\frac{\eta}{|\mathcal{B}|}\sum_{i\in\mathcal{B}}x_2^{(i)}\left(x_1^{(i)}w_1+x_2^{(i)}w_2+b-y^{(i)}\right)$$
As can be seen, $L_2$ norm regularization first multiplies the weights $w_1, w_2$ by a number less than 1, and then subtracts the gradient of the loss without the penalty term. $L_2$ norm regularization is therefore also called weight decay. By penalizing model parameters with large absolute values, weight decay adds a constraint on the model to be learned, which may help against overfitting. In practice, we sometimes also add the sum of squares of the bias elements to the penalty term.
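To make this concrete, here is a minimal numeric sketch (the values of eta and lambd and the two-sample minibatch are made up for illustration): one SGD step on the penalized loss yields the same weights as first shrinking $w$ by $1-\eta\lambda/|\mathcal{B}|$ and then taking the unpenalized gradient step.

import torch

eta, lambd = 0.1, 0.5                       # made-up learning rate and penalty strength
X = torch.tensor([[1.0, 2.0], [3.0, 4.0]])  # a minibatch of |B| = 2 samples with 2 features
y = torch.tensor([[1.0], [2.0]])
w = torch.tensor([[0.3], [-0.2]], requires_grad=True)
b = torch.zeros(1)
B = X.shape[0]

# One gradient step on the penalized loss: l + (lambd / (2|B|)) * ||w||^2
loss = ((X @ w + b - y) ** 2 / 2).mean() + lambd / (2 * B) * (w ** 2).sum()
loss.backward()
w_penalized = w.detach() - eta * w.grad

# Shrink w first, then take the gradient step for the unpenalized loss
w2 = w.detach().clone().requires_grad_(True)
((X @ w2 + b - y) ** 2 / 2).mean().backward()
w_decayed = (1 - eta * lambd / B) * w2.detach() - eta * w2.grad

print(torch.allclose(w_penalized, w_decayed))  # True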
Implementing high-dimensional linear regression
Overview
Below, we take high-dimensional linear regression as an example to introduce an overfitting problem, and then use weight decay to deal with it. Let the dimension of the data sample features be $p$. For any sample in the training or test data set with features $x_1, x_2, \ldots, x_p$, we generate its label with the following linear function:
$$y=0.05+\sum_{i=1}^p 0.01x_i+\epsilon$$
where the noise term $\epsilon$ follows a normal distribution with mean 0 and standard deviation 0.01. To make overfitting easy to observe, we consider a high-dimensional linear regression problem, setting the dimension to $p=200$; at the same time, we deliberately keep the number of training samples low, e.g., 20.
d2lzh_pytorch.py
import random
import sys
import time

from IPython import display
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
def use_svg_display():
    # Display plots in vector (SVG) format
    display.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):
    use_svg_display()
    # Set the figure size
    plt.rcParams['figure.figsize'] = figsize
'''Given batch_size, features and labels, shuffle the data and yield minibatches of the specified size'''
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):  # range(start, stop, step)
        # The last batch may be smaller than batch_size
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])
        yield features.index_select(0, j), labels.index_select(0, j)
'''The linear regression model'''
def linreg(X, w, b):
    return torch.mm(X, w) + b

'''The squared loss for linear regression'''
def squared_loss(y_hat, y):
    return (y_hat - y.view(y_hat.size())) ** 2 / 2
'''Optimization algorithm for linear regression: minibatch stochastic gradient descent'''
def sgd(params, lr, batch_size):
    for param in params:
        # Update param.data in place so the step is not tracked by autograd
        param.data -= lr * param.grad / batch_size
'''Fashion-MNIST: convert numeric labels into the corresponding text labels'''
def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]
'''Draw multiple images and their corresponding labels in one row'''
def show_fashion_mnist(images, labels):
    use_svg_display()
    # The _ denotes a variable we ignore (do not use)
    _, figs = plt.subplots(1, len(images), figsize=(12, 12))
    for f, img, lbl in zip(figs, images, labels):
        f.imshow(img.view((28, 28)).numpy())
        f.set_title(lbl)
        f.axes.get_xaxis().set_visible(False)
        f.axes.get_yaxis().set_visible(False)
    plt.show()
'''Download and read the Fashion-MNIST data set; returns the train_iter and test_iter variables'''
def load_data_fashion_mnist(batch_size):
    mnist_train = torchvision.datasets.FashionMNIST(root='Datasets/FashionMNIST', train=True, download=True,
                                                    transform=transforms.ToTensor())
    mnist_test = torchvision.datasets.FashionMNIST(root='Datasets/FashionMNIST', train=False, download=True,
                                                   transform=transforms.ToTensor())
    if sys.platform.startswith('win'):
        num_workers = 0  # 0 means no extra worker processes are used to speed up data loading
    else:
        num_workers = 4
    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter
'''Evaluate the accuracy of model net on the data set data_iter'''
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n
'''Train a model (from the softmax regression chapter)'''
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params=None, lr=None, optimizer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            # Clear the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            if optimizer is None:
                sgd(params, lr, batch_size)
            else:
                optimizer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
'''Flatten the input x to shape (batch_size, -1)'''
class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()

    def forward(self, x):
        return x.view(x.shape[0], -1)
'''Plot training and testing loss curves, with a logarithmic scale on the y axis'''
def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()
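Before moving on, a quick smoke test of data_iter (a hypothetical toy example, not part of the original post): with 10 samples and batch_size=4 it yields two full batches and one smaller final batch.

import torch
import d2lzh_pytorch as d2l

features = torch.randn(10, 2)
labels = torch.randn(10, 1)
for X, y in d2l.data_iter(batch_size=4, features=features, labels=labels):
    # X: two batches of 4 rows, then a final batch of 2 rows; y matches in the first dimension
    print(X.shape, y.shape)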
Manual implementation
main.py
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
# Generate data set
n_train, n_test, num_inputs = 20, 100, 200
true_w, true_b = torch.ones(num_inputs, 1) * 0.01, 0.05
features = torch.randn((n_train + n_test, num_inputs))
labels = torch.matmul(features, true_w) + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
train_features, test_features = features[:n_train, :], features[n_train:, :]
train_labels, test_labels = labels[:n_train], labels[n_train:]
# Initialize parameters
def init_params():
    w = torch.randn((num_inputs, 1), requires_grad=True)
    b = torch.zeros(1, requires_grad=True)
    return [w, b]
# Define the L2 norm penalty term
def l2_penalty(w):
    return (w ** 2).sum() / 2
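# Note: the division by 2 makes the gradient of the penalty term simply w,
# so the gradient of lambd * l2_penalty(w) with respect to w is lambd * w.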
# Define training and testing
batch_size, num_epochs, lr = 1, 100, 0.003
net, loss = d2l.linreg, d2l.squared_loss
dataset = torch.utils.data.TensorDataset(train_features, train_labels)
train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle = True)
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            # Add the L2 norm penalty term
            l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l = l.sum()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(loss(net(train_features, w, b), train_labels).mean().item())
        test_ls.append(loss(net(test_features, w, b), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
Observing overfitting
Next, let's train and test the high-dimensional linear regression model. When $\lambda$ is set to 0, we do not use weight decay. The result: the training error is much smaller than the error on the test set. This is a typical case of overfitting.
fit_and_plot(lambd=0)
Output :
L2 norm of w: 14.296510696411133
Using weight decay
Now let's use weight decay. We can see that although the training error increased, the error on the test set decreased, so overfitting is alleviated to some extent. In addition, the $L_2$ norm of the weight parameters is smaller than without weight decay; the weight parameters are now closer to 0.
fit_and_plot(lambd=3)
Output :
Concise implementation
Here we specify the weight decay hyperparameter directly through the weight_decay parameter when constructing the optimizer instance. By default, PyTorch decays both weights and biases. We can construct separate optimizer instances for the weights and the biases, so that only the weights are decayed.
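As a side note, a minimal sketch of an equivalent alternative (it reuses the net, lr and wd names from the script below): PyTorch parameter groups let a single optimizer apply weight_decay to the weights only.

# One optimizer with two parameter groups: weight decay is applied to
# net.weight in the first group, while net.bias in the second group is not decayed.
optimizer = torch.optim.SGD([
    {'params': [net.weight], 'weight_decay': wd},
    {'params': [net.bias]},
], lr=lr)

With this variant, a single optimizer.zero_grad() / optimizer.step() pair would replace the paired calls in the training loop below.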
main.py
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
# Generate data set
n_train, n_test, num_inputs = 20, 100, 200
true_w, true_b = torch.ones(num_inputs, 1) * 0.01, 0.05
features = torch.randn((n_train + n_test, num_inputs))
labels = torch.matmul(features, true_w) + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
train_features, test_features = features[:n_train, :], features[n_train:, :]
train_labels, test_labels = labels[:n_train], labels[n_train:]
batch_size, num_epochs, lr = 1, 100, 0.003
loss = torch.nn.MSELoss()
dataset = torch.utils.data.TensorDataset(train_features, train_labels)
train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
def fit_and_plot_pytorch(wd):
    # Decay the weight parameter only. Weight parameter names generally end with 'weight'
    net = nn.Linear(num_inputs, 1)
    nn.init.normal_(net.weight, mean=0, std=1)
    nn.init.normal_(net.bias, mean=0, std=0.01)
    optimizer_w = torch.optim.SGD(params=[net.weight], lr=lr, weight_decay=wd)  # weight decay on the weights
    optimizer_b = torch.optim.SGD(params=[net.bias], lr=lr)  # no weight decay on the bias
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X), y).mean()
            optimizer_w.zero_grad()
            optimizer_b.zero_grad()
            l.backward()
            # Step both optimizers, updating the weights and the bias respectively
            optimizer_w.step()
            optimizer_b.step()
        train_ls.append(loss(net(train_features), train_labels).mean().item())
        test_ls.append(loss(net(test_features), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', net.weight.data.norm().item())
fit_and_plot_pytorch(0)
fit_and_plot_pytorch(3)
Similar to what we observed in the from-scratch experiment, using weight decay here also alleviates the overfitting problem to some extent.
fit_and_plot_pytorch(0)
Output :
L2 norm of w: 13.87320613861084
fit_and_plot_pytorch(3)
Output :
L2 norm of w: 0.046947550028562546
Summary
- Regularization adds a penalty term to the model's loss function so that the learned model parameters are smaller; it is a common way to deal with overfitting.
- Weight decay is equivalent to $L_2$ norm regularization, and it usually makes the elements of the learned weight parameters closer to 0.
- Weight decay can be specified through the optimizer's weight_decay hyperparameter.
- Multiple optimizer instances can be defined to apply different update rules to different model parameters.