PaddleVideo：Squeeze Time算法移植

参考
PaddleVideo/docs/zh-CN/contribute/add_new_algorithm.md at develop · PaddlePaddle/PaddleVideo · GitHub：https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/contribute/add_new_algorithm.md （项目简介：Awesome video understanding toolkits based on PaddlePaddle. It supports video data annotation tools, lightweight RGB and skeleton based action recognition model, practical applications for video tagging and sport action detection.）

1：添加backbone：（网络我自己砍了几刀，目的是想和ppTSM-v2做对比）

paddlevideo/modeling/backbones/squeezetime.py

复制代码

from __future__ import absolute_import, division, print_function

import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal,Constant
import paddle.nn.functional as F

from ..registry import BACKBONES


def get_inplanes():
    """Return the base channel width of each of the four residual stages."""
    base_width = 64
    # Each stage doubles the width of the previous one: 64, 128, 256, 512.
    return [base_width * (2 ** stage) for stage in range(4)]

class SpatialConv(nn.Layer):
    """
    Inter-temporal Object Interaction Module (IOI).

    Two branches are evaluated and multiplied element-wise:

    * ``short_conv``: one 3x3 convolution over the raw input.
    * ``glo_conv``: a 3x3 -> 7x7 -> 3x3 convolution stack ending in a sigmoid,
      fed with the input scaled by ``param``. A learnable positional embedding
      is added to the 16-channel feature map right before the 7x7 convolution.

    Args:
        dim_in: Number of input channels.
        dim_out: Number of output channels.
        pos_dim: Side length of the square positional embedding.
    """
    # Index inside ``glo_conv`` of the 7x7 convolution; the positional
    # embedding is injected immediately before this layer runs.
    _POS_EMBED_INDEX = 3

    def __init__(self, dim_in, dim_out, pos_dim=7):
        super(SpatialConv, self).__init__()

        self.short_conv = nn.Conv2D(dim_in, dim_out, kernel_size=3, stride=1, padding=1, groups=1)

        gate_layers = [
            nn.Conv2D(dim_in, 16, kernel_size=3, stride=1, padding=1, groups=1),
            nn.BatchNorm2D(16),
            nn.ReLU(),
            nn.Conv2D(16, 16, kernel_size=7, stride=1, padding=3),
            nn.BatchNorm2D(16),
            nn.ReLU(),
            nn.Conv2D(16, dim_out, kernel_size=3, stride=1, padding=1, groups=1),
            nn.Sigmoid(),
        ]
        self.glo_conv = nn.Sequential(*gate_layers)

        self.pos_embed = self.create_parameter(
            shape=[1, 16, pos_dim, pos_dim],
            default_initializer=nn.initializer.KaimingNormal())

    def _pos_embed_for(self, feat):
        """Return the positional embedding, resized to ``feat``'s H x W if needed."""
        _, _, height, width = feat.shape
        same_size = self.pos_embed.shape[2] == height and self.pos_embed.shape[3] == width
        if same_size:
            return self.pos_embed
        return F.interpolate(self.pos_embed, size=(height, width), mode='bilinear', align_corners=True)

    def forward(self, x, param):
        """
        Args:
            x: Input feature map.
            param: Multiplicative gate applied to ``x`` before the gating
                branch (broadcast by the ``*`` operator).

        Returns:
            ``short_conv(x)`` multiplied by the sigmoid output of the gating branch.
        """
        local_out = self.short_conv(x)
        gate = x * param

        for idx in range(len(self.glo_conv)):
            if idx == self._POS_EMBED_INDEX:
                gate = gate + self._pos_embed_for(gate)
            gate = self.glo_conv[idx](gate)

        return local_out * gate

class Conv2d(nn.Layer):
    """
    Channel-Time Learning Module (CTL).

    A channel gate is computed from the globally average-pooled input. The
    gated input feeds ``temporal_conv``; the raw input together with the gate
    feeds ``spatial_conv`` (a ``SpatialConv``). The two results are summed.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Kernel size of ``temporal_conv``.
        stride: Recorded on the instance only; ``temporal_conv`` is always
            built with stride 1.
        padding: Padding of ``temporal_conv``.
        dilation: Dilation of ``temporal_conv``.
        groups: Groups of ``temporal_conv``.
        bias: Forwarded as ``bias_attr`` of ``temporal_conv``.
        padding_mode: Padding mode of ``temporal_conv``.
        pos_dim: Positional-embedding size handed to ``SpatialConv``.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = 'zeros',
        pos_dim=7):
        super(Conv2d, self).__init__()

        self.stride = stride

        # Squeeze-and-gate: pool to 1x1, then two 1x1 convs ending in a sigmoid.
        gate_layers = [
            nn.AdaptiveAvgPool2D((1, 1)),
            nn.Conv2D(in_channels, in_channels, 1, stride=1, padding=0, bias_attr=False),
            nn.BatchNorm2D(in_channels),
            nn.ReLU(),
            nn.Conv2D(in_channels, in_channels, 1, bias_attr=False),
            nn.Sigmoid(),
        ]
        self.param_conv = nn.Sequential(*gate_layers)

        self.temporal_conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias_attr=bias,
            padding_mode=padding_mode,
        )

        self.spatial_conv = SpatialConv(dim_in=in_channels, dim_out=out_channels, pos_dim=pos_dim)

    def forward(self, x):
        """Return ``temporal_conv(gate * x) + spatial_conv(x, gate)``."""
        gate = self.param_conv(x)
        temporal_out = self.temporal_conv(gate * x)
        spatial_out = self.spatial_conv(x, gate)
        return temporal_out + spatial_out

def conv3x3x3(in_planes, out_planes, stride=1, pos_dim=7):
    """
    Build a bias-free CTL ``Conv2d`` layer.

    Despite the name, the layer is created with ``kernel_size=1`` and no
    padding. ``stride`` is forwarded to ``Conv2d``, which only stores it.
    """
    return Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=0,
        bias=False,
        pos_dim=pos_dim,
    )

def conv1x1x1(in_planes, out_planes, stride=1):
    """Build a plain bias-free 1x1 ``nn.Conv2D`` with the given stride."""
    pointwise = nn.Conv2D(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias_attr=False,
    )
    return pointwise

class BasicBlock(nn.Layer):
    """
    Channel-Time Learning (CTL) Block made of two CTL convolutions.

    Args:
        in_planes: Number of input channels.
        planes: Number of output channels (``expansion`` is 1).
        stride: Any value other than 1 enables a fixed depthwise 2x2, stride-2
            downsample in front of the block.
        shortcut_conv: Optional layer applied to the (downsampled) input to
            produce the residual; identity when None.
        pos_dim: Positional-embedding size used by the second convolution.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, shortcut_conv=None, pos_dim=7):
        super().__init__()

        # NOTE(review): conv1 is built without pos_dim, so it uses
        # conv3x3x3's default (7) while conv2 uses the value given here.
        # Confirm this asymmetry is intended.
        self.conv1 = conv3x3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm2D(planes)
        self.relu = nn.ReLU()

        self.conv2 = conv3x3x3(planes, planes, pos_dim=pos_dim)
        self.bn2 = nn.BatchNorm2D(planes)

        self.shortcut_conv = shortcut_conv
        self.stride = stride

        has_downsample = stride != 1
        if has_downsample:
            depthwise = nn.Conv2D(in_planes, in_planes, kernel_size=2, stride=2, groups=in_planes)
            self.downsample = nn.Sequential(depthwise, nn.BatchNorm2D(in_planes))

    def forward(self, x):
        """Downsample if configured, run both convolutions, add the residual, apply ReLU."""
        if self.stride != 1:
            x = self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # The residual is taken after the optional downsample, so shapes line up.
        identity = x if self.shortcut_conv is None else self.shortcut_conv(x)

        out += identity
        return self.relu(out)

class Bottleneck(nn.Layer):
    """
    Channel-Time Learning (CTL) Block in bottleneck form:
    1x1 reduce -> CTL convolution -> 1x1 expand (by ``expansion``).

    Args:
        in_planes: Number of input channels.
        planes: Bottleneck width; the block outputs ``planes * expansion`` channels.
        stride: Any value other than 1 enables a fixed depthwise 2x2, stride-2
            downsample in front of the block.
        shortcut_conv: Optional layer applied to the (downsampled) input to
            produce the residual; identity when None.
        pos_dim: Positional-embedding size used by the CTL convolution.
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, shortcut_conv=None, pos_dim=7):
        super().__init__()
        out_planes = planes * self.expansion

        self.conv1 = conv1x1x1(in_planes, planes)
        self.bn1 = nn.BatchNorm2D(planes)

        self.conv2 = conv3x3x3(planes, planes, pos_dim=pos_dim)
        self.bn2 = nn.BatchNorm2D(planes)

        self.conv3 = conv1x1x1(planes, out_planes)
        self.bn3 = nn.BatchNorm2D(out_planes)

        self.relu = nn.ReLU()

        self.shortcut_conv = shortcut_conv
        self.stride = stride

        has_downsample = stride != 1
        if has_downsample:
            depthwise = nn.Conv2D(in_planes, in_planes, kernel_size=2, stride=2, groups=in_planes)
            self.downsample = nn.Sequential(depthwise, nn.BatchNorm2D(in_planes))

    def forward(self, x):
        """Downsample if configured, run the three convolutions, add the residual, apply ReLU."""
        if self.stride != 1:
            x = self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # The residual is taken after the optional downsample, so shapes line up.
        identity = x if self.shortcut_conv is None else self.shortcut_conv(x)

        out += identity
        return self.relu(out)

class ResNet(nn.Layer):
    """
    SqueezeTime backbone: a 2D ResNet whose stem consumes the frames of a clip
    stacked along the channel axis.

    Args:
        block: Residual block class (``BasicBlock`` or ``Bottleneck``); its
            ``expansion`` attribute determines each stage's output width.
        layers: Number of blocks in each of the four stages.
        block_inplanes: Base channel width of each stage before widening.
        n_input_channels: Channels expected by the stem convolution, i.e.
            frames-per-clip times channels-per-frame.
        no_max_pool: Skip the stem max-pooling when True.
        shortcut_type: Passed to ``_make_layer``, which does not use it.
        widen_factor: Multiplier applied to every entry of ``block_inplanes``.
        dropout: Stored on the instance; not applied anywhere in this class.
        freeze_bn: When True, ``train()`` keeps every ``BatchNorm2D`` in eval
            mode and stops gradients of its weight and bias.
        spatial_stride: Stride given to the first block of each stage.
        pos_dim: Positional-embedding size used in each stage.
    """
    def __init__(self,
                 block,
                 layers,
                 block_inplanes,
                 n_input_channels=3,
                 no_max_pool=False,
                 shortcut_type='B',
                 widen_factor=1.0,
                 dropout=0.2,
                 freeze_bn=False,
                 spatial_stride=(1, 2, 2, 2),
                 pos_dim=(64, 32, 16, 8)):
        super().__init__()

        self.freeze_bn = freeze_bn
        block_inplanes = [int(x * widen_factor) for x in block_inplanes]

        self.in_planes = block_inplanes[0]
        self.no_max_pool = no_max_pool
        self.dropout = dropout
        # Remembered so forward() can fold per-frame inputs into clips.
        self.n_input_channels = n_input_channels

        self.conv1 = nn.Conv2D(n_input_channels,
                               self.in_planes,
                               kernel_size=5,
                               stride=2,
                               padding=2,
                               groups=1,
                               bias_attr=False)

        self.bn1 = nn.BatchNorm2D(self.in_planes)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, block_inplanes[0], layers[0],
                                       shortcut_type, stride=spatial_stride[0], pos_dim=pos_dim[0])

        self.layer2 = self._make_layer(block, block_inplanes[1], layers[1],
                                       shortcut_type, stride=spatial_stride[1], pos_dim=pos_dim[1])

        self.layer3 = self._make_layer(block, block_inplanes[2], layers[2],
                                       shortcut_type, stride=spatial_stride[2], pos_dim=pos_dim[2])

        self.layer4 = self._make_layer(block, block_inplanes[3], layers[3],
                                       shortcut_type, stride=spatial_stride[3], pos_dim=pos_dim[3])

    def _downsample_basic_block(self, x, planes, stride):
        """
        Parameter-free shortcut: subsample ``x`` spatially with ``stride`` and
        zero-pad the channel axis up to ``planes``.

        Not called anywhere in this class.
        """
        out = F.avg_pool2d(x, kernel_size=1, stride=stride)
        # Create the padding with the input dtype so the concat below also
        # works for non-default dtypes. The previous isinstance(out, CUDAPlace)
        # check on a Tensor was always False and has been dropped.
        zero_pads = paddle.zeros(
            [out.shape[0], planes - out.shape[1], out.shape[2], out.shape[3]],
            dtype=out.dtype)

        return paddle.concat([out, zero_pads], axis=1)

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1, pos_dim=7):
        """
        Build one stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and, when the channel count
        changes, a 1x1 conv + BatchNorm shortcut. ``shortcut_type`` is unused.
        Updates ``self.in_planes`` to the stage's output width.
        """
        out_planes = planes * block.expansion

        shortcut = None
        if self.in_planes != out_planes:
            shortcut = nn.Sequential(
                conv1x1x1(self.in_planes, out_planes, stride=1),
                nn.BatchNorm2D(out_planes)
            )

        layers = [
            block(in_planes=self.in_planes,
                  planes=planes,
                  stride=stride, shortcut_conv=shortcut, pos_dim=pos_dim)
        ]

        self.in_planes = out_planes

        for _ in range(1, blocks):
            layers.append(block(self.in_planes, planes, pos_dim=pos_dim))

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Args:
            x: Either per-frame images ``[N * T, C, H, W]`` (folded here into
               ``[N, T * C, H, W]``), an already stacked batch
               ``[N, n_input_channels, H, W]``, or one stacked sample
               ``[n_input_channels, H, W]``.

        Returns:
            The feature map produced by the last stage.
        """
        if len(x.shape) == 3:
            x = paddle.unsqueeze(x, axis=0)
        _, C, H, W = x.shape

        # Fold consecutive frames into the channel axis until the stem's
        # expected channel count is reached. Using -1 for the batch axis avoids
        # the previous hard-coded 16 and float division, and leaves inputs
        # that are already stacked untouched.
        if C != self.n_input_channels:
            x = x.reshape([-1, self.n_input_channels, H, W])

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        if not self.no_max_pool:
            x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x

    def train(self, mode=True):
        """
        Switch to training (``mode=True``) or evaluation mode, then re-freeze
        BatchNorm layers when ``freeze_bn`` is set.

        Paddle's ``Layer.train()`` takes no ``mode`` argument, so the mode is
        dispatched to ``train()`` / ``eval()`` here instead of being forwarded.

        NOTE(review): a parent layer's ``train()`` may set the ``training``
        flag of sublayers directly without calling this override; confirm this
        method is actually reached if ``freeze_bn`` is relied upon.
        """
        if mode:
            super(ResNet, self).train()
        else:
            super(ResNet, self).eval()

        if not self.freeze_bn:
            return

        print("Freezing Mean/Var of BatchNorm2D.")
        print("Freezing Weight/Bias of BatchNorm2D.")
        for m in self.sublayers():
            if isinstance(m, nn.BatchNorm2D):
                m.eval()
                m.weight.stop_gradient = True
                m.bias.stop_gradient = True

def SqueezeTime_model(**kwargs):
    """Build a ``ResNet`` of ``Bottleneck`` blocks, two per stage; ``kwargs`` go to ``ResNet``."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(Bottleneck, blocks_per_stage, get_inplanes(), **kwargs)


@BACKBONES.register()
def SqueezeTime(pretrained=None, use_ssld=False, **kwargs):
    """
    Build SqueezeTime Model

    Args:
        pretrained: Accepted for config compatibility; no weights are loaded
            by this function.
        use_ssld: Accepted for config compatibility; not used.
        **kwargs: Overrides for the default ``ResNet`` settings below. They
            were previously discarded silently; they are now forwarded, so an
            unknown key raises ``TypeError`` instead of being ignored.

    Returns:
        The constructed backbone.
    """
    # Defaults: half-width network, 48 input channels (16 RGB frames stacked).
    model_cfg = dict(
        widen_factor=0.5,
        dropout=0.5,
        n_input_channels=48,
        freeze_bn=False,
        spatial_stride=[1, 2, 2, 2],
        pos_dim=[64, 32, 16, 8],
    )
    model_cfg.update(kwargs)
    return SqueezeTime_model(**model_cfg)

2：导入backbone：

paddlevideo/modeling/backbones/__init__.py

复制代码

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .actbert import BertForMultiModalPreTraining
from .adds import ADDS_DepthNet
from .agcn import AGCN
from .asrf import ASRF
from .bmn import BMN
from .cfbi import CFBI
from .movinet import MoViNet
from .ms_tcn import MSTCN
from .resnet import ResNet
from .resnet_slowfast import ResNetSlowFast
from .resnet_slowfast_MRI import ResNetSlowFast_MRI
from .resnet_tsm import ResNetTSM
from .resnet_tsm_MRI import ResNetTSM_MRI
from .resnet_tsn_MRI import ResNetTSN_MRI
from .resnet_tweaks_tsm import ResNetTweaksTSM
from .resnet_tweaks_tsn import ResNetTweaksTSN
from .stgcn import STGCN
from .swin_transformer import SwinTransformer3D
from .transnetv2 import TransNetV2
from .vit import VisionTransformer
from .vit_tweaks import VisionTransformer_tweaks
from .ms_tcn import MSTCN
from .asrf import ASRF
from .resnet_tsn_MRI import ResNetTSN_MRI
from .resnet_tsm_MRI import ResNetTSM_MRI
from .resnet_slowfast_MRI import ResNetSlowFast_MRI
from .cfbi import CFBI
from .ctrgcn import CTRGCN
from .agcn2s import AGCN2s
from .movinet import MoViNet
from .resnet3d_slowonly import ResNet3dSlowOnly
from .toshift_vit import TokenShiftVisionTransformer
from .pptsm_mv2 import PPTSM_MobileNetV2
from .pptsm_mv3 import PPTSM_MobileNetV3
from .pptsm_v2 import PPTSM_v2
from .yowo import YOWO
from .squeezetime import SqueezeTime

# Names exported by the backbones package. 'SqueezeTime' (last entry) is the
# backbone added by this port; it must also be imported above.
__all__ = [
    'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',
    'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',
    'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',
    'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',
    'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',
    'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',
    'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO', 'SqueezeTime'
]

3：添加head：

paddlevideo/modeling/heads/i2d_head.py

复制代码

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
from paddle import ParamAttr

from ..registry import HEADS
from ..weight_init import weight_init_
from .base import BaseHead


@HEADS.register()
class I2DHead(BaseHead):
    """Classification head for I2D (2D feature maps).

    Pools the spatial dimensions to 1x1, optionally applies dropout, and
    classifies the flattened vector with one fully connected layer.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        loss_cfg (dict): Config for building loss.
            Default: dict(name='CrossEntropyLoss')
        spatial_type (str): 'avg' selects adaptive average pooling; any other
            value selects adaptive max pooling. Default: 'avg'.
        drop_ratio (float): Probability of dropout layer; 0 disables dropout.
            Default: 0.5.
        std (float): Stored as ``init_std``; this class defines no weight
            initialisation routine that reads it. Default: 0.01.
        kwargs (dict, optional): Any keyword argument to be used to initialize
            the head.
    """
    def __init__(self,
                 num_classes,
                 in_channels,
                 loss_cfg=dict(name='CrossEntropyLoss'),
                 spatial_type='avg',
                 drop_ratio=0.5,
                 std=0.01,
                 **kwargs):

        super().__init__(num_classes, in_channels, loss_cfg, **kwargs)

        self.spatial_type = spatial_type
        self.dropout_ratio = drop_ratio
        self.init_std = std

        use_dropout = self.dropout_ratio != 0
        self.dropout = nn.Dropout(p=self.dropout_ratio) if use_dropout else None

        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)

        # The attribute keeps the name ``avg_pool`` even when max pooling is selected.
        pool_cls = nn.AdaptiveAvgPool2D if self.spatial_type == 'avg' else nn.AdaptiveMaxPool2D
        self.avg_pool = pool_cls((1, 1))

    def forward(self, x, num_segs = None):
        """Defines the computation performed at every call.

        Args:
            x (Tensor): The input feature map (assumed 4-D, since 2-D pooling
                is applied - TODO confirm against the backbone output).
            num_segs: Unused; kept for interface compatibility.

        Returns:
            Tensor: The classification scores for input samples.
        """
        feat = x

        # Collapse the spatial dimensions to 1x1.
        if self.avg_pool is not None:
            feat = self.avg_pool(feat)

        if self.dropout is not None:
            feat = self.dropout(feat)

        # Flatten everything after the batch axis for the linear classifier.
        batch_size = feat.shape[0]
        feat = paddle.reshape(feat, [batch_size, -1])

        return self.fc_cls(feat)

4：导入head：

paddlevideo/modeling/heads/__init__.py

复制代码

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .adds_head import AddsHead
from .asrf_head import ASRFHead
from .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead
from .base import BaseHead
from .bbox_head import BBoxHeadAVA
from .cfbi_head import CollaborativeEnsemblerMS
from .i3d_head import I3DHead
from .movinet_head import MoViNetHead
from .ms_tcn_head import MSTCNHead
from .pptimesformer_head import ppTimeSformerHead
from .pptsm_head import ppTSMHead
from .pptsn_head import ppTSNHead
from .roi_head import AVARoIHead
from .single_straight3d import SingleRoIExtractor3D
from .slowfast_head import SlowFastHead
from .stgcn_head import STGCNHead
from .timesformer_head import TimeSformerHead
from .transnetv2_head import TransNetV2Head
from .tsm_head import TSMHead
from .tsn_head import TSNHead
from .ms_tcn_head import MSTCNHead
from .asrf_head import ASRFHead
from .ctrgcn_head import CTRGCNHead
from .movinet_head import MoViNetHead
from .agcn2s_head import AGCN2sHead
from .token_shift_head import TokenShiftHead
from .i2d_head import I2DHead

# Names exported by the heads package. 'I2DHead' (last entry) is the head
# added by this port; it must also be imported above.
__all__ = [
    'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead',
    'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head',
    'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead',
    'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead',
    'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead',
    'AGCN2sHead', 'I2DHead'
]

5：训练配置文件：

configs/recognition/pptsm/v2/md_ppsqt_16frames_uniform.yaml

复制代码

MODEL: #MODEL field
    framework: "Recognizer2D" #Mandatory, indicates the type of network, associated with 'paddlevideo/modeling/framework/'.
    backbone: #Mandatory, indicates the type of backbone, associated with 'paddlevideo/modeling/backbones/'.
        name: "SqueezeTime" #Mandatory, the name of the backbone.
    head:
        name: "I2DHead" #Mandatory, indicates the type of head, associated with 'paddlevideo/modeling/heads'
        #pretrained: "" #Optional, pretrained model path.
        num_classes: 2 #Number of classes to predict.
        in_channels: 1024 #Must equal the backbone output width: 512 * widen_factor (0.5) * Bottleneck expansion (4) = 1024.

DATASET: #DATASET field
    batch_size: 16  #Mandatory, batch size
    num_workers: 4 #Mandatory, the number of subprocesses on each GPU.
    train:
        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, train data root path
        file_path: "/home/mnt/sdd/Data/data_fights/train_list.txt" #Mandatory, train data index file path
        suffix: 'img_{:06}.jpg'
    valid:
        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, valid data root path
        file_path: "/home/mnt/sdd/Data/data_fights/test_list.txt" #Mandatory, valid data index file path (same list as the test split)
        suffix: 'img_{:06}.jpg'
    test:
        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, test data root path
        file_path: "/home/mnt/sdd/Data/data_fights/test_list.txt" #Mandatory, test data index file path
        suffix: 'img_{:06}.jpg'

PIPELINE: #PIPELINE field
    train: #Mandatory, indicates the pipeline that processes the training data, associated with 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16 #16 frames x 3 channels = 48, the n_input_channels the SqueezeTime backbone is built with.
            seg_len: 1
            valid_mode: False
        transform: #Mandatory, image transform operators
            - Scale:
                short_size: 256
            - MultiScaleCrop:
                target_size: 256
            - RandomCrop:
                target_size: 224
            - RandomFlip:
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]
    valid: #Mandatory, indicates the pipeline that processes the validation data, associated with 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: True
        transform:
            - Scale:
                short_size: 256
            - CenterCrop:
                target_size: 224
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]
    test:  #Mandatory, indicates the pipeline that processes the test data, associated with 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: True
        transform:
            - Scale:
                short_size: 256
            - CenterCrop:
                target_size: 224
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]

OPTIMIZER: #OPTIMIZER field
  name: 'Momentum'
  momentum: 0.9
  learning_rate:
    iter_step: True
    name: 'CustomWarmupCosineDecay'
    max_epoch: 120 #Same value as the top-level `epochs`; presumably the two must stay in sync - confirm.
    warmup_epochs: 10
    warmup_start_lr: 0.005
    cosine_base_lr: 0.01
  weight_decay:
    name: 'L2'
    value: 1e-4
  use_nesterov: True

MIX:
    name: "Mixup"
    alpha: 0.2


METRIC:
    name: 'CenterCropMetric'

INFERENCE:
    #NOTE(review): 'ppSQT_Inference_helper' is not defined in any file shown in this guide; confirm it is registered before running inference/export.
    name: 'ppSQT_Inference_helper'
    num_seg: 16
    target_size: 224

model_name: "ppSQT"
log_interval: 10 #Optional, the interval of logger, default:10
epochs: 120  #Mandatory, total epoch
log_level: "INFO" #Optional, the logger level. default: "INFO"

6：训练：

复制代码

# Multi-GPU training on GPUs 0 and 1, with validation enabled (--validate).
# The config path matches the file created in step 5.
export CUDA_VISIBLE_DEVICES=0,1
python -B -m paddle.distributed.launch --gpus="0,1"  --log_dir=./log/log_sqt_frame_16  main.py  --validate -c configs/recognition/pptsm/v2/md_ppsqt_16frames_uniform.yaml

7：结果：精度比 ppTSM-v2 低约 8 个百分点，可能是因为没有使用预训练权重。