init commit

commit c144fcd356
Date: 2025-09-05 10:30:26 +02:00
1526 changed files with 1115326 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
cmake_minimum_required(VERSION 3.15.6)
project(CMSISNN)
set(CMSIS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..")
option(BUILD_CMSIS_NN_FUNCTIONS "Build CMSIS-NN Source." ON)
if(BUILD_CMSIS_NN_FUNCTIONS)
add_subdirectory(Source)
endif()

View File

@@ -0,0 +1,169 @@
/******************************************************************************
* @file arm_nn_math_types.h
* @brief Compiler include and basic types
* @version V1.1.0
* @date 09 March 2022
* Target Processor: Cortex-M
******************************************************************************/
/*
* Copyright (c) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
Copied from CMSIS/DSP/arm_math_types.h and modified
*/
#ifndef _ARM_NN_MATH_TYPES_H_
#define _ARM_NN_MATH_TYPES_H_
/* DSP include for enum arm_status. */
#include "arm_math_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Compiler specific diagnostic adjustment */
#if defined(__CC_ARM)
#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#elif defined(__GNUC__)
#elif defined(__ICCARM__)
#elif defined(__TI_ARM__)
#elif defined(__CSMC__)
#elif defined(__TASKING__)
#elif defined(_MSC_VER)
#else
#error Unknown compiler
#endif
/* Included for intrinsics definitions */
#if defined(_MSC_VER)
#include <stdint.h>
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static __forceinline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static __inline
#endif
#ifndef __ALIGNED
#define __ALIGNED(x) __declspec(align(x))
#endif
#elif defined(__GNUC_PYTHON__)
#include <stdint.h>
#ifndef __ALIGNED
#define __ALIGNED(x) __attribute__((aligned(x)))
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static inline __attribute__((always_inline))
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#else
#include "cmsis_compiler.h"
#endif
#include <float.h>
#include <limits.h>
#include <math.h>
#include <string.h>
/* evaluate ARM DSP feature */
#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
#ifndef ARM_MATH_DSP
#define ARM_MATH_DSP 1
#endif
#endif
#if __ARM_FEATURE_MVE
#ifndef ARM_MATH_MVEI
#define ARM_MATH_MVEI
#endif
#endif
/* Compiler specific diagnostic adjustment */
#if defined(__CC_ARM)
#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#elif defined(__GNUC__)
// #pragma GCC diagnostic pop
#elif defined(__ICCARM__)
#elif defined(__TI_ARM__)
#elif defined(__CSMC__)
#elif defined(__TASKING__)
#elif defined(_MSC_VER)
#else
#error Unknown compiler
#endif
#ifdef __cplusplus
}
#endif
#if __ARM_FEATURE_MVE
#include <arm_mve.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Saturation limits for the q7/q15/q31 data types
*/
#define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL))
#define NN_Q15_MAX ((q15_t)(0x7FFF))
#define NN_Q7_MAX ((q7_t)(0x7F))
#define NN_Q31_MIN ((q31_t)(0x80000000L))
#define NN_Q15_MIN ((q15_t)(0x8000))
#define NN_Q7_MIN ((q7_t)(0x80))
/**
* @brief Error status returned by some functions in the library.
*/
typedef enum
{
ARM_CMSIS_NN_SUCCESS = 0, /**< No error */
ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */
ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */
} arm_cmsis_nn_status;
#ifdef __cplusplus
}
#endif
#endif /* ifndef _ARM_NN_MATH_TYPES_H_ */
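A minimal sketch (not part of the library) of how the saturation limits above are typically used: clamping a wider accumulator into the q7 range before narrowing. The helper name clamp_to_q7 is hypothetical.

#include "arm_nn_math_types.h"

/* Hypothetical helper: saturate a 32-bit accumulator to the q7 range. */
static q7_t clamp_to_q7(q31_t acc)
{
    if (acc > NN_Q7_MAX)
    {
        acc = NN_Q7_MAX;
    }
    else if (acc < NN_Q7_MIN)
    {
        acc = NN_Q7_MIN;
    }
    return (q7_t)acc;
}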

View File

@@ -0,0 +1,56 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_tables.h
* Description: Extern declaration for NN tables
*
* $Date: 17. August 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_NN_TABLES_H
#define _ARM_NN_TABLES_H
#include "arm_nn_math_types.h"
/**
* @brief tables for various activation functions
*
*/
extern const q15_t sigmoidTable_q15[256];
extern const q7_t sigmoidTable_q7[256];
extern const q7_t tanhTable_q7[256];
extern const q15_t tanhTable_q15[256];
/**
* @brief 2-way tables for various activation functions
*
* 2-way table: the L table covers values smaller than 1/4 and the H table
* covers the remaining, larger values.
* This is provided only for the q15_t version; the extra precision does
* not make sense for the q7_t type.
*/
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];
#endif /* _ARM_NN_TABLES_H */
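To make the table layout concrete, here is a hedged sketch of a direct q7 sigmoid lookup, matching the indexing used later in arm_nn_activations_direct_q7 and assuming int_width = 0 (so the shift amount is 3).

#include "arm_nn_tables.h"

/* Sketch: direct q7 sigmoid lookup for int_width = 0 (shift_size = 3).
   The cast to uint8_t wraps negative inputs into the upper half of the
   256-entry table, matching its two's-complement layout. */
static q7_t sigmoid_q7_lookup(q7_t in)
{
    return sigmoidTable_q7[(uint8_t)(in >> 3)];
}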

View File

@@ -0,0 +1,137 @@
/*
* Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_types.h
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: 22. February 2022
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#ifndef _ARM_NN_TYPES_H
#define _ARM_NN_TYPES_H
#include <stdint.h>
/** CMSIS-NN object to contain the width and height of a tile */
typedef struct
{
int32_t w; /**< Width */
int32_t h; /**< Height */
} cmsis_nn_tile;
/** CMSIS-NN object used for the function context. */
typedef struct
{
void *buf; /**< Pointer to a buffer needed for the optimization */
int32_t size; /**< Buffer size */
} cmsis_nn_context;
/** CMSIS-NN object to contain the dimensions of the tensors */
typedef struct
{
int32_t n; /**< Generic dimension to contain either the batch size or output channels.
Please refer to the function documentation for more information */
int32_t h; /**< Height */
int32_t w; /**< Width */
int32_t c; /**< Input channels */
} cmsis_nn_dims;
/** CMSIS-NN object for the per-channel quantization parameters */
typedef struct
{
int32_t *multiplier; /**< Multiplier values */
int32_t *shift; /**< Shift values */
} cmsis_nn_per_channel_quant_params;
/** CMSIS-NN object for the per-tensor quantization parameters */
typedef struct
{
int32_t multiplier; /**< Multiplier value */
int32_t shift; /**< Shift value */
} cmsis_nn_per_tensor_quant_params;
/** CMSIS-NN object for the quantized Relu activation */
typedef struct
{
int32_t min; /**< Min value used to clamp the result */
int32_t max; /**< Max value used to clamp the result */
} cmsis_nn_activation;
/** CMSIS-NN object for the convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_tile dilation;
cmsis_nn_activation activation;
} cmsis_nn_conv_params;
/** CMSIS-NN object for Depthwise convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
int32_t ch_mult; /**< Channel Multiplier. ch_mult * in_ch = out_ch */
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_tile dilation;
cmsis_nn_activation activation;
} cmsis_nn_dw_conv_params;
/** CMSIS-NN object for pooling layer parameters */
typedef struct
{
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_activation activation;
} cmsis_nn_pool_params;
/** CMSIS-NN object for Fully Connected layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t filter_offset; /**< Zero value for the filter tensor. Not used */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation activation;
} cmsis_nn_fc_params;
/** CMSIS-NN object for SVDF layer parameters */
typedef struct
{
int32_t rank;
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation input_activation;
cmsis_nn_activation output_activation;
} cmsis_nn_svdf_params;
/** CMSIS-NN object for Softmax s16 layer parameters */
typedef struct
{
const int16_t *exp_lut;
const int16_t *one_by_one_lut;
} cmsis_nn_softmax_lut_s16;
#endif // _ARM_NN_TYPES_H
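To show how these structs fit together, here is a hedged sketch of the parameters for a 3x3 s8 convolution; all values are illustrative, not taken from the library.

#include "arm_nn_types.h"

/* Sketch: dims and parameters for a 3x3 convolution, stride 1, padding 1,
   on a 16x16x8 input producing 16 output channels. Values are examples. */
static const cmsis_nn_dims input_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
static const cmsis_nn_dims filter_dims = {.n = 16, .h = 3, .w = 3, .c = 8};
static const cmsis_nn_conv_params conv_params = {
    .input_offset = 128,   /* zero-point adjustment for the input tensor */
    .output_offset = -128, /* zero-point adjustment for the output tensor */
    .stride = {.w = 1, .h = 1},
    .padding = {.w = 1, .h = 1},
    .dilation = {.w = 1, .h = 1},
    .activation = {.min = -128, .max = 127}, /* clamp to the full s8 range */
};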

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,20 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@@ -0,0 +1,96 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q15.c
* Description: Q15 neural network activation function using direct table look-up
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q15 neural network activation function using direct table look-up
*
* @note Refer to the header file for details.
*
*/
void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
uint16_t i = size;
q15_t *pIn = data;
q15_t *pOut = data;
uint16_t shift_size = 8 + 3 - int_width;
uint32_t bit_mask = 0x7FF >> int_width;
uint32_t full_frac = bit_mask + 1;
const q15_t *lookup_table;
switch (type)
{
case ARM_SIGMOID:
lookup_table = sigmoidTable_q15;
break;
case ARM_TANH:
default:
lookup_table = tanhTable_q15;
break;
}
while (i)
{
q15_t out;
q15_t in = *pIn++;
q15_t frac = (uint32_t)in & bit_mask;
q15_t value = lookup_table[(uint8_t)(in >> shift_size)];
if ((in >> shift_size) != 0x7f)
{
q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))];
/* doing the interpolation here for better accuracy */
out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size;
}
else
{
/* the largest positive value does not have a right side for linear interpolation */
out = value;
}
*pOut++ = out;
i--;
}
}
/**
* @} end of Acti group
*/
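A short usage sketch (input values illustrative): the function transforms the buffer in place, so input and output share storage.

#include "arm_nnfunctions.h"

/* Sketch: in-place q15 sigmoid over four samples, int_width = 0. */
static void sigmoid_q15_example(void)
{
    q15_t samples[4] = {0, 4096, -4096, 16384};
    arm_nn_activations_direct_q15(samples, 4, 0, ARM_SIGMOID);
}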

View File

@@ -0,0 +1,89 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q7.c
* Description: Q7 neural network activation function using direct table look-up
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q7 neural network activation function using direct table look-up
* @param[in,out] data pointer to input
* @param[in] size number of elements
* @param[in] int_width bit-width of the integer part, assumed to be at most 3
* @param[in] type type of activation functions
*
* @details
*
* This is the direct table look-up approach.
*
* The integer part of the fixed-point value is assumed to be at most 3 bits.
* A wider integer part would make little difference: after saturation, each of
* these activation functions produces the same output anyway.
*/
void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
uint16_t i = size;
q7_t *pIn = data;
q7_t *pOut = data;
q7_t in;
q7_t out;
uint16_t shift_size = 3 - int_width;
const q7_t *lookup_table;
switch (type)
{
case ARM_SIGMOID:
lookup_table = sigmoidTable_q7;
break;
case ARM_TANH:
default:
lookup_table = tanhTable_q7;
break;
}
while (i)
{
in = *pIn++;
out = lookup_table[(uint8_t)(in >> shift_size)];
*pOut++ = out;
i--;
}
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,65 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu6_s8.c
* Description: Basic s8 version of ReLU6
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/*
* Basic ReLU6 function
*
* Refer to header file for details.
*
*/
void arm_relu6_s8(q7_t *data, uint16_t size)
{
int32_t i;
for (i = 0; i < size; i++)
{
int32_t ip = data[i];
ip = MAX(ip, 0);
data[i] = MIN(ip, 6);
}
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,104 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q15.c
* Description: Q15 version of ReLU
*
* $Date: 20. July 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q15 ReLU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*
* @details
*
* Optimized ReLU with QSUB instructions.
*
*/
void arm_relu_q15(q15_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for M cores with DSP extension */
uint16_t i = size >> 1;
q15_t *input = data;
q15_t *output = data;
q31_t in;
q31_t buf;
q31_t mask;
while (i)
{
in = arm_nn_read_q15x2_ia((const q15_t **)&input);
/* extract the first bit */
buf = __ROR(in & 0x80008000, 15);
/* if MSB=1, mask will be 0xFF, 0x0 otherwise */
mask = __QSUB16(0x00000000, buf);
arm_nn_write_q15x2_ia(&output, in & (~mask));
i--;
}
if (size & 0x1)
{
if (*input < 0)
{
*input = 0;
}
input++;
}
#else
/* Run the following code as reference implementation for M cores without DSP extension */
uint16_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
#endif /* ARM_MATH_DSP */
}
/**
* @} end of Acti group
*/
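To see why the mask trick above yields a branch-free ReLU, here is a scalar, single-lane equivalent (a sketch, not library code): the sign bit is moved down to bit 0, and a subtraction from zero expands it into an all-ones or all-zeros mask.

/* Scalar equivalent of the SIMD mask trick, one q15 lane at a time. */
static q15_t relu_q15_scalar(q15_t x)
{
    uint16_t msb = (uint16_t)x >> 15;              /* 1 if x is negative, else 0 */
    uint16_t mask = (uint16_t)(0U - msb);          /* 0xFFFF if negative, else 0 */
    return (q15_t)((uint16_t)x & (uint16_t)~mask); /* negative values become 0  */
}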

View File

@@ -0,0 +1,109 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q7.c
* Description: Q7 version of ReLU
*
* $Date: 20. July 2021
* $Revision: V.1.1.3
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q7 ReLU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*
* @details
*
* Optimized ReLU with QSUB instructions.
*
*/
void arm_relu_q7(q7_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for M cores with DSP extension */
uint16_t i = size >> 2;
q7_t *input = data;
q7_t *output = data;
q31_t in;
q31_t buf;
q31_t mask;
while (i)
{
in = arm_nn_read_q7x4_ia((const q7_t **)&input);
/* extract the first bit */
buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7);
/* if MSB=1, mask will be 0xFF, 0x0 otherwise */
mask = __QSUB8(0x00000000, buf);
arm_nn_write_q7x4_ia(&output, in & (~mask));
i--;
}
i = size & 0x3;
while (i)
{
if (*input < 0)
{
*input = 0;
}
input++;
i--;
}
#else
/* Run the following code as reference implementation for cores without DSP extension */
uint16_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
#endif
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,20 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_*.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@@ -0,0 +1,105 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s16
* Description: Elementwise add
*
* $Date: 14 February 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
* s16 elementwise add
*
* Refer to the header file for details.
*
*/
/* Note: __SHIFT is expected to be <=0 */
arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
const int16_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int16_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size)
{
(void)input_1_offset;
(void)input_2_offset;
(void)out_offset;
int32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t sum;
loop_count = block_size;
while (loop_count > 0)
{
/* C = A + B */
input_1 = *input_1_vect++ << left_shift;
input_2 = *input_2_vect++ << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*output++ = (int16_t)sum;
/* Decrement loop counter */
loop_count--;
}
return (ARM_MATH_SUCCESS);
}
/**
* @} end of BasicMath group
*/
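A hedged usage sketch: the quantization parameters below are placeholders (a multiplier of 2^30 encodes 0.5 for arm_nn_requantize), and, as the (void) casts above show, the offset arguments are unused in this s16 variant.

#include "arm_nnfunctions.h"

/* Sketch: elementwise add of two 4-element s16 vectors with placeholder
   requantization parameters. */
static void add_s16_example(void)
{
    const int16_t in1[4] = {100, -200, 300, -400};
    const int16_t in2[4] = {50, 50, 50, 50};
    int16_t out[4];
    (void)arm_elementwise_add_s16(in1, in2,
                                  0, 1073741824, -1, /* input 1: offset, mult, shift */
                                  0, 1073741824, -1, /* input 2: offset, mult, shift */
                                  8,                 /* left_shift before requantizing */
                                  out,
                                  0, 1073741824, -8, /* output: offset, mult, shift */
                                  NN_Q15_MIN, NN_Q15_MAX,
                                  4);                /* block_size */
}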

View File

@@ -0,0 +1,234 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s8
* Description: Elementwise add
*
* $Date: 3 February 2022
* $Revision: V.2.6.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
* s8 elementwise add
*
* Refer to the header file for details.
*
*/
/* Note: __SHIFT is expected to be <=0 */
arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size)
{
#if defined(ARM_MATH_MVEI)
int32_t count = block_size;
while (count > 0)
{
int32x4_t vect_1;
int32x4_t vect_2;
mve_pred16_t p = vctp32q((uint32_t)count);
vect_1 = vldrbq_z_s32(input_1_vect, p);
vect_2 = vldrbq_z_s32(input_2_vect, p);
vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));
vect_1 = vshlq_r_s32(vect_1, left_shift);
vect_2 = vshlq_r_s32(vect_2, left_shift);
vect_1 = arm_requantize_mve(vect_1, input_1_mult, input_1_shift);
vect_2 = arm_requantize_mve(vect_2, input_2_mult, input_2_shift);
vect_1 = vaddq_s32(vect_1, vect_2);
vect_1 = arm_requantize_mve(vect_1, out_mult, out_shift);
vect_1 = vaddq_n_s32(vect_1, out_offset);
vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));
input_1_vect += 4;
input_2_vect += 4;
vstrbq_p_s32(output, vect_1, p);
output += 4;
count -= 4;
}
#else
int32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t sum;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0)
{
/* 4 outputs are calculated in one loop. The order of calculation follows the
output order of the sign-extension intrinsic */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Sum 1 */
input_1 = (b_1 & 0x0FFFF) << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = (b_2 & 0x0FFFF) << left_shift;
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r1 = (q7_t)sum;
/* Sum 3 */
input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift;
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r3 = (q7_t)sum;
/* Sum 2 */
input_1 = (a_1 & 0x0FFFF) << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = (a_2 & 0x0FFFF) << left_shift;
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r2 = (q7_t)sum;
/* Sum 4 */
input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift;
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r4 = (q7_t)sum;
arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4));
loop_count--;
}
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0)
{
/* C = A + B */
input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*output++ = (q7_t)sum;
/* Decrement loop counter */
loop_count--;
}
#endif /* ARM_MATH_MVEI */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of BasicMath group
*/
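The offset packing at the top of the DSP branch above deserves a note: duplicating the 16-bit offset into both halves of a 32-bit word lets a single __SADD16 apply it to two packed values at once. A small sketch of the arithmetic, assuming the offset fits in 16 bits:

#include <stdint.h>

/* Sketch: for offset = 3, (3 << 16U) | (3 & 0x0FFFFL) = 0x00030003, so one
   __SADD16 adds the offset to both 16-bit lanes in a single instruction. */
static int32_t pack_offset_x2(int32_t offset)
{
    return (offset << 16U) | (offset & 0x0FFFFL);
}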

View File

@@ -0,0 +1,95 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s16
* Description: Element wise multiplication
*
* $Date: 14 February 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
* @brief s16 element wise multiplication of two vectors
*
* @note Refer to the header file for details.
*
*/
arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
const int16_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int16_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size)
{
(void)input_1_offset;
(void)input_2_offset;
(void)out_offset;
int32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t mul_res;
loop_count = block_size;
while (loop_count > 0)
{
/* C = A * B */
input_1 = *input_1_vect++;
input_2 = *input_2_vect++;
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift);
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
*output++ = (int16_t)mul_res;
/* Decrement loop counter */
loop_count--;
}
return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/

View File

@@ -0,0 +1,200 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s8
* Description: Element wise multiplication
*
* $Date: 3 February 2022
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
* @brief s8 element wise multiplication of two vectors
*
* @note Refer to the header file for details.
*
*/
arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size)
{
int32_t loop_count;
#if defined(ARM_MATH_MVEI)
loop_count = (block_size + 3) / 4;
uint32_t num_elements = block_size;
for (int i = 0; i < loop_count; i++)
{
mve_pred16_t p = vctp32q(num_elements);
int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p);
input_1 = vaddq_n_s32(input_1, input_1_offset);
int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p);
input_2 = vaddq_n_s32(input_2, input_2_offset);
int32x4_t res_0 = vmulq_s32(input_1, input_2);
res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift));
res_0 += vdupq_n_s32(out_offset);
res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min));
res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max));
vstrbq_p_s32(output, res_0, p);
input_1_vect += 4;
input_2_vect += 4;
output += 4;
num_elements -= 4;
}
#else
int32_t input_1;
int32_t input_2;
int32_t mul_res;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0)
{
/* 4 outputs are calculated in one loop. The order of calculation follows the
output order of the sign-extension intrinsic */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Mul 1 */
input_1 = (int16_t)(b_1 & 0x0FFFFL);
input_2 = (int16_t)(b_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r1 = (q7_t)mul_res;
/* Mul 3 */
input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r3 = (q7_t)mul_res;
/* Mul 2 */
input_1 = (int16_t)(a_1 & 0x0FFFFL);
input_2 = (int16_t)(a_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r2 = (q7_t)mul_res;
/* Mul 4 */
input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r4 = (q7_t)mul_res;
arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4));
loop_count--;
}
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0)
{
/* C = A * B */
input_1 = *input_1_vect++ + input_1_offset;
input_2 = *input_2_vect++ + input_2_offset;
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
*output++ = (q7_t)mul_res;
/* Decrement loop counter */
loop_count--;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
SET(ROOT ${CMSIS_PATH})
# Select which parts of CMSIS-NN must be compiled.
# There are some dependencies between the parts, but they are not tracked
# by this CMake file, so enabling some functions may require enabling
# some others as well.
option(CONCATENATION "Concatenation" ON)
option(FULLYCONNECTED "Fully Connected" ON)
option(CONVOLUTION "Convolutions" ON)
option(ACTIVATION "Activations" ON)
option(POOLING "Pooling" ON)
option(SOFTMAX "Softmax" ON)
option(BASICMATHSNN "Basic Maths for NN" ON)
option(RESHAPE "Reshape" ON)
option(SVDF "SVDF" ON)
# When OFF, the default behavior applies: all tables are included.
option(NNSUPPORT "NN Support" ON)
###########################
#
# CMSIS NN
#
###########################
# NN Sources
SET(NN ${ROOT}/CMSIS/NN)
list(APPEND CMAKE_MODULE_PATH ${NN}/Source)
add_library(cmsis-nn STATIC)
target_compile_options(cmsis-nn PRIVATE -Ofast)
### Includes
target_include_directories(cmsis-nn PUBLIC "${NN}/Include")
target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/Core/Include")
target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/DSP/Include")
if (BASICMATHSNN)
add_subdirectory(BasicMathFunctions)
endif()
if (CONCATENATION)
add_subdirectory(ConcatenationFunctions)
endif()
if (FULLYCONNECTED)
add_subdirectory(FullyConnectedFunctions)
endif()
if (CONVOLUTION)
add_subdirectory(ConvolutionFunctions)
endif()
if (ACTIVATION)
add_subdirectory(ActivationFunctions)
endif()
if (POOLING)
add_subdirectory(PoolingFunctions)
endif()
if (SOFTMAX)
add_subdirectory(SoftmaxFunctions)
endif()
if (SVDF)
add_subdirectory(SVDFunctions)
endif()
if (RESHAPE)
add_subdirectory(ReshapeFunctions)
endif()
# Keep NNSUPPORT at the end
if (NNSUPPORT)
add_subdirectory(NNSupportFunctions)
endif()

View File

@@ -0,0 +1,20 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_*.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_w.c
* Description: s8 version of concatenation along the W axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the W axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_w(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint32_t offset_w)
{
const uint32_t input_copy_size = input_x * input_y * input_z * input_w;
output += offset_w * (input_x * input_y * input_z);
arm_memcpy_q7(output, input, input_copy_size);
}
/**
* @} end of Concatenation group
*/
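A usage sketch: concatenating two tensors along W is two calls with cumulative offsets, since offset_w is counted in W slices of input_x * input_y * input_z elements. All names below are illustrative.

/* Sketch: out has shape (x, y, z, w0 + w1); in0 and in1 share x, y and z. */
static void concat_w_example(const int8_t *in0, uint16_t w0,
                             const int8_t *in1, uint16_t w1,
                             uint16_t x, uint16_t y, uint16_t z,
                             int8_t *out)
{
    arm_concatenation_s8_w(in0, x, y, z, w0, out, 0);  /* first tensor at W offset 0   */
    arm_concatenation_s8_w(in1, x, y, z, w1, out, w0); /* second tensor after w0 slices */
}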

View File

@@ -0,0 +1,75 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_x.c
* Description: s8 version of concatenation along the X axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the X axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_x(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_x,
const uint32_t offset_x)
{
const uint32_t num_iterations = input_y * input_z * input_w;
output += offset_x;
uint32_t i;
// Copy per row
for (i = 0; i < num_iterations; ++i)
{
arm_memcpy_q7(output, input, input_x);
input += input_x;
output += output_x;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_y.c
* Description: s8 version of concatenation along the Y axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the Y axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_y(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_y,
const uint32_t offset_y)
{
const uint32_t num_iterations = input_z * input_w;
const uint32_t input_copy_size = input_x * input_y;
const uint32_t output_stride = input_x * output_y;
output += offset_y * input_x;
uint32_t i;
// Copy per tile
for (i = 0; i < num_iterations; ++i)
{
arm_memcpy_q7(output, input, input_copy_size);
input += input_copy_size;
output += output_stride;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -0,0 +1,75 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_z.c
* Description: s8 version of concatenation along the Z axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the Z axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_z(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_z,
const uint32_t offset_z)
{
const uint32_t input_copy_size = input_x * input_y * input_z;
const uint32_t output_stride = input_x * input_y * output_z;
output += offset_z * (input_x * input_y);
uint32_t i;
for (i = 0; i < input_w; ++i)
{
arm_memcpy_q7(output, input, input_copy_size);
input += input_copy_size;
output += output_stride;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -0,0 +1,24 @@
#
# Copyright (c) 2019-2022 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8*.c")
file(GLOB SRC_S16 "./*_s16*.c")
target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16})

View File

@@ -0,0 +1,205 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1_x_n_s8.c
* Description: s8 version of 1xN convolution using symmetric quantization.
*
* $Date: December 14, 2021
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* 1xN s8 convolution function.
*
* Refer to the header file for details.
*
*/
arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
arm_status status = ARM_MATH_SUCCESS;
if (output_dims->w % 4 != 0)
{
status = ARM_MATH_SIZE_MISMATCH;
goto out;
}
#if defined(ARM_MATH_MVEI)
(void)ctx;
const uint16_t input_x = input_dims->w;
const uint16_t kernel_x = filter_dims->w;
const uint16_t output_x = output_dims->w;
const uint16_t output_ch = output_dims->c;
const uint16_t input_ch = input_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t stride_x = conv_params->stride.w;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
{
int32_t input_begin_idx[4];
int32_t ker_begin_idx[4];
int32_t ker_end_idx[4];
for (int i = 0; i < 4; i++)
{
const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
input_begin_idx[i] = MAX(0, est_input_x_idx);
ker_begin_idx[i] = MAX(0, -est_input_x_idx);
ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
}
if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32x4_t s_offset;
int32_t acc[4];
{
int32_t sum_row[4];
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[0] * input_ch),
&sum_row[0],
&acc[0]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
input_data + input_begin_idx[1] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[1] * input_ch),
&sum_row[1],
&acc[1]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
input_data + input_begin_idx[2] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[2] * input_ch),
&sum_row[2],
&acc[2]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
input_data + input_begin_idx[3] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[3] * input_ch),
&sum_row[3],
&acc[3]);
s_offset = vldrwq_s32(sum_row);
}
int32x4_t res = vldrwq_s32(acc);
s_offset = vmulq_n_s32(s_offset, input_offset);
res = vaddq_s32(res, s_offset);
if (bias_data)
{
res = vaddq_n_s32(res, bias_data[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
res = vminq_s32(res, vdupq_n_s32(out_activation_max));
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
output_data++;
}
output_data += (3 * output_ch);
}
else
{
output_data = arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
stride_x * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
}
}
#else
status = arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
#endif
out:
/* Return to application */
return status;
}
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if !defined(ARM_MATH_MVEI)
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/
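A hedged sketch of how the companion _get_buffer_size call pairs with cmsis_nn_context: on non-MVE builds the fallback to arm_convolve_s8 needs the im2col scratch buffer, while MVE builds need none. The setup_ctx helper is hypothetical.

#include <stdlib.h>
#include "arm_nnfunctions.h"

/* Sketch: size the scratch buffer before invoking the 1xN kernel. */
static void setup_ctx(cmsis_nn_context *ctx,
                      const cmsis_nn_dims *input_dims,
                      const cmsis_nn_dims *filter_dims)
{
    ctx->size = arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims);
    ctx->buf = (ctx->size > 0) ? malloc(ctx->size) : NULL; /* 0 bytes on MVE builds */
}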

View File

@@ -0,0 +1,235 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c
* Description: Fast Q7 version of 1x1 convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 version of 1x1 convolution (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
* and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
* separable convolution.
*
* This function is the version with the full list of optimization tricks,
* but it comes with some constraints:
* ch_im_in is multiple of 4
* ch_im_out is multiple of 2
*
* [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
* https://arxiv.org/abs/1704.04861
*/
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
(void)dim_im_in_y;
int16_t i_out_y, i_out_x;
int16_t i_ch_out;
/* -----------------------
 * Here we use bufferA as q15_t internally because computations are done at the q15_t level;
 * im2col writes q15_t output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
padding_y != 0 || stride_x != 1 || stride_y != 1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in);
pBuffer += ch_im_in;
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is left-over to compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
padding_y != 0 || stride_x != 1 || stride_y != 1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
            wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
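/*
 * Usage sketch (illustrative only): bufferA must hold two im2col columns,
 * i.e. 2 * ch_im_in * dim_kernel_x * dim_kernel_y q15_t entries (the kernel
 * is 1x1 here). All sizes and shift values below are hypothetical and merely
 * satisfy the documented constraints (ch_im_in % 4 == 0, ch_im_out % 2 == 0).
 */
#define PW_CH_IN 8
#define PW_CH_OUT 4
#define PW_DIM 16
static q15_t pw_buffer_a[2 * PW_CH_IN * 1 * 1];

static arm_status run_pointwise_conv(const q7_t *in, const q7_t *wt, const q7_t *bias, q7_t *out)
{
    return arm_convolve_1x1_HWC_q7_fast_nonsquare(in, PW_DIM, PW_DIM, PW_CH_IN,
                                                  wt, PW_CH_OUT, 1, 1, /* 1x1 kernel */
                                                  0, 0, 1, 1,          /* no padding, unit stride */
                                                  bias, 0, 9,          /* hypothetical bias/out shifts */
                                                  out, PW_DIM, PW_DIM,
                                                  pw_buffer_a, NULL);
}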
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,161 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_s8_fast.c
 * Description: Fast s8 version of 1x1 convolution (non-square shape)
*
* $Date: 12. November 2021
* $Revision: V.2.0.4
*
* Target Processor: Cortex-M Processors
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#define DIM_KER_X (1U)
#define DIM_KER_Y (1U)
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Fast s8 version for 1x1 convolution (non-square shape)
*
 * Refer to the header file for details.
*
*/
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
if (input_dims->c % 4 != 0 || conv_params->padding.w != 0 || conv_params->padding.h != 0 ||
conv_params->stride.w != 1 || conv_params->stride.h != 1)
{
return ARM_MATH_SIZE_MISMATCH;
}
(void)ctx;
(void)filter_dims;
(void)bias_dims;
#if defined(ARM_MATH_MVEI)
const int32_t col_len = input_dims->w * input_dims->h * input_dims->n;
const int32_t output_ch = output_dims->c;
const int32_t input_ch = input_dims->c;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_items = 0; i_items <= (col_len - 4); i_items += 4)
{
output_data = arm_nn_mat_mul_core_4x_s8(input_ch,
input_ch,
input_data + i_items * input_ch,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
}
/* Handle left over elements */
for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t sum_row = 0;
int32_t acc;
(void)arm_nn_mat_mul_core_1x_s8(
input_ch, input_data + i_items * input_ch, filter_data + i_out_ch * input_ch, &sum_row, &acc);
if (bias_data)
{
acc += bias_data[i_out_ch];
}
sum_row = (sum_row * input_offset);
acc += sum_row;
acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]);
acc += out_offset;
acc = MAX(acc, out_activation_min);
acc = MIN(acc, out_activation_max);
*output_data++ = acc;
}
}
#else
/* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */
const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n;
const int32_t rhs_rows = output_dims->c;
const int32_t rhs_cols = input_dims->c;
arm_nn_mat_mult_nt_t_s8(input_data,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max);
#endif
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims)
{
(void)input_dims;
return 0;
}
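/*
 * Requantization sketch (illustrative): once the bias and input-offset terms
 * have been folded into the accumulator, the left-over loop above performs
 * the steps below per output channel. The real arm_nn_requantize uses a
 * rounding doubling-high multiply; this reference ignores that rounding and
 * only shows the order of operations (scale, output offset, clamp).
 */
static int8_t requantize_ref(int32_t acc, int32_t multiplier, int32_t shift,
                             int32_t out_offset, int32_t act_min, int32_t act_max)
{
    int64_t v = ((int64_t)acc * multiplier) >> 31;   /* fixed-point scale by multiplier / 2^31 */
    v = (shift >= 0) ? (v << shift) : (v >> -shift); /* power-of-two scale */
    v += out_offset;
    if (v < act_min) { v = act_min; }
    if (v > act_max) { v = act_max; }
    return (int8_t)v;
}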
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,209 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_basic.c
* Description: Q15 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q15 convolution function
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* This basic version is designed to work for any input tensor and weight
* dimension.
*/
arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
uint16_t im2col_out_pixel_index = 0;
q15_t *pBuffer = bufferA;
q15_t *pOut = Im_out;
q15_t *im_buffer = bufferA;
const q15_t *pA;
int i;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer,
(q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
sizeof(q15_t) * ch_im_in);
}
pBuffer += ch_im_in;
}
}
pA = wt;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = im_buffer;
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q15_t)__SSAT((sum >> out_shift), 16);
pOut++;
}
/* counter reset */
pBuffer = im_buffer;
im2col_out_pixel_index++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
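/*
 * Allocation sketch (illustrative): the basic q15 kernel needs a single
 * im2col column, so bufferA holds ch_im_in * dim_kernel * dim_kernel q15_t
 * entries and bufferB is unused; the fast variant doubles this to pair two
 * columns per matrix-multiply pass. The sizes below are hypothetical.
 */
#define Q15_BASIC_CH_IN 4
#define Q15_BASIC_KERNEL 3
static q15_t q15_basic_buffer_a[Q15_BASIC_CH_IN * Q15_BASIC_KERNEL * Q15_BASIC_KERNEL];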
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,259 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_fast.c
* Description: Fast Q15 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimention
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
 * ch_im_in is a multiple of 2
 *
 * ch_im_out is a multiple of 2
*
* dim_im_out is a multiple of 2
*
*/
arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
q15_t *pBuffer = bufferA;
q15_t *im_buffer = bufferA;
q15_t *pOut = Im_out;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || dim_im_out & 0x1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/* Run the following code for Cortex-M4 and Cortex-M7 */
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer,
(q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
sizeof(q15_t) * ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (i_out_x & 0x1)
{
int i;
/* initialize the matrix pointers for A */
const q15_t *pA = wt;
/* set up the second output pointers */
q15_t *pOut2 = pOut + ch_im_out;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = im_buffer;
const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel;
/* align the second pointer for A */
const q15_t *pA2 = pA + ch_im_in * dim_kernel * dim_kernel;
/* init the sum with bias */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
*pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
/* skip the row computed with A2 */
pA += ch_im_in * dim_kernel * dim_kernel;
} /* for over ch_im_out */
pOut += ch_im_out;
/* counter reset */
pBuffer = im_buffer;
}
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
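/*
 * Tiling sketch (illustrative): the DSP path above computes a 2x2 output
 * tile per pass -- two output channels (pA/pA2) against two im2col columns
 * (pB/pB2) -- which is why ch_im_out and dim_im_out must be even. The scalar
 * reference below shows the same accumulation pattern without the SIMD
 * reads; col_len stands for ch_im_in * dim_kernel * dim_kernel.
 */
static void mat_mult_2x2_ref(const q15_t *pA, const q15_t *pA2,
                             const q15_t *pB, const q15_t *pB2,
                             int32_t col_len, q31_t sums[4])
{
    for (int32_t c = 0; c < col_len; c++)
    {
        sums[0] += pA[c] * pB[c];   /* channel i,     column 0 */
        sums[1] += pA[c] * pB2[c];  /* channel i,     column 1 */
        sums[2] += pA2[c] * pB[c];  /* channel i + 1, column 0 */
        sums[3] += pA2[c] * pB2[c]; /* channel i + 1, column 1 */
    }
}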
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,270 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title: arm_convolve_HWC_q15_fast_nonsquare.c
 * Description: Fast Q15 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q15 convolution function (non-square shape)
 * @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in_x input tensor dimension x
 * @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
 * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
 * ch_im_in is a multiple of 2
 *
 * ch_im_out is a multiple of 2
*
*/
arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
q15_t *pBuffer = bufferA;
q15_t *im_buffer = bufferA;
q15_t *pOut = Im_out;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/* Run the following code for Cortex-M4 and Cortex-M7 */
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer,
(q15_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
sizeof(q15_t) * ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (i_out_x & 0x1)
{
int i;
/* initialize the matrix pointers for A */
const q15_t *pA = wt;
/* set up the second output pointers */
q15_t *pOut2 = pOut + ch_im_out;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = im_buffer;
const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x;
/* align the second pointer for A */
const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x;
/* init the sum with bias */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
*pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
/* skip the row computed with A2 */
pA += ch_im_in * dim_kernel_y * dim_kernel_x;
} /* for over ch_im_out */
pOut += ch_im_out;
/* counter reset */
pBuffer = im_buffer;
}
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
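/*
 * Sizing sketch (illustrative, not a library function): the non-square
 * variants expect the caller to pass matching output dimensions, which
 * follow the usual convolution arithmetic per axis:
 */
static uint16_t conv_out_dim_ref(uint16_t dim_im_in, uint16_t padding,
                                 uint16_t dim_kernel, uint16_t stride)
{
    /* e.g. dim_im_out_x = conv_out_dim_ref(dim_im_in_x, padding_x, dim_kernel_x, stride_x) */
    return (uint16_t)((dim_im_in + 2 * padding - dim_kernel) / stride + 1);
}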
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,280 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_RGB.c
* Description: Q7 version of convolution for RGB image
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 convolution function for RGB image
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in equals 3
*
 * This kernel is written exclusively for convolutions with ch_im_in
 * equal to 3. This applies to the first layer of CNNs, whose input
 * image is in RGB format.
*/
arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
 * Here we use bufferA as q15_t internally because computations are done at the q15_t level;
 * im2col writes q15_t output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
// check if number of input channels is 3
if (ch_im_in != 3)
{
return ARM_MATH_SIZE_MISMATCH;
}
// This part implements the im2col function
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t));
pBuffer += 3;
}
else
{
/*
* Equivalent to:
* arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3);
*/
const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
q31_t buf = arm_nn_read_q7x4(pPixel);
union arm_nnword top;
union arm_nnword bottom;
top.word = __SXTB16(buf);
bottom.word = __SXTB16(__ROR(buf, 8));
#ifndef ARM_MATH_BIG_ENDIAN
/*
* little-endian, | omit | 3rd | 2nd | 1st |
* MSB LSB
* top | 3rd | 1st |; bottom | omit | 2nd |
*
* version 1, need to swap 2nd and 3rd weight
* *__SIMD32(pBuffer) = top.word;
* *(pBuffer+2) = bottom.half_words[0];
*
* version 2, no weight shuffling required
*/
*pBuffer++ = top.half_words[0];
int32_t packed_word = __PKHBT(bottom.word, top.word, 0);
arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
#else
/*
* big-endian, | 1st | 2nd | 3rd | omit |
* MSB LSB
* top | 2nd | omit |; bottom | 1st | 3rd |
*
* version 1, need to swap 2nd and 3rd weight
* *__SIMD32(pBuffer) = bottom.word;
* *(pBuffer+2) = top.half_words[1];
*
* version 2, no weight shuffling required
*/
*pBuffer++ = bottom.half_words[0];
int32_t packed_word = __PKHTB(top.word, bottom.word, 0);
arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
#endif
pBuffer += 2;
}
}
}
if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15(
wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = 3 * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
// check if number of input channels is 3
if (ch_im_in != 3)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
/* if-for implementation */
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return (ARM_MATH_SUCCESS);
}
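/*
 * Unpacking sketch (illustrative): the SIMD path above widens three q7 RGB
 * samples to q15 with __SXTB16/__ROR plus __PKHBT/__PKHTB so that no weight
 * shuffling is needed. Functionally it is just a sign-extending copy, as the
 * portable reference below shows:
 */
static void rgb_pixel_to_q15_ref(const q7_t *pixel, q15_t *out)
{
    out[0] = (q15_t)pixel[0]; /* R */
    out[1] = (q15_t)pixel[1]; /* G */
    out[2] = (q15_t)pixel[2]; /* B */
}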
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,227 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic.c
* Description: Q7 version of convolution
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimention
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* This basic version is designed to work for any input tensor and weight
* dimension.
*/
arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
 * Here we use bufferA as q15_t internally because computations are done at the q15_t level;
 * im2col writes q15_t output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* Computation is performed for every 2 columns */
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
/* Load the accumulator with bias first */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
/* Point to the beginning of the im2col buffer */
const q15_t *pB = bufferA;
/* Each pass processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)bufferA;
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
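/*
 * im2col sketch (illustrative): each output pixel gathers a
 * dim_kernel x dim_kernel x ch_im_in patch into one contiguous column,
 * zero-filling out-of-bound taps. This scalar q7 version mirrors the
 * memset/copy logic in the DSP path above, minus the q7 -> q15 widening.
 */
static void im2col_patch_ref(const q7_t *im, int16_t dim_im_in, int16_t ch_im_in,
                             int16_t base_y, int16_t base_x, int16_t dim_kernel,
                             q7_t *col)
{
    for (int16_t ky = 0; ky < dim_kernel; ky++)
    {
        for (int16_t kx = 0; kx < dim_kernel; kx++)
        {
            const int16_t y = (int16_t)(base_y + ky); /* base_y = i_out_y * stride - padding */
            const int16_t x = (int16_t)(base_x + kx);
            if (y < 0 || y >= dim_im_in || x < 0 || x >= dim_im_in)
            {
                memset(col, 0, (size_t)ch_im_in); /* out-of-bound tap: zero padding */
            }
            else
            {
                memcpy(col, im + (y * dim_im_in + x) * ch_im_in, (size_t)ch_im_in);
            }
            col += ch_im_in;
        }
    }
}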
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,229 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title: arm_convolve_HWC_q7_basic_nonsquare.c
 * Description: Q7 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Basic Q7 convolution function (non-square shape)
 * @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in_x input tensor dimension x
 * @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*/
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
 * Here we use bufferA as q15_t internally because computations are done at the q15_t level;
 * im2col writes q15_t output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* Computation is performed for every 2 columns */
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x)
{
pOut = arm_nn_mat_mult_kernel_q7_q15(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
/* Load the accumulator with bias first */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
/* Point to the beginning of the im2col buffer */
const q15_t *pB = bufferA;
/* Each pass processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)bufferA;
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,380 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast.c
* Description: Fast Q7 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 convolution function
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
 * ch_im_in is a multiple of 4 (because of the SIMD32 read and swap)
 *
 * ch_im_out is a multiple of 2 (because of the 2x2 mat_mult kernel)
 *
 * The im2col converts the Q7 tensor input into Q15 columns, which are stored in
 * bufferA. There is reordering happening during this im2col process with
 * arm_q7_to_q15_reordered_no_shift. For every four elements, the second and
 * third elements are swapped.
 *
 * The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the
 * GEMM computation with the reordered columns.
 *
 * To speed up the determination of the padding condition, we split the
 * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
 * This reduces the total number of boundary condition checks and improves
 * the data copying performance.
*/
arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
 * Here we use bufferA as q15_t internally because computations are done at the q15_t level;
 * im2col writes q15_t output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
* Top: i_out_y from 0 to padding - 1
* Middle: i_out_y from padding to dim_im_out-padding-1
* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* middle part, here we also divide the x into left, mid and right */
for (; i_out_y < dim_im_out - padding; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out - padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in +
(i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in,
pBuffer,
ch_im_in * dim_kernel);
pBuffer += ch_im_in * dim_kernel;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
for (; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is left-over to compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each time it process 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
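/*
 * Reordering sketch (illustrative): on little-endian targets,
 * arm_q7_to_q15_reordered_no_shift widens q7 to q15 while storing every
 * group of four with its middle pair swapped, so that the paired SIMD reads
 * in arm_nn_mat_mult_kernel_q7_q15_reordered line up with identically
 * reordered weights. A scalar equivalent (len assumed to be a multiple of 4):
 */
static void q7_to_q15_reordered_ref(const q7_t *src, q15_t *dst, uint16_t len)
{
    for (uint16_t i = 0; i < len; i += 4)
    {
        dst[i + 0] = (q15_t)src[i + 0];
        dst[i + 1] = (q15_t)src[i + 2]; /* second and third entries swapped */
        dst[i + 2] = (q15_t)src[i + 1];
        dst[i + 3] = (q15_t)src[i + 3];
    }
}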
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,378 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast_nonsquare.c
 * Description:  Fast Q7 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in_x input tensor dimension x
 * @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is the version with full list of optimization tricks, but with
* some constraints:
* ch_im_in is multiple of 4
* ch_im_out is multiple of 2
*/
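/*
 * Illustrative usage sketch (added for clarity; the tensor names and shapes
 * below are assumptions, not from the original source). The scratch buffer
 * bufferA must hold at least 2 * ch_im_in * dim_kernel_x * dim_kernel_y
 * q15_t entries, matching the double-column im2col scheme used below:
 *
 *   static q15_t buffer_a[2 * 4 * 3 * 5];
 *   arm_status status = arm_convolve_HWC_q7_fast_nonsquare(
 *       input, 16, 8, 4,            // 16x8 input, 4 channels (multiple of 4)
 *       weights, 8, 3, 5,           // 8 filters (multiple of 2), 3x5 kernel
 *       1, 2, 1, 1,                 // padding 1x2, stride 1x1
 *       bias, 0, 7,                 // bias shift 0, output right-shift 7
 *       output, 16, 8,              // 16x8 output
 *       buffer_a, NULL);            // bufferB is unused
 */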
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/* -----------------------
     * Here we use bufferA as q15_t internally, as computations are done at the q15_t level
     * im2col is used to expand the q7_t input into q15_t columns
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
        /* check if the input dimensions meet the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
     * Top: i_out_y from 0 to padding_y - 1
     * Middle: i_out_y from padding_y to dim_im_out_y - padding_y - 1
     * Bottom: i_out_y from dim_im_out_y - padding_y to dim_im_out_y - 1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
    /* middle part; here we also divide the x dimension into left, mid and right */
for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
pBuffer,
ch_im_in * dim_kernel_x);
pBuffer += ch_im_in * dim_kernel_x;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
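    /* bottom part */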
for (; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
    /* check if there are left-over columns to compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
            /* each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
        /* check if the input dimensions meet the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
/* if-for implementation */
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,241 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_fast_s16.c
* Description: Optimized s16 version of convolution.
*
* $Date: 12 August 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s16 convolution function.
*
 * Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output
 * channels are multiples of 4 or, at a minimum, greater than 4.
*
*/
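/*
 * Illustrative usage sketch (added for clarity; variable names are assumed).
 * The scratch buffer is sized with arm_convolve_fast_s16_get_buffer_size()
 * and passed in through the context:
 *
 *   cmsis_nn_context ctx;
 *   ctx.size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
 *   ctx.buf = ctx.size > 0 ? malloc(ctx.size) : NULL;
 *   arm_status status = arm_convolve_fast_s16(&ctx, &conv_params, &quant_params,
 *                                             &input_dims, input_data,
 *                                             &filter_dims, filter_data,
 *                                             &bias_dims, bias_data,
 *                                             &output_dims, output_data);
 *   free(ctx.buf);
 */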
arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
(void)bias_dims;
if (filter_dims->w * filter_dims->h * input_dims->c >= 512)
{
return ARM_MATH_SIZE_MISMATCH;
}
if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q15_t *buffer_a = (q15_t *)ctx->buf;
const int32_t input_batches = input_dims->n;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = conv_params->padding.w;
const int32_t pad_y = conv_params->padding.h;
const int32_t stride_x = conv_params->stride.w;
const int32_t stride_y = conv_params->stride.h;
const int16_t out_activation_min = conv_params->activation.min;
const int16_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
        /* Generate two columns from the input tensor for a GEMM computation */
q15_t *two_column_buf = buffer_a;
q15_t *out = output_data;
/* This part implements the im2col function */
for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
{
for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y;
i_ker_y++)
{
for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
/* Filling 0 for out-of-bound paddings */
arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch);
}
else
{
arm_memcpy_q7((q7_t *)two_column_buf,
(const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch),
input_ch * sizeof(q15_t));
}
two_column_buf += input_ch;
}
}
                /* Computation is performed once every 2 columns */
if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
{
out = arm_nn_mat_mult_kernel_s16(filter_data,
buffer_a,
output_ch,
output_shift,
output_mult,
out_activation_min,
out_activation_max,
(input_ch * kernel_y * kernel_x),
bias_data,
out);
/* Counter reset */
two_column_buf = buffer_a;
}
}
}
        /* Handle left-over columns when the number of output pixels is odd */
if (two_column_buf != buffer_a)
{
const q7_t *ker_a = filter_data;
int i;
for (i = 0; i < output_ch; i++)
{
                /* Init the accumulator */
q31_t sum = 0;
/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
const q15_t *ip_as_col = buffer_a;
/* 4 multiply and accumulates are done in one loop. */
uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
while (col_count)
{
q31_t ker_a1, ker_a2;
q31_t ip_b1, ip_b2;
ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a1, ip_b1, sum);
ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a2, ip_b2, sum);
col_count--;
}
/* Handle left over mac */
col_count = input_ch * kernel_y * kernel_x & 0x3;
while (col_count)
{
q7_t ker_a1 = *ker_a++;
q15_t ip_b1 = *ip_as_col++;
sum += ker_a1 * ip_b1;
col_count--;
}
if (bias_data)
{
q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]);
q63_t acc_64 = sum + bias_data[i];
sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]);
}
else
{
sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
}
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*out++ = (q15_t)sum;
}
}
#else
(void)input_data;
(void)output_data;
(void)bias_data;
(void)filter_data;
(void)buffer_a;
(void)kernel_x;
(void)kernel_y;
(void)pad_x;
(void)pad_y;
(void)stride_x;
(void)stride_y;
(void)out_activation_min;
(void)out_activation_max;
(void)output_mult;
(void)output_shift;
return ARM_MATH_ARGUMENT_ERROR;
#endif
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,156 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s16.c
* Description: s16 version of convolution using symmetric quantization.
*
* $Date: January 13, 2022
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s16 convolution function.
*
 * Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output
 * channels are multiples of 4 or, at a minimum, greater than 4.
*
*/
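/*
 * Note on the kernel-range clipping below (explanatory addition): for
 * positive b, (a + b - 1) / b is the integer ceiling of a / b. For example,
 * with base_idx_x = -3 and dilation_x = 2 the taps fall at -3, -1, 1, ..., so
 * ker_x_start = MAX(0, (3 + 2 - 1) / 2) = 2 selects the first tap that lands
 * inside the input (-3 + 2 * 2 = 1).
 */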
arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
(void)bias_dims;
(void)ctx;
const int32_t input_batches = input_dims->n;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = conv_params->padding.w;
const int32_t pad_y = conv_params->padding.h;
const int32_t stride_x = conv_params->stride.w;
const int32_t stride_y = conv_params->stride.h;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (int32_t i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]);
for (int32_t base_idx_y = -pad_y, i_out_y = 0; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
{
for (int32_t base_idx_x = -pad_x, i_out_x = 0; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
{
int64_t conv_out_acc = 0;
const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
const int32_t ker_y_start = MAX(0, start_y_max);
const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
const int32_t ker_x_start = MAX(0, start_x_max);
const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
const int32_t ker_y_end = MIN(kernel_y, end_min_y);
const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
const int32_t ker_x_end = MIN(kernel_x, end_min_x);
for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t in_row = base_idx_y + dilation_y * i_ker_y;
const int32_t in_col = base_idx_x + dilation_x * i_ker_x;
for (int32_t i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
conv_out_acc += input_data[(in_row * input_x + in_col) * input_ch + i_input_ch] *
filter_data[i_out_ch * input_ch * kernel_y * kernel_x +
(i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch];
}
}
}
if (bias_data)
{
conv_out_acc += bias_data[i_out_ch];
}
int32_t conv_out = arm_nn_requantize_s64(conv_out_acc, reduced_multiplier, output_shift[i_out_ch]);
conv_out = MAX(conv_out, out_activation_min);
conv_out = MIN(conv_out, out_activation_max);
output_data[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int16_t)conv_out;
}
}
}
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
(void)input_dims;
(void)filter_dims;
return 0;
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,335 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s8.c
* Description: s8 version of convolution using symmetric quantization.
*
* $Date: December 14, 2021
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s8 convolution function.
*
 * Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output
 * channels are multiples of 4 or, at a minimum, greater than 4.
*
*/
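/*
 * Illustrative usage sketch (added for clarity; variable names are assumed).
 * A context buffer sized by arm_convolve_s8_get_buffer_size() is mandatory
 * whenever that size is non-zero, as checked at the top of the function:
 *
 *   cmsis_nn_context ctx;
 *   ctx.size = arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims);
 *   ctx.buf = ctx.size > 0 ? malloc(ctx.size) : NULL;
 *   arm_status status = arm_convolve_s8(&ctx, &conv_params, &quant_params,
 *                                       &input_dims, input_data,
 *                                       &filter_dims, filter_data,
 *                                       &bias_dims, bias_data,
 *                                       &output_dims, output_data);
 *   free(ctx.buf);
 */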
arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q15_t *buffer_a = (q15_t *)ctx->buf;
const int32_t input_batches = input_dims->n;
const uint16_t input_x = input_dims->w;
const uint16_t input_y = input_dims->h;
const uint16_t input_ch = input_dims->c;
const uint16_t kernel_x = filter_dims->w;
const uint16_t kernel_y = filter_dims->h;
const uint16_t output_x = output_dims->w;
const uint16_t output_y = output_dims->h;
const uint16_t output_ch = output_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t pad_y = conv_params->padding.h;
const uint16_t stride_x = conv_params->stride.w;
const uint16_t stride_y = conv_params->stride.h;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
int i_batch;
for (i_batch = 0; i_batch < input_batches; i_batch++)
{
#if defined(ARM_MATH_MVEI)
        /* Generate up to four columns from the input tensor for a GEMM computation */
q7_t *im2col_buf = (q7_t *)buffer_a;
q7_t *out = output_data;
int32_t buffer_fill_cnt = 0;
int32_t padded = 0;
const int32_t num_elem = kernel_x * kernel_y * input_ch;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
/* This part implements the im2col function */
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
}
im2col_buf += input_ch;
}
}
buffer_fill_cnt++;
                /* Computation is performed once every 4 columns */
if (buffer_fill_cnt == 4 && (padded == 0))
{
buffer_fill_cnt = 0;
out = arm_nn_mat_mul_core_4x_s8(num_elem,
num_elem,
(q7_t *)buffer_a,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
}
else if (buffer_fill_cnt == 4 && (padded != 0))
{
buffer_fill_cnt = 0;
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
4,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
padded = 0;
}
}
}
/* Handle left over columns */
if (buffer_fill_cnt != 0)
{
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
buffer_fill_cnt,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
}
#else // #if defined(ARM_MATH_MVEI)
const uint16_t dilation_x = conv_params->dilation.w;
const uint16_t dilation_y = conv_params->dilation.h;
int32_t i_out_y, i_out_x, i_ker_y, i_ker_x;
        /* Generate two columns from the input tensor for a GEMM computation */
q15_t *two_column_buf = buffer_a;
q7_t *out = output_data;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
for (i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
/* Filling 0 for out-of-bound paddings */
memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_with_offset(
input_data + (k_y * input_x + k_x) * input_ch, two_column_buf, input_ch, input_offset);
}
two_column_buf += input_ch;
}
}
                /* Computation is performed once every 2 columns */
if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
{
out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
buffer_a,
output_ch,
output_shift,
output_mult,
out_offset,
out_activation_min,
out_activation_max,
input_ch * kernel_y * kernel_x,
bias_data,
out);
/* counter reset */
two_column_buf = buffer_a;
}
}
}
        /* Handle left-over columns when the number of output pixels is odd */
if (two_column_buf != buffer_a)
{
const q7_t *ker_a = filter_data;
int i;
for (i = 0; i < output_ch; i++)
{
/* Load the accumulator with bias first */
q31_t sum = 0;
if (bias_data)
{
sum = bias_data[i];
}
/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
const q15_t *ip_as_col = buffer_a;
/* 4 multiply and accumulates are done in one loop. */
#if defined(ARM_MATH_DSP)
uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
while (col_count)
{
q31_t ker_a1, ker_a2;
q31_t ip_b1, ip_b2;
ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a1, ip_b1, sum);
ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a2, ip_b2, sum);
col_count--;
}
/* Handle left over mac */
col_count = input_ch * kernel_y * kernel_x & 0x3;
#else
uint16_t col_count = input_ch * kernel_y * kernel_x;
#endif
while (col_count)
{
q7_t ker_a1 = *ker_a++;
q15_t ip_b1 = *ip_as_col++;
sum += ker_a1 * ip_b1;
col_count--;
}
sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*out++ = (q7_t)sum;
}
}
#endif // #if defined(ARM_MATH_MVEI)
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h;
    // Get the number of complete int16 lanes (multiple of 8) for the given col_length. This depends on
    // the implementation of arm_nn_mat_mult_s8
col_length = (col_length + 7) / 8;
// 4 -> number of im2col buffers, 8 -> 8 elements per Q register
return 4 * col_length * 8 * (int32_t)sizeof(int8_t);
#else
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
#endif
}
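/*
 * Worked example (explanatory addition): for 3 input channels and a 3x3
 * kernel, col_length = 3 * 3 * 3 = 27 and (27 + 7) / 8 = 4 complete 8-lane
 * groups, so the MVE build reserves 4 * 4 * 8 = 128 bytes for the four
 * im2col columns; the DSP/scalar build would instead reserve
 * 2 * 27 * sizeof(int16_t) = 108 bytes.
 */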
/**
* @} end of NNConv group
*/


@@ -0,0 +1,130 @@
/*
* Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s16.c
 * Title:        arm_convolve_wrapper_s16.c
 * Description:  s16 convolution layer wrapper function whose main purpose is to call the optimal kernel available
 *               in cmsis-nn to perform the convolution.
*
* $Date: 13 January 2022
* $Revision: V.1.2.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer header file for details.
*
*/
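/*
 * Dispatch example (explanatory addition): a 3x3 kernel over 32 input
 * channels gives 3 * 3 * 32 = 288 < 512 effective taps, so a DSP build with
 * unit dilation takes the arm_convolve_fast_s16() path; a 5x5 kernel over
 * 32 channels (800 taps) falls back to arm_convolve_s16().
 */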
arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
if (filter_dims->w * filter_dims->h * input_dims->c < 512 &&
(conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_fast_s16(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else
{
return arm_convolve_s16(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
#else
return arm_convolve_s16(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
#endif
}
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
(void)conv_params;
(void)output_dims;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
if (filter_dims->w * filter_dims->h * input_dims->c < 512 &&
(conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
}
return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
#else
return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
#endif
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,133 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s8.c
 * Title:        arm_convolve_wrapper_s8.c
 * Description:  s8 convolution layer wrapper function whose main purpose is to call the optimal kernel available
 *               in cmsis-nn to perform the convolution.
*
* $Date: 02. December 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer header file for details.
*
*/
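/*
 * Illustrative usage sketch (added for clarity; variable names are assumed).
 * Pairing the wrapper with its buffer-size counterpart keeps the scratch
 * allocation consistent with whichever kernel the wrapper selects:
 *
 *   const int32_t buf_size = arm_convolve_wrapper_s8_get_buffer_size(
 *       &conv_params, &input_dims, &filter_dims, &output_dims);
 *   cmsis_nn_context ctx;
 *   ctx.size = buf_size;
 *   ctx.buf = buf_size > 0 ? malloc(buf_size) : NULL;
 *   arm_status status = arm_convolve_wrapper_s8(&ctx, &conv_params,
 *       &quant_params, &input_dims, input_data, &filter_dims, filter_data,
 *       &bias_dims, bias_data, &output_dims, output_data);
 *   free(ctx.buf);
 */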
arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) &&
(conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) &&
(filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1x1_s8_fast(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
(input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1_x_n_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else
{
return arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
}
int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) &&
(conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) &&
(filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims);
}
else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
(input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims);
}
else
{
return arm_convolve_s8_get_buffer_size(input_dims, filter_dims);
}
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,212 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_3x3_s8.c
* Description: Optimized s8 depthwise convolution function for channel
* multiplier of 1 and 3x3 kernel size.
*
* $Date: 09. October 2020
* $Revision: V.2.0.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with constraint that
* in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1
*
* Refer prototype header file for details.
*
*/
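/*
 * Note on the packed reads below (explanatory addition): arm_nn_read_q7x4()
 * loads four consecutive q7 channel values into one 32-bit word, and the
 * (int8_t)(val >> n) casts peel the bytes back out. On a little-endian
 * Cortex-M core, a word holding 0x04030201 yields channel values 1, 2, 3, 4
 * for the four accumulators before input_offset is applied.
 */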
arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
(void)ctx;
(void)bias_dims;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
/* Check input constraints input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
/* Check input constraints pad_x <= 1 */
if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3)
{
return ARM_MATH_ARGUMENT_ERROR;
}
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
int32_t in_ch = 0;
int32_t ker_w_start = MAX(0, -in_w);
for (; in_ch <= (input_ch - 4); in_ch += 4)
{
int32_t out_buff0 = bias[in_ch + 0];
int32_t out_buff1 = bias[in_ch + 1];
int32_t out_buff2 = bias[in_ch + 2];
int32_t out_buff3 = bias[in_ch + 3];
const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
{
int32_t in_val = 0;
int32_t ker_val = 0;
if (ker_w_start == 0)
{
in_val = arm_nn_read_q7x4(input_ptr);
ker_val = arm_nn_read_q7x4(kernel_ptr);
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
}
in_val = arm_nn_read_q7x4(input_ptr + input_ch);
ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch);
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
if ((input_x - in_w) >= 3)
{
in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1));
ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1));
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
}
input_ptr += (input_ch * input_x);
kernel_ptr += (input_ch * 3);
}
out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);
out_buff0 += output_offset;
out_buff1 += output_offset;
out_buff2 += output_offset;
out_buff3 += output_offset;
out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff0;
output[out_idx++] = (int8_t)out_buff1;
output[out_idx++] = (int8_t)out_buff2;
output[out_idx++] = (int8_t)out_buff3;
}
// Leftover
for (; in_ch < input_ch; ++in_ch)
{
int32_t out_buff = bias[in_ch];
const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
{
if (ker_w_start == 0)
{
out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
}
out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);
if ((input_x - in_w) >= 3)
{
out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
}
input_ptr += (input_ch * input_x);
kernel_ptr += (input_ch * 3);
}
out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
out_buff += output_offset;
out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff;
}
}
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,292 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s16.c
* Description: s16 version of depthwise convolution.
*
* $Date: 26. Jan 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const int8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int64_t *bias,
int16_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff32[4] = {REDUCE_MULTIPLIER(output_mult[out_ch + 0 + mult_tile]),
REDUCE_MULTIPLIER(output_mult[out_ch + 1 + mult_tile]),
REDUCE_MULTIPLIER(output_mult[out_ch + 2 + mult_tile]),
REDUCE_MULTIPLIER(output_mult[out_ch + 3 + mult_tile])};
int64_t out_buff[4] = {0, 0, 0, 0};
if (bias)
{
out_buff[0] = bias[out_ch + 0 + mult_tile];
out_buff[1] = bias[out_ch + 1 + mult_tile];
out_buff[2] = bias[out_ch + 2 + mult_tile];
out_buff[3] = bias[out_ch + 3 + mult_tile];
}
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#pragma clang loop unroll(disable)
#endif
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
++ker_w, ker_idx += output_ch)
{
// TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register
// spills. Try with unroll of 2 when enabling this.
int32_t in_val = input[in_idx + ker_w * input_ch];
out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
}
}
out_buff32[0] =
arm_nn_requantize_s64(out_buff[0], out_buff32[0], output_shift[out_ch + 0 + mult_tile]);
out_buff32[1] =
arm_nn_requantize_s64(out_buff[1], out_buff32[1], output_shift[out_ch + 1 + mult_tile]);
out_buff32[2] =
arm_nn_requantize_s64(out_buff[2], out_buff32[2], output_shift[out_ch + 2 + mult_tile]);
out_buff32[3] =
arm_nn_requantize_s64(out_buff[3], out_buff32[3], output_shift[out_ch + 3 + mult_tile]);
out_buff32[0] = MIN(MAX(out_buff32[0], output_activation_min), output_activation_max);
out_buff32[1] = MIN(MAX(out_buff32[1], output_activation_min), output_activation_max);
out_buff32[2] = MIN(MAX(out_buff32[2], output_activation_min), output_activation_max);
out_buff32[3] = MIN(MAX(out_buff32[3], output_activation_min), output_activation_max);
output[out_idx++] = (int16_t)out_buff32[0];
output[out_idx++] = (int16_t)out_buff32[1];
output[out_idx++] = (int16_t)out_buff32[2];
output[out_idx++] = (int16_t)out_buff32[3];
}
}
}
}
}
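/*
 * Note (explanatory addition): in the kernel above, out_buff32[] first caches
 * the REDUCE_MULTIPLIER() results and is then reused for the requantized
 * outputs, so the same four values serve as multiplier cache and result
 * buffer; arm_nn_requantize_s64() scales each 64-bit accumulator back to the
 * 16-bit output range with the matching reduced multiplier and shift.
 */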
static void depthwise_conv_s16_generic_s16(const int16_t *input,
const uint16_t input_batches,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const int8_t *kernel,
const uint16_t ch_mult,
const uint16_t kernel_x,
const uint16_t kernel_y,
const uint16_t pad_x,
const uint16_t pad_y,
const uint16_t stride_x,
const uint16_t stride_y,
const int64_t *bias,
int16_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const uint16_t dilation_x,
const uint16_t dilation_y)
{
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]);
int64_t acc_0 = 0;
int ker_y_start;
int ker_x_start;
int ker_y_end;
int ker_x_end;
if (dilation_x > 1)
{
const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
ker_x_start = MAX(0, start_x_max);
const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
ker_x_end = MIN(kernel_x, end_min_x);
}
else
{
ker_x_start = MAX(0, -base_idx_x);
ker_x_end = MIN(kernel_x, input_x - base_idx_x);
}
if (dilation_y > 1)
{
const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
ker_y_start = MAX(0, start_y_max);
const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
ker_y_end = MIN(kernel_y, end_min_y);
}
else
{
ker_y_start = MAX(0, -base_idx_y);
ker_y_end = MIN(kernel_y, input_y - base_idx_y);
}
if (bias)
{
acc_0 = bias[idx_out_ch];
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + dilation_y * i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + dilation_x * i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += input[idx_0] * kernel[ker_idx_0];
}
}
/* Requantize and clamp output to provided range */
int32_t result = arm_nn_requantize_s64(acc_0, reduced_multiplier, output_shift[idx_out_ch]);
result = MAX(result, output_activation_min);
result = MIN(result, output_activation_max);
*output++ = (int16_t)result;
}
}
}
}
/* Advance to the next batch */
input += (input_x * input_y * input_ch);
}
}
/*
* Basic s16 depthwise convolution function.
*
* Refer header file for details.
*
*/
arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int64_t *bias,
const cmsis_nn_dims *output_dims,
q15_t *output)
{
const uint16_t dilation_x = dw_conv_params->dilation.w;
const uint16_t dilation_y = dw_conv_params->dilation.h;
(void)bias_dims;
(void)ctx;
depthwise_conv_s16_generic_s16(input,
input_dims->n,
input_dims->w,
input_dims->h,
input_dims->c,
kernel,
dw_conv_params->ch_mult,
filter_dims->w,
filter_dims->h,
dw_conv_params->padding.w,
dw_conv_params->padding.h,
dw_conv_params->stride.w,
dw_conv_params->stride.h,
bias,
output,
quant_params->shift,
quant_params->multiplier,
output_dims->w,
output_dims->h,
dw_conv_params->activation.min,
dw_conv_params->activation.max,
dilation_x,
dilation_y);
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/


@@ -0,0 +1,347 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8.c
* Description: s8 version of depthwise convolution.
*
* $Date: 30. Dec 2021
* $Revision: V.2.7.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void depthwise_conv_s8_mult_4(const int8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const int8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
int8_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff[4] = {0, 0, 0, 0};
if (bias)
{
out_buff[0] = bias[out_ch + 0 + mult_tile];
out_buff[1] = bias[out_ch + 1 + mult_tile];
out_buff[2] = bias[out_ch + 2 + mult_tile];
out_buff[3] = bias[out_ch + 3 + mult_tile];
}
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#pragma clang loop unroll(disable)
#endif
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
++ker_w, ker_idx += output_ch)
{
int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
}
}
#if defined(ARM_MATH_MVEI)
(void)out_idx;
int32x4_t res = vldrwq_s32(out_buff);
res = arm_requantize_mve_32x4(res,
vldrwq_s32(&output_mult[out_ch + mult_tile]),
vldrwq_s32(&output_shift[out_ch + mult_tile]));
res = vaddq_n_s32(res, output_offset);
res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
res = vminq_s32(res, vdupq_n_s32(output_activation_max));
vstrbq_s32(output, res);
output += 4;
#else
out_buff[0] = arm_nn_requantize(
out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]);
out_buff[1] = arm_nn_requantize(
out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]);
out_buff[2] = arm_nn_requantize(
out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]);
out_buff[3] = arm_nn_requantize(
out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]);
out_buff[0] += output_offset;
out_buff[1] += output_offset;
out_buff[2] += output_offset;
out_buff[3] += output_offset;
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff[0];
output[out_idx++] = (int8_t)out_buff[1];
output[out_idx++] = (int8_t)out_buff[2];
output[out_idx++] = (int8_t)out_buff[3];
#endif
}
}
}
}
}
static void depthwise_conv_s8_generic(const q7_t *input,
const uint16_t input_batches,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const q7_t *kernel,
const uint16_t output_ch,
const uint16_t ch_mult,
const uint16_t kernel_x,
const uint16_t kernel_y,
const uint16_t pad_x,
const uint16_t pad_y,
const uint16_t stride_x,
const uint16_t stride_y,
const int32_t *bias,
q7_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t output_activation_min,
const int32_t output_activation_max,
const uint16_t dilation_x,
const uint16_t dilation_y)
{
(void)output_ch;
int i_out = 0;
int i_batch;
for (i_batch = 0; i_batch < input_batches; i_batch++)
{
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
int32_t acc_0 = 0;
int ker_y_start;
int ker_x_start;
int ker_y_end;
int ker_x_end;
if (dilation_x > 1)
{
const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
ker_x_start = MAX(0, start_x_max);
const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
ker_x_end = MIN(kernel_x, end_min_x);
}
else
{
ker_x_start = MAX(0, -base_idx_x);
ker_x_end = MIN(kernel_x, input_x - base_idx_x);
}
if (dilation_y > 1)
{
const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
ker_y_start = MAX(0, start_y_max);
const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
ker_y_end = MIN(kernel_y, end_min_y);
}
else
{
ker_y_start = MAX(0, -base_idx_y);
ker_y_end = MIN(kernel_y, input_y - base_idx_y);
}
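/* Worked example with hypothetical values: for base_idx_x = -3 and dilation_x = 2,
ker_x_start = ceil(3 / 2) = 2, so the first kernel tap inside the input lands at
base_idx_x + 2 * dilation_x = 1. The end bound is derived the same way so that the
last tap, base_idx_x + (ker_x_end - 1) * dilation_x, stays below input_x. */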
if (bias)
{
acc_0 = bias[idx_out_ch];
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + dilation_y * i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + dilation_x * i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0];
}
}
/* Requantize and clamp output to provided range */
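/* arm_nn_requantize performs the TFLite-style per-channel fixed-point rescale:
roughly acc * output_mult * 2^output_shift / 2^31 with round-to-nearest, where a
negative output_shift denotes a right shift. */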
acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]);
acc_0 += output_offset;
acc_0 = MAX(acc_0, output_activation_min);
acc_0 = MIN(acc_0, output_activation_max);
output[i_out++] = acc_0;
}
}
}
}
/* Advance to the next batch */
input += (input_x * input_y * input_ch);
}
}
/*
* Basic s8 depthwise convolution function.
*
* Refer to the header file for details.
* Optimization using the DSP extension is not available for the generic case where the channel multiplier is > 1.
*
*/
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
const uint16_t dilation_x = dw_conv_params->dilation.w;
const uint16_t dilation_y = dw_conv_params->dilation.h;
(void)bias_dims;
(void)ctx;
if (dw_conv_params->ch_mult % 4 == 0 && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
depthwise_conv_s8_mult_4(input,
input_dims->w,
input_dims->h,
input_dims->c,
kernel,
output_dims->c,
dw_conv_params->ch_mult,
filter_dims->w,
filter_dims->h,
dw_conv_params->padding.w,
dw_conv_params->padding.h,
dw_conv_params->stride.w,
dw_conv_params->stride.h,
bias,
output,
quant_params->shift,
quant_params->multiplier,
output_dims->w,
output_dims->h,
dw_conv_params->output_offset,
dw_conv_params->input_offset,
dw_conv_params->activation.min,
dw_conv_params->activation.max);
}
else
{
depthwise_conv_s8_generic(input,
input_dims->n,
input_dims->w,
input_dims->h,
input_dims->c,
kernel,
output_dims->c,
dw_conv_params->ch_mult,
filter_dims->w,
filter_dims->h,
dw_conv_params->padding.w,
dw_conv_params->padding.h,
dw_conv_params->stride.w,
dw_conv_params->stride.h,
bias,
output,
quant_params->shift,
quant_params->multiplier,
output_dims->w,
output_dims->h,
dw_conv_params->output_offset,
dw_conv_params->input_offset,
dw_conv_params->activation.min,
dw_conv_params->activation.max,
dilation_x,
dilation_y);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,433 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8_opt.c
* Description: Optimized s8 depthwise separable convolution function for
* channel multiplier of 1.
*
* $Date: January 26, 2021
* $Revision: V.2.0.3
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with the constraint that in_channel equals out_channel
*
* Refer to the prototype header file for details.
*
*/
arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
/* Check the input constraint: input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
if (ctx->buf == NULL && arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
#ifdef ARM_MATH_DSP
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
q15_t *buffer_a = (q15_t *)ctx->buf;
#ifdef ARM_MATH_MVEI
(void)bias_dims;
/* Generate four columns from the input tensor */
q7_t *lhs_buffer = (q7_t *)buffer_a;
q7_t *out = output;
int padded = 0;
int buffer_count = 0;
const int32_t kernel_size = kernel_x * kernel_y;
/* This part implements the im2col function */
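/* Each buffered column is the receptive field of one output pixel, laid out as
kernel_y * kernel_x taps of input_ch bytes each; padded taps are filled with
-input_offset so they contribute zero once the offset is re-added. Four such
columns are collected before the vectorized kernel is invoked. */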
for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
{
for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
{
for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
{
for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch);
}
lhs_buffer += input_ch;
}
}
buffer_count++;
if (buffer_count == 4)
{
lhs_buffer = (q7_t *)buffer_a;
if (padded == 0)
{
out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
}
else
{
out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
padded = 0;
}
buffer_count = 0;
}
}
}
/* Handle left over buffers */
lhs_buffer = (q7_t *)buffer_a;
for (int i_buf = 0; i_buf < buffer_count; i_buf++)
{
int32_t loop_count = (input_ch + 3) / 4;
int32_t num_ch_to_process = input_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
{
const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
const int8_t *row_0 = kernel + offset;
int32x4_t out_0 = vldrwq_s32(&bias[offset]);
for (int i_ker = 0; i_ker < kernel_size; i_ker++)
{
const int32x4_t ker_0 = vldrbq_s32(row_0);
int32x4_t ip_0 = vldrbq_s32(col_0);
ip_0 = vaddq_n_s32(ip_0, input_offset);
out_0 += vmulq_s32(ip_0, ker_0);
col_0 += input_ch;
row_0 += input_ch;
}
const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, output_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
vstrbq_p_s32(out, out_0, p);
out += 4;
}
const int tail_ch = input_ch & 0x3;
if (tail_ch != 0)
{
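/* Each pass of the loop above advanced 'out' by 4, but the final predicated
store only wrote 'tail_ch' values, so rewind the overshoot. */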
out -= (4 - tail_ch);
}
}
#else // ARM_MATH_MVEI
(void)bias_dims;
/* Run the following code on cores with the DSP extension but without MVE */
q15_t *const col_buffer_start = buffer_a;
q15_t *col_buffer = col_buffer_start;
const int32_t *const bias_start_pos = bias;
const q31_t *const out_mult_start_pos = output_mult;
const q31_t *const out_shift_start_pos = output_shift;
uint16_t row_count;
uint16_t row_shift;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
/* Out of bounds is only considered for the y axis, as it provides a contiguous zeroing opportunity,
unlike along the x axis */
const int ker_y_start = MAX(0, -base_idx_y);
/* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
int32_t index = 0;
if (ker_y_start != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
index += (kernel_x * input_ch) * ker_y_start;
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
if (idx_x < 0 || idx_x >= input_x)
{
memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
}
else
{
arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch,
&col_buffer[index],
input_ch,
input_offset);
}
index += input_ch;
}
}
const int diff = kernel_y - ker_y_end;
if (diff != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
}
row_count = output_ch / 4;
row_shift = 0;
bias = bias_start_pos;
output_mult = out_mult_start_pos;
output_shift = out_shift_start_pos;
while (row_count)
{
q31_t sum = *bias++;
q31_t sum_2 = *bias++;
q31_t sum_3 = *bias++;
q31_t sum_4 = *bias++;
uint16_t col_count = (kernel_x * kernel_y) / 2;
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
row_shift += 4;
while (col_count)
{
/* The general idea is to read a 4 + 4 (input, kernel) pair and re-arrange the values in the right
order for use in an SMLAD instruction. One run of this loop produces 4 partial outputs with 8 MACs. */
/* Note: variable names could be improved here to align with rows and columns. */
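/* SMLAD example: if op_c packs two q15 inputs {a0, a1} and op_b packs two q15
weights {w0, w1}, __SMLAD(op_c, op_b, sum) computes sum + a0 * w0 + a1 * w1
in a single instruction. */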
q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c;
/* Read 4 weights */
ip_b1 = arm_nn_read_q7x4(row_pos);
ip_a1 = arm_nn_read_q7x4(row_pos + input_ch);
op_a = arm_nn_read_q15x2(col_pos);
op_b = arm_nn_read_q15x2(col_pos + input_ch);
ip_a2 = __SXTB16(ip_b1);
ip_b1 = __SXTB16(__ROR(ip_b1, 8));
ip_b2 = __SXTB16(ip_a1);
ip_a1 = __SXTB16(__ROR(ip_a1, 8));
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHBT(ip_b2, ip_a2, 16);
sum = __SMLAD(op_c, op_b, sum);
op_b = __PKHBT(ip_b1, ip_a1, 16);
sum_2 = __SMLAD(op_a, op_b, sum_2);
op_a = arm_nn_read_q15x2(col_pos + 2);
op_b = arm_nn_read_q15x2(col_pos + input_ch + 2);
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHTB(ip_a2, ip_b2, 16);
sum_3 = __SMLAD(op_c, op_b, sum_3);
op_b = __PKHTB(ip_a1, ip_b1, 16);
sum_4 = __SMLAD(op_a, op_b, sum_4);
row_pos += input_ch << 1;
col_pos += input_ch << 1;
col_count--;
}
col_count = (kernel_x * kernel_y) & 0x1;
while (col_count)
{
sum += row_pos[0] * col_pos[0];
sum_2 += row_pos[1] * col_pos[1];
sum_3 += row_pos[2] * col_pos[2];
sum_4 += row_pos[3] * col_pos[3];
row_pos += input_ch;
col_pos += input_ch;
col_count--;
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++);
sum_2 += output_offset;
sum_2 = MAX(sum_2, output_activation_min);
sum_2 = MIN(sum_2, output_activation_max);
*output++ = (q7_t)sum_2;
sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++);
sum_3 += output_offset;
sum_3 = MAX(sum_3, output_activation_min);
sum_3 = MIN(sum_3, output_activation_max);
*output++ = (q7_t)sum_3;
sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++);
sum_4 += output_offset;
sum_4 = MAX(sum_4, output_activation_min);
sum_4 = MIN(sum_4, output_activation_max);
*output++ = (q7_t)sum_4;
row_count--;
}
row_count = output_ch & 0x3;
while (row_count)
{
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
q31_t sum = *bias++;
const uint16_t col_count = (kernel_x * kernel_y);
row_shift += 1;
for (int i = 0; i < col_count; i++)
{
sum += row_pos[i * input_ch] * col_pos[i * input_ch];
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
row_count--;
}
// clear counter and pointers
col_buffer = col_buffer_start;
}
}
#endif
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
return arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
kernel,
bias_dims,
bias,
output_dims,
output);
#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
/* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions. */
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4;
#elif defined(ARM_MATH_DSP)
return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
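/* Illustrative sizing sketch (not part of the library; names and values below are
the application's own): the scratch buffer is typically sized with this function
and handed in through the context, e.g.

cmsis_nn_context ctx;
ctx.size = arm_depthwise_conv_s8_opt_get_buffer_size(&input_dims, &filter_dims);
ctx.buf = ctx.size > 0 ? malloc(ctx.size) : NULL;

On MVE builds the extra 4 bytes in the returned size keep the deliberate overread
of the nt_t kernels inside the allocation. */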
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,336 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_u8_basic_ver1.c
* Description: u8 depthwise convolution function
*
* $Date: 09. October 2020
* $Revision: V.1.1.1
*
* Target : Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void depthwise_conv_u8_mult_4(const uint8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const uint8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
uint8_t *output,
const int32_t output_shift,
const int32_t output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff[4];
out_buff[0] = 0;
out_buff[1] = 0;
out_buff[2] = 0;
out_buff[3] = 0;
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
++ker_w, ker_idx += output_ch)
{
int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
out_buff[0] += in_val * (kernel[ker_idx + 0 + mult_tile] + filter_offset);
out_buff[1] += in_val * (kernel[ker_idx + 1 + mult_tile] + filter_offset);
out_buff[2] += in_val * (kernel[ker_idx + 2 + mult_tile] + filter_offset);
out_buff[3] += in_val * (kernel[ker_idx + 3 + mult_tile] + filter_offset);
}
}
if (bias != NULL)
{
out_buff[0] += bias[out_ch + 0 + mult_tile];
out_buff[1] += bias[out_ch + 1 + mult_tile];
out_buff[2] += bias[out_ch + 2 + mult_tile];
out_buff[3] += bias[out_ch + 3 + mult_tile];
}
out_buff[0] = arm_nn_requantize(out_buff[0], output_mult, output_shift);
out_buff[1] = arm_nn_requantize(out_buff[1], output_mult, output_shift);
out_buff[2] = arm_nn_requantize(out_buff[2], output_mult, output_shift);
out_buff[3] = arm_nn_requantize(out_buff[3], output_mult, output_shift);
out_buff[0] += output_offset;
out_buff[1] += output_offset;
out_buff[2] += output_offset;
out_buff[3] += output_offset;
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);
output[out_idx++] = (uint8_t)out_buff[0];
output[out_idx++] = (uint8_t)out_buff[1];
output[out_idx++] = (uint8_t)out_buff[2];
output[out_idx++] = (uint8_t)out_buff[3];
}
}
}
}
}
static void depthwise_conv_u8_generic(const uint8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const uint8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
uint8_t *output,
const int32_t output_shift,
const int32_t output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
(void)output_ch;
int i_out = 0;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
int32_t acc_0;
/* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */
const int ker_y_start = MAX(0, -base_idx_y);
const int ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
const int ker_x_end = MIN(kernel_x, input_x - base_idx_x);
acc_0 = 0;
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset);
}
}
if (bias != NULL)
{
acc_0 += bias[idx_out_ch];
}
/* Requantize and clamp output to provided range */
acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift);
acc_0 += output_offset;
acc_0 = MAX(acc_0, output_activation_min);
acc_0 = MIN(acc_0, output_activation_max);
output[i_out++] = acc_0;
}
}
}
}
}
/**
* @brief uint8 depthwise convolution function with asymmetric quantization
*
* @param[in] input Pointer to input tensor
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_ch Channels in input tensor
* @param[in] kernel Pointer to kernel weights
* @param[in] kernel_x Width of kernel
* @param[in] kernel_y Height of kernel
* @param[in] ch_mult Channel multiplier, i.e., number of output channels per input channel
* @param[in] pad_x Padding sizes x
* @param[in] pad_y Padding sizes y
* @param[in] stride_x Convolution stride along the width
* @param[in] stride_y Convolution stride along the height
* @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
* @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
* @param[in] bias Pointer to optional bias values. If no bias is
* available, NULL is expected
* @param[in] input_offset Input tensor zero offset
* @param[in] filter_offset Kernel tensor zero offset
* @param[in] output_offset Output tensor zero offset
* @param[in,out] output Pointer to output tensor
* @param[in] output_x Width of output tensor
* @param[in] output_y Height of output tensor
* @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
* @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
* @param[in] output_shift Amount of right-shift for output
* @param[in] output_mult Output multiplier for requantization
* @return The function returns one of the following
* <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported tensor dimensions
* <code>ARM_MATH_SUCCESS</code> - Successful operation
* <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
*
*
*/
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const uint8_t *kernel,
const uint16_t kernel_x,
const uint16_t kernel_y,
const int16_t ch_mult,
const int16_t pad_x,
const int16_t pad_y,
const int16_t stride_x,
const int16_t stride_y,
const int16_t dilation_x,
const int16_t dilation_y,
const int32_t *bias,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_offset,
uint8_t *output,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const int32_t output_shift,
const int32_t output_mult)
{
(void)dilation_x;
(void)dilation_y;
if (ch_mult % 4 == 0)
{
depthwise_conv_u8_mult_4(input,
input_x,
input_y,
input_ch,
kernel,
ch_mult * input_ch,
ch_mult,
kernel_x,
kernel_y,
pad_x,
pad_y,
stride_x,
stride_y,
bias,
output,
output_shift,
output_mult,
output_x,
output_y,
output_offset,
input_offset,
filter_offset,
output_activation_min,
output_activation_max);
}
else
{
depthwise_conv_u8_generic(input,
input_x,
input_y,
input_ch,
kernel,
ch_mult * input_ch,
ch_mult,
kernel_x,
kernel_y,
pad_x,
pad_y,
stride_x,
stride_y,
bias,
output,
output_shift,
output_mult,
output_x,
output_y,
output_offset,
input_offset,
filter_offset,
output_activation_min,
output_activation_max);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,135 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_wrapper_s8.c
* Description: Wrapper API to select appropriate depthwise conv API based
* on dimensions.
*
* $Date: 20. Dec 2021
* $Revision: V.1.4.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* s8 Depthwise conv wrapper function
*
* Refer to the header file for details.
*
*/
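/* Dispatch summary: ch_mult == 1, a single batch and no dilation select an
optimized kernel (the 3x3 special case only when MVE is not available);
everything else falls back to the generic arm_depthwise_conv_s8. */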
arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *filter,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
arm_status status = ARM_MATH_SUCCESS;
if (1 == dw_conv_params->ch_mult && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
#if !defined(ARM_MATH_MVEI)
if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) &&
(dw_conv_params->padding.w <= 1))
{
status = arm_depthwise_conv_3x3_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
else
#endif
{
status = arm_depthwise_conv_s8_opt(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
}
else
{
status = arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
/* Return to application */
return status;
}
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
(void)dw_conv_params;
int32_t size = 0;
if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims);
}
return size;
}
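/* Illustrative call sequence (not part of the library; every literal below is
hypothetical, and dw_params, quant_params, bias_dims and the data pointers are
assumed to be filled in by the application). A batch-1, ch_mult-1, dilation-1
case is routed to one of the optimized kernels above:

cmsis_nn_dims in_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
cmsis_nn_dims flt_dims = {.n = 1, .h = 3, .w = 3, .c = 8};
cmsis_nn_dims out_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
cmsis_nn_context ctx;
ctx.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(&dw_params, &in_dims, &flt_dims, &out_dims);
ctx.buf = ctx.size > 0 ? malloc(ctx.size) : NULL;
arm_status status = arm_depthwise_conv_wrapper_s8(&ctx, &dw_params, &quant_params, &in_dims, input, &flt_dims, filter, &bias_dims, bias, &out_dims, output);
*/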
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,422 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7.c
* Description: Q7 depthwise separable convolution function
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
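* (for example, with hypothetical ch_im_in = 16 and dim_kernel = 3, bufferA must hold 2*16*3*3 = 288 q15 entries)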
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in equals ch_im_out
*
* Implementation:
* There are 3 nested loops here:
* Inner loop: calculate each output value with MAC instruction over an accumulator
* Mid loop: loop over different output channel
* Outer loop: loop over different output (x, y)
*/
arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x;
int16_t i_ker_y, i_ker_x;
q7_t *colBuffer = (q7_t *)bufferA;
q7_t *pBuffer = colBuffer;
const q7_t *pBias = bias;
q7_t *pOut = Im_out;
uint16_t rowCnt;
uint16_t row_shift;
/* Sanity check: ch_im_in must equal ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* we first do im2col here */
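/* For q7 the padding value is plain zero (this legacy API has no input offset),
so a memset with 0 is sufficient. */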
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q7(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, ch_im_in);
}
else
{
/* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
*/
memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* we will do the computation here for each channel */
rowCnt = ch_im_out >> 2;
row_shift = 0;
pBias = bias;
while (rowCnt)
{
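/* The q7 bias is promoted to q31 by bias_shift; NN_ROUND(out_shift) pre-adds
half of the final LSB, i.e. (1 << out_shift) >> 1, so the '>> out_shift' at the
store below rounds to nearest instead of truncating. */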
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel * dim_kernel) >> 1;
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
row_shift += 4;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHTB(opB, inB1, 16);
inB1 = __PKHBT(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHTB(opB, inA1, 16);
inA1 = __PKHBT(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum3 = __SMLAD(opA, opB, sum3);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum4 = __SMLAD(opA, opB, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHBT(opB, inB1, 16);
inB1 = __PKHTB(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHBT(opB, inA1, 16);
inA1 = __PKHTB(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum4 = __SMLAD(opA, opB, sum4);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum3 = __SMLAD(opA, opB, sum3);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
#ifndef ARM_MATH_BIG_ENDIAN
/*
* r0 r1 r2 r3 r4 r5
* inA1, inA2, inB1, inB2, opA, opB
*/
asm volatile("COL_LOOP_%=:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhtb r3, r5, r2, ASR #16\n"
"pkhbt r2, r2, r5, LSL #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhtb r1, r5, r0, ASR #16\n"
"pkhbt r0, r0, r5, LSL #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum], r4, r5, %[sum]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#else
/*
* r0 r1 r2 r3 r4 r5
* inA1, inA2, inB1, inB2, opA, opB
*/
asm volatile("COL_LOOP_%=:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhbt r3, r5, r2, LSL #16\n"
"pkhtb r2, r2, r5, ASR #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhbt r1, r5, r0, LSL #16\n"
"pkhtb r0, r0, r5, ASR #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum], r4, r5, %[sum]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = (dim_kernel * dim_kernel) & 0x1;
while (colCnt)
{
union arm_nnword inA, inB;
inA.word = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inB.word = arm_nn_read_q7x4(pB);
pB += ch_im_in;
sum += inA.bytes[0] * inB.bytes[0];
sum2 += inA.bytes[1] * inB.bytes[1];
sum3 += inA.bytes[2] * inB.bytes[2];
sum4 += inA.bytes[3] * inB.bytes[3];
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = ch_im_out & 0x3;
while (rowCnt)
{
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel * dim_kernel);
row_shift += 1;
while (colCnt)
{
q7_t A1 = *pA;
q7_t B1 = *pB;
pA += ch_im_in;
pB += ch_im_in;
sum += A1 * B1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
rowCnt--;
}
/* clear counter and pointers */
pBuffer = colBuffer;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y;
int conv_out;
/* Sanity check: ch_im_in must equal ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
// for each output
conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++)
{
int in_row = stride * i_out_y + i_ker_y - padding;
int in_col = stride * i_out_x + i_ker_x - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] *
wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out];
}
}
}
Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] =
(q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,427 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c
* Description: Q7 depthwise separable convolution function (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 depthwise separable convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding sizes x
* @param[in] padding_y padding sizes y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is the version with the full list of optimization tricks, but with
* some constraints:
* ch_im_in is equal to ch_im_out
*
*/
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
/*
* Implementation:
* There are 3 nested loops here:
* Inner loop: calculate each output value with MAC instruction over an accumulator
* Mid loop: loop over different output channel
* Outer loop: loop over different output (x, y)
*
*/
int16_t i_out_y, i_out_x;
int16_t i_ker_y, i_ker_x;
q7_t *colBuffer = (q7_t *)bufferA;
q7_t *pBuffer = colBuffer;
const q7_t *pBias = bias;
q7_t *pOut = Im_out;
uint16_t rowCnt;
uint16_t row_shift;
/* Sanity check: ch_im_in must equal ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* we first do im2col here */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q7(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, ch_im_in);
}
else
{
/* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* we will do the computation here for each channel */
rowCnt = ch_im_out >> 2;
row_shift = 0;
pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1;
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
row_shift += 4;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHTB(opB, inB1, 16);
inB1 = __PKHBT(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHTB(opB, inA1, 16);
inA1 = __PKHBT(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum3 = __SMLAD(opA, opB, sum3);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum4 = __SMLAD(opA, opB, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHBT(opB, inB1, 16);
inB1 = __PKHTB(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHBT(opB, inA1, 16);
inA1 = __PKHTB(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum4 = __SMLAD(opA, opB, sum4);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum3 = __SMLAD(opA, opB, sum3);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
#ifndef ARM_MATH_BIG_ENDIAN
// r0 r1 r2 r3 r4 r5
// inA1, inA2, inB1, inB2, opA, opB
asm volatile("COL_LOOP:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhtb r3, r5, r2, ASR #16\n"
"pkhbt r2, r2, r5, LSL #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhtb r1, r5, r0, ASR #16\n"
"pkhbt r0, r0, r5, LSL #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum], r4, r5, %[sum]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#else
// r0 r1 r2 r3 r4 r5
// inA1, inA2, inB1, inB2, opA, opB
asm volatile("COL_LOOP:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhbt r3, r5, r2, LSL #16\n"
"pkhtb r2, r2, r5, ASR #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhbt r1, r5, r0, LSL #16\n"
"pkhtb r0, r0, r5, ASR #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum], r4, r5, %[sum]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#endif /*ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = (dim_kernel_x * dim_kernel_y) & 0x1;
while (colCnt)
{
union arm_nnword inA, inB;
inA.word = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inB.word = arm_nn_read_q7x4(pB);
pB += ch_im_in;
sum += inA.bytes[0] * inB.bytes[0];
sum2 += inA.bytes[1] * inB.bytes[1];
sum3 += inA.bytes[2] * inB.bytes[2];
sum4 += inA.bytes[3] * inB.bytes[3];
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = ch_im_out & 0x3;
while (rowCnt)
{
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel_x * dim_kernel_y);
row_shift += 1;
while (colCnt)
{
q7_t A1 = *pA;
q7_t B1 = *pB;
pA += ch_im_in;
pB += ch_im_in;
sum += A1 * B1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
rowCnt--;
}
// clear counter and pointers
pBuffer = colBuffer;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i_out_y, i_out_x, i_ch_out;
int i_ker_y, i_ker_x;
/* Sanity check: ch_im_in must equal ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
// for each output
int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++)
{
int in_row = stride_y * i_out_y + i_ker_y - padding_y;
int in_col = stride_x * i_out_x + i_ker_x - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] *
wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out];
}
}
}
Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] =
(q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,218 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_s8_core.c
* Description: Depthwise convolution on im2col buffers.
*
* $Date: 09. October 2020
* $Revision: V.1.0.4
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/*
* Depthwise conv on an im2col buffer where the input channel equals
* output channel.
*
* Refer to the header file for details.
*
*/
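/* Processing scheme (MVE path): 4 channels are handled per iteration, and the two
buffered im2col columns share every reloaded kernel vector, so each kernel fetch
feeds two output pixels. The kernel loop is unrolled by a factor of 3. */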
q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
const q15_t *col,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t kernel_size,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
int32_t ch_per_loop = num_ch / 4;
const int32_t *bias = output_bias;
int8_t *out_tmp = out;
int32_t idx = 0;
while (ch_per_loop > 0)
{
int32x4_t ip_0;
int32x4_t ip_1;
int32_t ker_loop = kernel_size / 3;
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
bias += 4;
const int32_t offset = idx * 4;
const int8_t *row_0 = row + offset;
const int16_t *col_0 = col + offset;
const int16_t *col_1 = col + kernel_size * num_ch + offset;
int32x4_t ker_0 = vldrbq_s32(row_0);
while (ker_loop > 0)
{
const int8_t *row_1 = row_0 + num_ch;
const int8_t *row_2 = row_0 + 2 * num_ch;
const int32x4_t ker_1 = vldrbq_s32(row_1);
const int32x4_t ker_2 = vldrbq_s32(row_2);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_0);
out_1 += vmulq_s32(ip_1, ker_0);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_1);
out_1 += vmulq_s32(ip_1, ker_1);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_2);
out_1 += vmulq_s32(ip_1, ker_2);
row_0 += 3 * num_ch;
ker_0 = vldrbq_s32(row_0);
ker_loop--;
}
idx++;
/* Handle tail kernel elements */
ker_loop = kernel_size - ((kernel_size / 3) * 3);
while (ker_loop > 0)
{
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
out_0 += vmulq_s32(ip_0, ker_0);
out_1 += vmulq_s32(ip_1, ker_0);
col_0 += num_ch;
col_1 += num_ch;
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
row_0 += num_ch;
ker_0 = vldrbq_s32(row_0);
ker_loop--;
}
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
vstrbq_s32(out_tmp, out_0);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_s32(out_tmp + num_ch, out_1);
out_tmp += 4;
ch_per_loop--;
}
int32_t tail_ch = num_ch & 3;
if (tail_ch != 0)
{
int32_t ch_idx = (num_ch & ~3);
int32x4_t col_0_sum;
int32x4_t col_1_sum;
const int32_t single_buffer_size = kernel_size * num_ch;
for (int i = 0; i < tail_ch; i++)
{
const int16_t *col_pos_0 = col + ch_idx;
const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
const int8_t *row_pos = row + ch_idx;
int32_t sum_0 = bias[i];
int32_t sum_1 = bias[i];
for (int j = 0; j < kernel_size; j++)
{
const int8_t row_val = row_pos[j * num_ch];
sum_0 += row_val * col_pos_0[j * num_ch];
sum_1 += row_val * col_pos_1[j * num_ch];
}
col_0_sum[i] = sum_0;
col_1_sum[i] = sum_1;
ch_idx++;
}
const mve_pred16_t p = vctp32q((uint32_t)tail_ch);
const int32x4_t mult = vldrwq_z_s32(out_mult, p);
const int32x4_t shift = vldrwq_z_s32(out_shift, p);
col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift);
col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift);
col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_tmp, col_0_sum, p);
col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);
out_tmp += tail_ch;
}
return out_tmp + num_ch;
#else
(void)row;
(void)col;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)kernel_size;
(void)output_bias;
(void)out;
return NULL;
#endif
}

View File

@@ -0,0 +1,186 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15.c
* Description: Matrix-multiplication function for convolution
*
* $Date: January 26, 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @brief Matrix-multiplication function for convolution.
*
* @details Refer to the header file for details.
*
*/
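/* Blocking scheme: each iteration multiplies two rows of A (weights) with the two
im2col columns in pInBuffer, producing a 2x2 tile of outputs (sum..sum4) so that
every loaded operand is reused twice. */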
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
const q15_t *pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut)
{
#if defined(ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *pOut2 = pOut + ch_im_out;
const q7_t *pBias = bias;
uint16_t rowCnt = ch_im_out >> 1;
/* this loop over rows in A */
while (rowCnt)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* align the second pointer for A */
const q7_t *pA2 = pA + numCol_A;
/* init the sum with bias */
q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA11, inA12, inA21, inA22;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad(pA, &inA11, &inA12);
pA2 = read_and_pad(pA2, &inA21, &inA22);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
sum3 = __SMLAD(inA21, inB1, sum3);
sum4 = __SMLAD(inA21, inB2, sum4);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
sum3 = __SMLAD(inA22, inB1, sum3);
sum4 = __SMLAD(inA22, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q7_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
/* skip the row computed with A2 */
pA += numCol_A;
rowCnt--;
} /* while over rowCnt */
/* compute left-over row if any */
if (ch_im_out & 0x1)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* load the bias */
q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
while (colCnt)
{
q31_t inA11, inA12;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad(pA, &inA11, &inA12);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
colCnt--;
}
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
}
pOut += ch_im_out;
/* return the new output pointer with offset */
return pOut;
#else
(void)pA;
(void)pInBuffer;
(void)ch_im_out;
(void)numCol_A;
(void)bias_shift;
(void)out_shift;
(void)bias;
(void)pOut;
/* To be completed */
return NULL;
#endif /* ARM_MATH_DSP */
}

View File

@@ -0,0 +1,137 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15_reordered.c
* Description: Matrix-multiplication function for convolution with reordered columns
*
* $Date: January 26, 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @brief Matrix-multiplication function for convolution with re-ordered input.
*
* @details Refer to the header file for details.
*
*/
q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
const q15_t *pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut)
{
#if defined(ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *pOut2 = pOut + ch_im_out;
int i;
/* loop over the rows of A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* align the second pointer for A */
const q7_t *pA2 = pA + numCol_A;
/* init the sum with bias */
q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA11, inA12, inA21, inA22;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad_reordered(pA, &inA11, &inA12);
pA2 = read_and_pad_reordered(pA2, &inA21, &inA22);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
sum3 = __SMLAD(inA21, inB1, sum3);
sum4 = __SMLAD(inA21, inB2, sum4);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
sum3 = __SMLAD(inA22, inB1, sum3);
sum4 = __SMLAD(inA22, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q7_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
/* skip the row computed with A2 */
pA += numCol_A;
} /* for over ch_im_out */
pOut += ch_im_out;
/* return the new output pointer with offset */
return pOut;
#else
(void)pA;
(void)pInBuffer;
(void)ch_im_out;
(void)numCol_A;
(void)bias_shift;
(void)out_shift;
(void)bias;
(void)pOut;
/* To be completed */
return NULL;
#endif /* ARM_MATH_DSP */
}

View File

@@ -0,0 +1,245 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s8_s16.c
* Description: Matrix-multiplication function for convolution
*
* $Date: 14. December 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/*
* Matrix-multiplication function for convolution with per-channel requantization.
*
* Refer to the header file for details.
*
*/
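/*
* Per output value the flow below amounts to:
*
*   acc = bias[ch] + dot(filter_row, input_col);
*   out = clamp(arm_nn_requantize(acc, out_mult[ch], out_shift[ch]) + out_offset,
*               activation_min, activation_max);
*
* with out_mult and out_shift advancing once per output channel.
*/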
q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
const q15_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t num_col_a,
const int32_t *const output_bias,
q7_t *out_0)
{
#if !defined(ARM_MATH_MVEI)
/* set up the second output pointers */
q7_t *out_1 = out_0 + output_ch;
const int32_t *bias = output_bias;
uint16_t row_count = output_ch / 2;
const q7_t *ip_a0 = input_a;
/* loop over the rows of A */
while (row_count)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* align the second pointer for A */
const q7_t *ip_a1 = ip_a0 + num_col_a;
q31_t ch_0_out_0 = 0;
q31_t ch_0_out_1 = 0;
q31_t ch_1_out_0 = 0;
q31_t ch_1_out_1 = 0;
/* Init accumulator with bias for channel N and N + 1 */
if (bias)
{
ch_0_out_0 = *bias;
ch_0_out_1 = *bias++;
ch_1_out_0 = *bias;
ch_1_out_1 = *bias++;
}
#if defined(ARM_MATH_DSP)
uint16_t col_count = num_col_a / 4;
/* accumulate over the vector */
while (col_count)
{
q31_t a01, a02, a11, a12;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad(ip_a0, &a01, &a02);
ip_a1 = read_and_pad(ip_a1, &a11, &a12);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
col_count--;
} /* while over col_count */
col_count = num_col_a & 0x3;
#else
uint16_t col_count = num_col_a;
#endif
while (col_count)
{
q7_t a0 = *ip_a0++;
q15_t b0 = *ip_b0++;
q7_t a1 = *ip_a1++;
q15_t b1 = *ip_b1++;
ch_0_out_0 += a0 * b0;
ch_0_out_1 += a0 * b1;
ch_1_out_0 += a1 * b0;
ch_1_out_1 += a1 * b1;
col_count--;
} /* while over col_count */
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
out_mult++;
out_shift++;
ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
ch_1_out_0 += out_offset;
ch_1_out_0 = MAX(ch_1_out_0, activation_min);
ch_1_out_0 = MIN(ch_1_out_0, activation_max);
*out_0++ = (q7_t)ch_1_out_0;
ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
ch_1_out_1 += out_offset;
ch_1_out_1 = MAX(ch_1_out_1, activation_min);
ch_1_out_1 = MIN(ch_1_out_1, activation_max);
*out_1++ = (q7_t)ch_1_out_1;
out_mult++;
out_shift++;
/* skip row */
ip_a0 += num_col_a;
row_count--;
}
/* compute the last odd numbered row if any */
if (output_ch & 0x1)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
q31_t ch_0_out_0 = 0;
q31_t ch_0_out_1 = 0;
/* load the bias */
if (bias)
{
ch_0_out_0 = *bias;
ch_0_out_1 = *bias++;
}
#if defined(ARM_MATH_DSP)
uint16_t col_count = num_col_a >> 2;
while (col_count)
{
q31_t a01, a02;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad(ip_a0, &a01, &a02);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
col_count--;
}
col_count = num_col_a & 0x3;
#else
uint16_t col_count = num_col_a;
#endif
while (col_count)
{
q7_t a0 = *ip_a0++;
q15_t b0 = *ip_b0++;
q15_t b1 = *ip_b1++;
ch_0_out_0 += a0 * b0;
ch_0_out_1 += a0 * b1;
col_count--;
}
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
out_mult++;
out_shift++;
}
out_0 += output_ch;
/* return the new output pointer with offset */
return out_0;
#else
(void)input_a;
(void)input_b;
(void)output_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)num_col_a;
(void)output_bias;
(void)out_0;
/* To be completed */
return NULL;
#endif
}

View File

@@ -0,0 +1,201 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c
* Description: Matrix-multiplication function for convolution with reordered columns
*
* $Date: 09. October 2020
* $Revision: V.1.0.3
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/*
* Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel
* requantization. The re-ordering is a consequence of the sign extension performed by the SXTB16 instruction.
*
* Refer to the header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16() in that it uses
* read_and_pad_reordered() instead of read_and_pad(). Investigating the cycle impact and
* unifying these two functions is a potential future improvement.
*
*/
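/*
* As an illustration (little-endian case): a q7x4 load of four consecutive
* weights w0..w3 packs them into one word as [w3|w2|w1|w0]. SXTB16
* sign-extends bytes 0 and 2 into the q15 pair (w0, w2), and
* SXTB16(ROR(x, 8)) yields (w1, w3). read_and_pad_reordered() therefore
* returns the pairs (w0, w2) and (w1, w3) without any extra shuffling,
* which is why the weights are interleaved offline to match.
*/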
q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
const q15_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t num_col_a,
const int32_t *const output_bias,
q7_t *out_0)
{
#if defined(ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *out_1 = out_0 + output_ch;
const int32_t *bias = output_bias;
uint16_t row_count = output_ch / 2;
const q7_t *ip_a0 = input_a;
/* loop over the rows of A */
while (row_count)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* align the second pointer for A */
const q7_t *ip_a1 = ip_a0 + num_col_a;
/* Init accumulator with bias for channel N and N + 1 */
q31_t ch_0_out_0 = *bias;
q31_t ch_0_out_1 = *bias++;
q31_t ch_1_out_0 = *bias;
q31_t ch_1_out_1 = *bias++;
uint16_t col_count = num_col_a / 4;
/* accumulate over the vector */
while (col_count)
{
q31_t a01, a02, a11, a12;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
col_count--;
} /* while over col_count */
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
out_mult++;
out_shift++;
ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
ch_1_out_0 += out_offset;
ch_1_out_0 = MAX(ch_1_out_0, activation_min);
ch_1_out_0 = MIN(ch_1_out_0, activation_max);
*out_0++ = (q7_t)ch_1_out_0;
ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
ch_1_out_1 += out_offset;
ch_1_out_1 = MAX(ch_1_out_1, activation_min);
ch_1_out_1 = MIN(ch_1_out_1, activation_max);
*out_1++ = (q7_t)ch_1_out_1;
out_mult++;
out_shift++;
/* skip row */
ip_a0 += num_col_a;
row_count--;
}
if (output_ch & 1)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* Init accumulator with bias for channel N + 1 */
q31_t ch_0_out_0 = *bias;
q31_t ch_0_out_1 = ch_0_out_0;
int32_t col_count = num_col_a / 4;
while (col_count)
{
q31_t a01, a02;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
col_count--;
} /* while over col_count */
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
}
out_0 += output_ch;
/* return the new output pointer with offset */
return out_0;
#else
(void)input_a;
(void)input_b;
(void)output_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)num_col_a;
(void)output_bias;
(void)out_0;
/* To be completed */
return NULL;
#endif
}

View File

@@ -0,0 +1,180 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_s8.c
* Description: General Matrix-multiplication function
*
* $Date: 27. October 2021
* $Revision: V.2.0.6
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/*
* s8 general matrix multiplication function with per-channel requantization for up to 4 column batches.
*
* Refer to the header file for details.
*
*/
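/*
* Shapes, as used below: input_row holds output_ch rows of length row_len,
* input_col holds col_batches columns of length row_len, and each column
* batch produces output_ch consecutive s8 outputs, so `out` advances by
* col_batches * output_ch values in total.
*/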
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
const q7_t *input_col,
const uint16_t output_ch,
const uint16_t col_batches,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t out_offset,
const int32_t col_offset,
const int32_t row_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t row_len,
const int32_t *const bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
(void)row_offset;
if (col_batches == 4)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t row_len_tmp = row_len;
const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
const int8_t *ip_c0 = input_col;
const int8_t *ip_c1 = input_col + row_len;
const int8_t *ip_c2 = input_col + (2 * row_len);
const int8_t *ip_c3 = input_col + (3 * row_len);
int32_t acc_0 = 0;
int32_t acc_1 = 0;
int32_t acc_2 = 0;
int32_t acc_3 = 0;
const int32_t row_loop_cnt = (row_len + 7) / 8;
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
row_len_tmp -= 8;
int16x8_t c0 = vldrbq_s16(ip_c0);
ip_c0 += 8;
c0 = vaddq_s16(c0, offset);
int16x8_t c1 = vldrbq_s16(ip_c1);
ip_c1 += 8;
c1 = vaddq_s16(c1, offset);
int16x8_t c2 = vldrbq_s16(ip_c2);
ip_c2 += 8;
c2 = vaddq_s16(c2, offset);
int16x8_t c3 = vldrbq_s16(ip_c3);
ip_c3 += 8;
c3 = vaddq_s16(c3, offset);
int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
ip_r0 += 8;
acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p);
acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p);
acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p);
}
int32x4_t res = {acc_0, acc_1, acc_2, acc_3};
if (bias)
{
res = vaddq_n_s32(res, bias[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(activation_min));
res = vminq_s32(res, vdupq_n_s32(activation_max));
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
}
out += 4 * output_ch;
}
else
{
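/* For col_batches < 4, the case this branch is intended for,
* (col_batches & ~0x3) == 0 and (col_batches & 0x3) == col_batches,
* so the loop below simply visits every remaining column batch. */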
for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t row_len_tmp = row_len;
const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
const int8_t *ip_c0 = input_col + (i_col_batch * row_len);
int32_t acc_0 = 0;
const int32_t row_loop_cnt = (row_len + 7) / 8;
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
row_len_tmp -= 8;
int16x8_t c0 = vldrbq_s16(ip_c0);
ip_c0 += 8;
c0 = vaddq_s16(c0, offset);
int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
ip_r0 += 8;
acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
}
if (bias)
{
acc_0 += bias[i_out_ch];
}
acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]);
acc_0 += out_offset;
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
out[i_out_ch] = (q7_t)acc_0;
}
out += output_ch;
}
}
return out;
#else
(void)input_row;
(void)input_col;
(void)output_ch;
(void)col_batches;
(void)output_shift;
(void)output_mult;
(void)out_offset;
(void)col_offset;
(void)row_offset;
(void)activation_min;
(void)activation_max;
(void)row_len;
(void)bias;
(void)out;
return NULL;
#endif
}

View File

@@ -0,0 +1,21 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8.c")
target_sources(cmsis-nn PRIVATE ${SRC} arm_fully_connected_s16.c)

View File

@@ -0,0 +1,197 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_mat_q7_vec_q15.c
* Description: Mixed Q15-Q7 fully-connected layer function
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Mixed Q15-Q7 fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
* Q7_Q15 version of the fully connected layer
*
* Weights are in q7_t and activations are in q15_t
*
*/
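/*
* A minimal usage sketch (illustrative only; sizes and shift values are
* assumptions, not values mandated by the library):
*
*   static const q15_t in_data[8];        // activations, q15
*   static const q7_t weights[4 * 8];     // 4 rows of 8 q7 weights
*   static const q7_t bias_data[4];
*   static q15_t out_data[4];
*   arm_fully_connected_mat_q7_vec_q15(in_data, weights, 8, 4, 0, 7,
*                                      bias_data, out_data, NULL);
*
* vec_buffer is unused by this variant (size 0 above), so NULL is acceptable.
*/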
arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q15_t *pOut,
q15_t *vec_buffer)
{
(void)vec_buffer;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
const q7_t *pB2;
q15_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA = pV;
uint16_t rowCnt = num_of_rows >> 1;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
pB2 = pB + dim_vec;
while (colCnt)
{
q31_t inV, inM11, inM12, inM21, inM22;
pB = read_and_pad(pB, &inM11, &inM12);
pB2 = read_and_pad(pB2, &inM21, &inM22);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM11, sum);
sum2 = __SMLAD(inV, inM21, sum2);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM12, sum);
sum2 = __SMLAD(inV, inM22, sum2);
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
q7_t inM2 = *pB2++;
sum += inV * inM;
sum2 += inV * inM2;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16));
/* adjust the pointers and counters */
pB += dim_vec;
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x1;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
int i, j;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16);
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

View File

@@ -0,0 +1,417 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_mat_q7_vec_q15_opt.c
* Description: Mixed Q15-Q7 opt fully-connected layer function
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Mixed Q15-Q7 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
* Q7_Q15 version of the fully connected layer
*
* Weights are in q7_t and activations are in q15_t
*
* Limitation: x4 version requires weight reordering to work
*
* Here we use only one pointer to read 4 rows in the weight
* matrix. So if the original q7_t matrix looks like this:
*
* | a11 | a12 | a13 | a14 | a15 | a16 | a17 |
*
* | a21 | a22 | a23 | a24 | a25 | a26 | a27 |
*
* | a31 | a32 | a33 | a34 | a35 | a36 | a37 |
*
* | a41 | a42 | a43 | a44 | a45 | a46 | a47 |
*
* | a51 | a52 | a53 | a54 | a55 | a56 | a57 |
*
* | a61 | a62 | a63 | a64 | a65 | a66 | a67 |
*
* We operate on multiples of 4 rows, so the first four rows become
*
* | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 |
*
* | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 |
*
* | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 |
*
* The leftover column will be kept in order, which is:
* | a17 | a27 | a37 | a47 |
*
* For the left-over rows, we do 1x1 computation, so the data remains
* in its original order.
*
* So the stored weight matrix looks like this:
*
* | a11 | a21 | a12 | a22 | a31 | a41 |
*
* | a32 | a42 | a13 | a23 | a14 | a24 |
*
* | a33 | a43 | a34 | a44 | a15 | a25 |
*
* | a16 | a26 | a35 | a45 | a36 | a46 |
*
* | a17 | a27 | a37 | a47 | a51 | a52 |
*
* | a53 | a54 | a55 | a56 | a57 | a61 |
*
* | a62 | a63 | a64 | a65 | a66 | a67 |
*
*/
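/*
* Note (illustrative, little-endian case): with this interleave a single
* q7x4 load such as | a11 | a21 | a12 | a22 | lets SXTB16 extract row 1's
* column pair (a11, a12) from bytes 0 and 2, while SXTB16(ROR(x, 8))
* extracts row 2's pair (a21, a22) from bytes 1 and 3, so one 32-bit load
* feeds two rows.
*/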
arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q15_t *pOut,
q15_t *vec_buffer)
{
(void)vec_buffer;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
q15_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA = pV;
uint16_t rowCnt = num_of_rows >> 2;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM11, inV, sum);
sum2 = __SMLAD(inM12, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM13, inV, sum3);
sum4 = __SMLAD(inM14, inV, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = *__SIMD32(pA)++;
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM12, inV, sum);
sum2 = __SMLAD(inM11, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM14, inV, sum3);
sum4 = __SMLAD(inM13, inV, sum4);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
/*
* register needed:
* loop counter: colCnt
* accumulators: sum, sum2, sum3, sum4
* pointers: pB, pA
* weight data: inM11, inM12, inM13, inM14
* activation data: inV
*/
#ifndef ARM_MATH_BIG_ENDIAN
asm volatile("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #4\n"
"ldr.w r1, [%[pB]], #8\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r1, %[sum]\n"
"smlad %[sum2], r4, r0, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r3, %[sum3]\n"
"smlad %[sum4], r4, r2, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt)
: "r0", "r1", "r2", "r3", "r4");
#else
asm volatile("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #4\n"
"ldr.w r1, [%[pB]], #8\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r0, %[sum]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt)
: "r0", "r1", "r2", "r3", "r4");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
q7_t inM2 = *pB++;
q7_t inM3 = *pB++;
q7_t inM4 = *pB++;
sum += inV * inM;
sum2 += inV * inM2;
sum3 += inV * inM3;
sum4 += inV * inM4;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum3 >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum4 >> out_shift), 16));
/* adjust the pointers and counters */
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
uint16_t rowCnt = num_of_rows >> 2;
const q7_t *pB = pM;
const q15_t *pA;
q15_t *pO = pOut;
const q7_t *pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inA2 = *pA++;
q7_t inB1 = *pB++;
q7_t inB3 = *pB++;
q7_t inB2 = *pB++;
q7_t inB4 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
sum2 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
sum4 += inA1 * inB3 + inA2 * inB4;
colCnt--;
}
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inA = *pA++;
q7_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q15_t)__SSAT((sum >> out_shift), 16);
*pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16);
*pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16);
*pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
int j;
pA = pV;
for (j = 0; j < dim_vec; j++)
{
q15_t inA = *pA++;
q7_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16);
rowCnt--;
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

View File

@@ -0,0 +1,195 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q15.c
* Description: Q15 basic fully-connected layer function
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q15 basic fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
*/
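/*
* Per output row i the computation below amounts to:
*
*   pOut[i] = SSAT( ( ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift)
*                     + sum_j pV[j] * pM[i * dim_vec + j] ) >> out_shift, 16 )
*
* as spelled out in the reference branch at the end of the function.
*/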
arm_status arm_fully_connected_q15(const q15_t *pV,
const q15_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t *bias,
q15_t *pOut,
q15_t *vec_buffer)
{
(void)vec_buffer;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q15_t *pB = pM;
const q15_t *pB2 = pB + dim_vec;
q15_t *pO = pOut;
const q15_t *pA;
const q15_t *pBias = bias;
uint16_t rowCnt = num_of_rows >> 1;
/* loop over the output rows */
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
pB2 = pB + dim_vec;
while (colCnt)
{
q31_t inV1, inM1, inM2;
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
inM2 = arm_nn_read_q15x2_ia(&pB2);
sum2 = __SMLAD(inV1, inM2, sum2);
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
inM2 = arm_nn_read_q15x2_ia(&pB2);
sum2 = __SMLAD(inV1, inM2, sum2);
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
q15_t inM2 = *pB2++;
sum += inV * inM;
sum2 += inV * inM2;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16));
/* adjust the pointers and counters */
pB = pB + dim_vec;
rowCnt--;
}
rowCnt = num_of_rows & 0x1;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inM1;
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
int i, j;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16);
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

View File

@@ -0,0 +1,336 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q15_opt.c
* Description: Q15 opt fully-connected layer function
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q15 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
* Here we use only one pointer to read 4 rows in the weight
* matrix. So if the original matrix looks like this:
*
* | a11 | a12 | a13 |
*
* | a21 | a22 | a23 |
*
* | a31 | a32 | a33 |
*
* | a41 | a42 | a43 |
*
* | a51 | a52 | a53 |
*
* | a61 | a62 | a63 |
*
* We operate on multiples of 4 rows, so the first four rows become
*
* | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 |
*
* | a13 | a23 | a33 | a43 |
*
* Remaining rows are kept in their original order.
*
* So the stored weight matrix looks like this:
*
*
* | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 |
*
* | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 |
*
* | a62 | a63 |
*/
arm_status arm_fully_connected_q15_opt(const q15_t *pV,
const q15_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t *bias,
q15_t *pOut,
q15_t *vec_buffer)
{
(void)vec_buffer;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q15_t *pB = pM;
q15_t *pO = pOut;
const q15_t *pBias = bias;
const q15_t *pA = pV;
uint16_t rowCnt = num_of_rows >> 2;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
#ifdef USE_INTRINSIC
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV, inM11, sum);
inM12 = arm_nn_read_q15x2_ia(&pB);
sum2 = __SMLAD(inV, inM12, sum2);
inM13 = arm_nn_read_q15x2_ia(&pB);
sum3 = __SMLAD(inV, inM13, sum3);
inM14 = arm_nn_read_q15x2_ia(&pB);
sum4 = __SMLAD(inV, inM14, sum4);
colCnt--;
}
#else
/*
* register needed:
* loop counter: colCnt
* accumulators: sum, sum2, sum3, sum4
* pointers: pB, pA
* weight data: inM11, inM12, inM13, inM14
* activation data: inV
*/
asm volatile("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #4\n"
"ldr.w r0, [%[pB]], #16\n"
"smlad %[sum], r4, r0, %[sum]\n"
"ldr.w r1, [%[pB] , #-12]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r2, [%[pB] , #-8]\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"ldr.w r3, [%[pB] , #-4]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt)
: "r0", "r1", "r2", "r3", "r4");
#endif /* USE_INTRINSIC */
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
q15_t inM2 = *pB++;
q15_t inM3 = *pB++;
q15_t inM4 = *pB++;
sum += inV * inM;
sum2 += inV * inM2;
sum3 += inV * inM3;
sum4 += inV * inM4;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum3 >> out_shift), 16));
*pO++ = (q15_t)(__SSAT((sum4 >> out_shift), 16));
/* adjust the pointers and counters */
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inV2, inM1, inM2;
inM1 = arm_nn_read_q15x2_ia(&pB);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM1, sum);
inM2 = arm_nn_read_q15x2_ia(&pB);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM2, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t)(__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
uint16_t rowCnt = num_of_rows >> 2;
const q15_t *pB = pM;
const q15_t *pA;
q15_t *pO = pOut;
const q15_t *pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inA2 = *pA++;
q15_t inB1 = *pB++;
q15_t inB2 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
inB1 = *pB++;
inB2 = *pB++;
sum2 += inA1 * inB1 + inA2 * inB2;
inB1 = *pB++;
inB2 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
inB1 = *pB++;
inB2 = *pB++;
sum4 += inA1 * inB1 + inA2 * inB2;
colCnt--;
}
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inA = *pA++;
q15_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q15_t)__SSAT((sum >> out_shift), 16);
*pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16);
*pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16);
*pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
int j;
pA = pV;
for (j = 0; j < dim_vec; j++)
{
q15_t inA = *pA++;
q15_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16);
rowCnt--;
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

View File

@@ -0,0 +1,200 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q7.c
* Description: Q7 basic fully-connected layer function
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q7 basic fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: dim_vec
*
* This basic function is designed to work with regular weight
* matrix without interleaving.
*
*/
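/*
* A minimal usage sketch (illustrative only; sizes and shift values are
* assumptions). Unlike the q15-activation variants, this function needs a
* q15 scratch buffer of dim_vec entries for the expanded input:
*
*   static const q7_t in_data[16];
*   static const q7_t weights[8 * 16];    // 8 rows of 16 q7 weights
*   static const q7_t bias_data[8];
*   static q15_t scratch[16];             // vec_buffer, dim_vec entries
*   static q7_t out_data[8];
*   arm_fully_connected_q7(in_data, weights, 16, 8, 0, 7,
*                          bias_data, out_data, scratch);
*/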
arm_status arm_fully_connected_q7(const q7_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut,
q15_t *vec_buffer)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
const q7_t *pB2;
q7_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA;
uint16_t rowCnt = num_of_rows >> 1;
/* expand the vector into the buffer */
arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = vec_buffer;
pB2 = pB + dim_vec;
while (colCnt)
{
q31_t inV, inM11, inM12, inM21, inM22;
pB = read_and_pad_reordered(pB, &inM11, &inM12);
pB2 = read_and_pad_reordered(pB2, &inM21, &inM22);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM11, sum);
sum2 = __SMLAD(inV, inM21, sum2);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM12, sum);
sum2 = __SMLAD(inV, inM22, sum2);
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inV = *pA++;
q15_t inM = *pB++;
q15_t inM2 = *pB2++;
sum += inV * inM;
sum2 += inV * inM2;
colCnt--;
} /* while over colCnt */
*pO++ = (q7_t)(__SSAT((sum >> out_shift), 8));
*pO++ = (q7_t)(__SSAT((sum2 >> out_shift), 8));
/* adjust the pointers and counters */
pB += dim_vec;
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x1;
while (rowCnt)
{
uint16_t colCnt = dim_vec >> 2;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
pA = vec_buffer;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad_reordered(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inV = *pA++;
q15_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q7_t)(__SSAT((sum >> out_shift), 8));
rowCnt--;
}
#else
(void)vec_buffer;
int i, j;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q7_t)__SSAT((ip_out >> out_shift), 8);
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

View File

@@ -0,0 +1,495 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q7_opt.c
* Description: Q7 opt fully-connected layer function
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q7 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: dim_vec
*
* This opt function is designed to work with an interleaved weight
* matrix. The vector input is assumed to be in q7_t format; we call
* the arm_q7_to_q15_reordered_no_shift function (as invoked in the code
* below) to expand it into q15_t format with a re-ordering that matches
* the weight interleave; refer to that function's comments for more details.
* Here we use only one pointer to read 4 rows in the weight
* matrix. So if the original q7_t matrix looks like this:
*
* | a11 | a12 | a13 | a14 | a15 | a16 | a17 |
*
* | a21 | a22 | a23 | a24 | a25 | a26 | a27 |
*
* | a31 | a32 | a33 | a34 | a35 | a36 | a37 |
*
* | a41 | a42 | a43 | a44 | a45 | a46 | a47 |
*
* | a51 | a52 | a53 | a54 | a55 | a56 | a57 |
*
* | a61 | a62 | a63 | a64 | a65 | a66 | a67 |
*
*
* We operate on multiples of 4 rows, so the first four rows become
*
* | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 |
*
* | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 |
*
* | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 |
*
* So within the kernel, we first read the re-ordered vector in as:
*
* | b1 | b3 | and | b2 | b4 |
*
* the four q31_t weights will look like
*
* | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 |
*
* | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 |
*
* The leftover column will be kept in order, which is:
*
* | a17 | a27 | a37 | a47 |
*
* For the left-over rows, we do 1x1 computation, so the data remains
* in its original order.
*
* So the stored weight matrix looks like this:
*
* | a11 | a21 | a13 | a23 | a31 | a41 |
*
* | a33 | a43 | a12 | a22 | a14 | a24 |
*
* | a32 | a42 | a34 | a44 | a15 | a25 |
*
* | a35 | a45 | a16 | a26 | a36 | a46 |
*
* | a17 | a27 | a37 | a47 | a51 | a52 |
*
* | a53 | a54 | a55 | a56 | a57 | a61 |
*
* | a62 | a63 | a64 | a65 | a66 | a67 |
*
*
*/
arm_status arm_fully_connected_q7_opt(const q7_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut,
q15_t *vec_buffer)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
q7_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA;
uint16_t rowCnt = num_of_rows >> 2;
arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = vec_buffer;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM11, inV, sum);
sum2 = __SMLAD(inM12, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM13, inV, sum3);
sum4 = __SMLAD(inM14, inV, sum4);
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM11, inV, sum);
sum2 = __SMLAD(inM12, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM13, inV, sum3);
sum4 = __SMLAD(inM14, inV, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM12, inV, sum);
sum2 = __SMLAD(inM11, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM14, inV, sum3);
sum4 = __SMLAD(inM13, inV, sum4);
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM12, inV, sum);
sum2 = __SMLAD(inM11, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM14, inV, sum3);
sum4 = __SMLAD(inM13, inV, sum4);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
/*
* registers needed:
* loop counter: colCnt
* accumulators: sum, sum2, sum3, sum4
* pointers: pB, pA
* weight data: inM11, inM12, inM13, inM14
* activation data: inV
*/
#ifndef ARM_MATH_BIG_ENDIAN
asm volatile("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #8\n"
"ldr.w r1, [%[pB]], #16\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r1, %[sum]\n"
"smlad %[sum2], r4, r0, %[sum2]\n"
"ldr.w r3, [%[pB], #-12]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r3, %[sum3]\n"
"smlad %[sum4], r4, r2, %[sum4]\n"
"ldr.w r4, [%[pA], #-4]\n"
"ldr.w r1, [%[pB], #-8]\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r1, %[sum]\n"
"smlad %[sum2], r4, r0, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r3, %[sum3]\n"
"smlad %[sum4], r4, r2, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt)
: "r0", "r1", "r2", "r3", "r4");
#else
asm volatile("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #8\n"
"ldr.w r1, [%[pB]], #16\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r0, %[sum]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r3, [%[pB], #-12]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"ldr.w r4, [%[pA], #-4]\n"
"ldr.w r1, [%[pB], #-8]\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r0, %[sum]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt)
: "r0", "r1", "r2", "r3", "r4");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
q7_t inM2 = *pB++;
q7_t inM3 = *pB++;
q7_t inM4 = *pB++;
sum += inV * inM;
sum2 += inV * inM2;
sum3 += inV * inM3;
sum4 += inV * inM4;
colCnt--;
} /* while over colCnt */
*pO++ = (q7_t)(__SSAT((sum >> out_shift), 8));
*pO++ = (q7_t)(__SSAT((sum2 >> out_shift), 8));
*pO++ = (q7_t)(__SSAT((sum3 >> out_shift), 8));
*pO++ = (q7_t)(__SSAT((sum4 >> out_shift), 8));
/* adjust the pointers and counters */
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = vec_buffer;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad_reordered(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q7_t)(__SSAT((sum >> out_shift), 8));
rowCnt--;
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)vec_buffer;
uint16_t rowCnt = num_of_rows >> 2;
const q7_t *pB = pM;
const q7_t *pA;
q7_t *pO = pOut;
const q7_t *pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q7_t inA1 = *pA++;
q7_t inA3 = *pA++;
q7_t inA2 = *pA++;
q7_t inA4 = *pA++;
q7_t inB1 = *pB++;
q7_t inB3 = *pB++;
q7_t inB2 = *pB++;
q7_t inB4 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
sum2 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
sum4 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum += inA3 * inB1 + inA4 * inB2;
sum2 += inA3 * inB3 + inA4 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA3 * inB1 + inA4 * inB2;
sum4 += inA3 * inB3 + inA4 * inB4;
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pO++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pO++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pO++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
int j;
pA = pV;
for (j = 0; j < dim_vec; j++)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q7_t)__SSAT((ip_out >> out_shift), 8);
rowCnt--;
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/
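/*
 * A minimal host-side sketch (added for illustration; not part of this
 * commit) of the weight interleaving described above. It assumes a
 * row-major q7_t weight matrix W[num_of_rows][dim_vec]; the name
 * shuffle_q7_weights is hypothetical.
 */
#include <stdint.h>

static void shuffle_q7_weights(const int8_t *W, int8_t *dst, int num_of_rows, int dim_vec)
{
    int n = 0;
    int r = 0;
    for (; r + 4 <= num_of_rows; r += 4) /* blocks of 4 rows */
    {
        int c = 0;
        for (; c + 4 <= dim_vec; c += 4) /* blocks of 4 columns */
        {
            for (int half = 0; half < 2; half++) /* columns (c, c+2), then (c+1, c+3) */
            {
                for (int rr = r; rr < r + 4; rr += 2) /* row pairs */
                {
                    dst[n++] = W[rr * dim_vec + c + half];
                    dst[n++] = W[(rr + 1) * dim_vec + c + half];
                    dst[n++] = W[rr * dim_vec + c + half + 2];
                    dst[n++] = W[(rr + 1) * dim_vec + c + half + 2];
                }
            }
        }
        for (; c < dim_vec; c++) /* left-over columns: the 4 row values in order */
        {
            for (int rr = r; rr < r + 4; rr++)
            {
                dst[n++] = W[rr * dim_vec + c];
            }
        }
    }
    for (; r < num_of_rows; r++) /* left-over rows stay in row-major order */
    {
        for (int c = 0; c < dim_vec; c++)
        {
            dst[n++] = W[r * dim_vec + c];
        }
    }
}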

View File

@@ -0,0 +1,97 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_s16
* Description: Fully connected function compatible with TF Lite.
*
* $Date: 3. August 2021
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M and Cortex-A cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/*
* S16 basic fully-connected and matrix multiplication layer function for TensorFlow Lite
*
* Refer to the header file for details.
*
*/
arm_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int64_t *bias,
const cmsis_nn_dims *output_dims,
q15_t *output)
{
(void)bias_dims;
(void)ctx;
(void)fc_params->filter_offset;
int32_t batch_cnt = input_dims->n;
const q31_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier);
while (batch_cnt)
{
arm_nn_vec_mat_mult_t_s16(input,
kernel,
bias,
output,
reduced_multiplier,
quant_params->shift,
filter_dims->n, /* col_dim or accum_depth */
output_dims->c, /* row_dim or output_depth */
fc_params->activation.min,
fc_params->activation.max);
input += filter_dims->n;
output += output_dims->c;
batch_cnt--;
}
return (ARM_MATH_SUCCESS);
}
int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims)
{
(void)filter_dims;
return 0;
}
/**
* @} end of FC group
*/
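/*
 * A minimal usage sketch (added for illustration). Dimension and
 * quantization values are placeholder assumptions, not values from this
 * commit; fc_s16_example is a hypothetical name.
 */
#include "arm_nnfunctions.h"

void fc_s16_example(void)
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 1, .w = 1, .c = 10};
    const cmsis_nn_dims filter_dims = {.n = 10, .h = 1, .w = 1, .c = 4}; /* n = accum_depth */
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 4};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 1, .w = 1, .c = 4}; /* c = output_depth */
    static const q15_t input[10] = {0};
    static const q7_t kernel[40] = {0};
    static const int64_t bias[4] = {0};
    q15_t output[4];
    cmsis_nn_context ctx = {.buf = NULL, .size = 0}; /* no scratch buffer needed, see get_buffer_size() */
    cmsis_nn_fc_params fc_params = {.input_offset = 0, /* s16 path is symmetric: offsets stay 0 */
                                    .filter_offset = 0,
                                    .output_offset = 0,
                                    .activation = {.min = -32768, .max = 32767}};
    cmsis_nn_per_tensor_quant_params quant_params = {.multiplier = 1073741824, .shift = 0};
    (void)arm_fully_connected_s16(&ctx, &fc_params, &quant_params,
                                  &input_dims, input, &filter_dims, kernel,
                                  &bias_dims, bias, &output_dims, output);
    (void)output;
}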

View File

@@ -0,0 +1,99 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_s8
* Description: Fully connected function compatible with TF Lite.
*
* $Date: 8 April 2022
* $Revision: V.3.1.0
*
* Target Processor: Cortex-M and Cortex-A cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/*
* S8 basic fully-connected and matrix multiplication layer function for TensorFlow Lite
*
* Refer to the header file for details.
*
*/
arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
(void)bias_dims;
(void)ctx;
(void)fc_params->filter_offset;
int32_t batch_cnt = input_dims->n;
while (batch_cnt)
{
arm_nn_vec_mat_mult_t_s8(input,
kernel,
bias,
output,
fc_params->input_offset,
0,
fc_params->output_offset,
quant_params->multiplier,
quant_params->shift,
filter_dims->n, /* col_dim or accum_depth */
output_dims->c, /* row_dim or output_depth */
fc_params->activation.min,
fc_params->activation.max,
1L);
input += filter_dims->n;
output += output_dims->c;
batch_cnt--;
}
return (ARM_MATH_SUCCESS);
}
int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims)
{
(void)filter_dims;
return 0;
}
/**
* @} end of FC group
*/
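/*
 * Usage mirrors arm_fully_connected_s16 above, except that the data is
 * q7_t, the bias is int32_t, and asymmetric input/output offsets are
 * allowed. A sketch with placeholder values (assumptions, not from this
 * commit); fc_s8_example is a hypothetical name.
 */
#include "arm_nnfunctions.h"

void fc_s8_example(const q7_t *input, const q7_t *kernel, const int32_t *bias, q7_t *output)
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 1, .w = 1, .c = 16};
    const cmsis_nn_dims filter_dims = {.n = 16, .h = 1, .w = 1, .c = 8}; /* n = accum_depth */
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 8};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 1, .w = 1, .c = 8};  /* c = output_depth */
    cmsis_nn_context ctx = {.buf = NULL, .size = 0};
    cmsis_nn_fc_params fc_params = {.input_offset = 128, /* common for int8 TFLite models */
                                    .filter_offset = 0,
                                    .output_offset = -128,
                                    .activation = {.min = -128, .max = 127}};
    cmsis_nn_per_tensor_quant_params quant_params = {.multiplier = 1073741824, .shift = 0};
    (void)arm_fully_connected_s8(&ctx, &fc_params, &quant_params,
                                 &input_dims, input, &filter_dims, kernel,
                                 &bias_dims, bias, &output_dims, output);
}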

View File

@@ -0,0 +1,26 @@
#
# Copyright (c) 2019-2022 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8.c")
target_sources(cmsis-nn PRIVATE ${SRC}
               arm_q7_to_q15_with_offset.c
               arm_nn_mat_mul_kernel_s16.c
               arm_nn_vec_mat_mult_t_s16.c
               arm_q7_to_q15_no_shift.c)

View File

@@ -0,0 +1,85 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_accumulate_q7_to_q15.c
* Description: Accumulate a q7 vector into a q15 vector.
*
* $Date: 20 July 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
{
q15_t *pCnt = pDst;
const q7_t *pV = pSrc;
int32_t count = length;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
q31_t v1, v2, vo1, vo2;
count = length >> 2;
q31_t in;
while (count > 0L)
{
q31_t value = arm_nn_read_q7x4_ia(&pV);
v1 = __SXTB16(__ROR((uint32_t)value, 8));
v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN
vo2 = (q31_t)__PKHTB(v1, v2, 16);
vo1 = (q31_t)__PKHBT(v2, v1, 16);
#else
vo1 = (q31_t)__PKHTB(v1, v2, 16);
vo2 = (q31_t)__PKHBT(v2, v1, 16);
#endif
in = arm_nn_read_q15x2(pCnt);
arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));
in = arm_nn_read_q15x2(pCnt);
arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
count--;
}
count = length & 0x3;
#endif
while (count > 0L)
{
*pCnt++ += *pV++;
count--;
}
}
/**
* @} end of NNBasicMath group
*/
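/*
 * For reference (added commentary): ignoring saturation, the routine is
 * equivalent to the plain loop below; the DSP path above widens and adds
 * four q7 elements per iteration, saturating each q15 lane via __QADD16.
 */
#include "arm_nnsupportfunctions.h"

static void accumulate_q7_to_q15_ref(q15_t *dst, const q7_t *src, uint32_t length)
{
    for (uint32_t i = 0; i < length; i++)
    {
        dst[i] += src[i]; /* q7 value widened to q15 and accumulated */
    }
}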

View File

@@ -0,0 +1,82 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_add_q7.c
* Description: Non saturating addition of elements of a q7 vector.
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size)
{
uint32_t block_count;
q31_t result = 0;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Loop unrolling: Compute 4 outputs at a time */
block_count = block_size >> 2U;
while (block_count > 0U)
{
const int32_t mult_q15x2 = (1UL << 16) | 1UL;
q31_t in_q7x4 = arm_nn_read_q7x4_ia(&input);
q31_t temp_q15x2 = __SXTAB16(__SXTB16(in_q7x4), __ROR((uint32_t)in_q7x4, 8));
result = __SMLAD(temp_q15x2, mult_q15x2, result);
/* Decrement loop counter */
block_count--;
}
/* Loop unrolling: Compute remaining outputs */
block_count = block_size & 0x3;
#else
block_count = block_size;
#endif
while (block_count > 0U)
{
/* Add and store result in destination buffer. */
result += *input++;
/* Decrement loop counter */
block_count--;
}
*output = result;
}
/**
* @} end of NNBasicMath group
*/
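/*
 * Usage sketch (added for illustration; values are arbitrary): reduce a
 * q7 block to a single q31 sum.
 */
#include "arm_nnsupportfunctions.h"

void add_q7_example(void)
{
    static const q7_t vec[5] = {1, 2, 3, 4, 5};
    q31_t total;
    arm_nn_add_q7(vec, &total, 5); /* total == 15 */
    (void)total;
}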

View File

@@ -0,0 +1,168 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_nt_t_padded_s8.c
* Description: Depthwise convolution with padded matrices.
*
* $Date: 09. October 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M processors with MVE extension
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* Depthwise convolution of a transposed rhs matrix with 4 lhs matrices. One or more of the rhs matrices are padded.
* Dimensions are the same for lhs and rhs.
*
* Refer to the header file for details.
*
*/
q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t input_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
int32_t loop_count = (num_ch + 3) / 4;
const int32_t *bias = output_bias;
uint32_t num_ch_to_process = num_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
num_ch_to_process -= 4, out += 4, offset += 4, i_loop_cnt++)
{
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
int32x4_t out_2 = out_0;
int32x4_t out_3 = out_0;
bias += 4;
const int8_t *rhs_0 = rhs + offset;
const int8_t *lhs_0 = lhs + offset;
const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset;
const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset;
const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset;
for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
{
const int32x4_t ker_0 = vldrbq_s32(rhs_0);
int32x4_t ip_0 = vldrbq_s32(lhs_0);
ip_0 = vaddq_n_s32(ip_0, input_offset);
out_0 += vmulq_s32(ip_0, ker_0);
int32x4_t ip_1 = vldrbq_s32(lhs_1);
ip_1 = vaddq_n_s32(ip_1, input_offset);
out_1 += vmulq_s32(ip_1, ker_0);
int32x4_t ip_2 = vldrbq_s32(lhs_2);
ip_2 = vaddq_n_s32(ip_2, input_offset);
out_2 += vmulq_s32(ip_2, ker_0);
int32x4_t ip_3 = vldrbq_s32(lhs_3);
ip_3 = vaddq_n_s32(ip_3, input_offset);
out_3 += vmulq_s32(ip_3, ker_0);
lhs_0 += num_ch;
lhs_1 += num_ch;
lhs_2 += num_ch;
lhs_3 += num_ch;
rhs_0 += num_ch;
}
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
mve_pred16_t p = vctp32q(num_ch_to_process);
vstrbq_p_s32(out, out_0, p);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + num_ch, out_1, p);
out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
out_2 = vaddq_n_s32(out_2, out_offset);
out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 2 * num_ch, out_2, p);
out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
out_3 = vaddq_n_s32(out_3, out_offset);
out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 3 * num_ch, out_3, p);
}
const int tail_ch = num_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
return out + (3 * num_ch);
#else
(void)lhs;
(void)rhs;
(void)input_offset;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)row_x_col;
(void)output_bias;
(void)out;
return NULL;
#endif
}
/**
* @} end of NNBasicMath group
*/
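/*
 * Scalar reference sketch (added commentary) of what the MVE kernel above
 * computes for the 4 lhs matrices, omitting the tail-channel predication
 * and the advanced return pointer; dw_conv_nt_t_padded_ref is a
 * hypothetical name.
 */
#include "arm_nnsupportfunctions.h"

static void dw_conv_nt_t_padded_ref(const q7_t *lhs, const q7_t *rhs,
                                    int32_t input_offset, uint16_t num_ch,
                                    const int32_t *out_shift, const int32_t *out_mult,
                                    int32_t out_offset, int32_t act_min, int32_t act_max,
                                    uint16_t row_x_col, const int32_t *bias, q7_t *out)
{
    for (int n = 0; n < 4; n++) /* the 4 lhs matrices */
    {
        for (int c = 0; c < num_ch; c++) /* one accumulator per channel */
        {
            int32_t acc = bias[c];
            for (int i = 0; i < row_x_col; i++)
            {
                acc += (lhs[(n * row_x_col + i) * num_ch + c] + input_offset) * rhs[i * num_ch + c];
            }
            acc = arm_nn_requantize(acc, out_mult[c], out_shift[c]);
            acc += out_offset;
            acc = MAX(acc, act_min);
            acc = MIN(acc, act_max);
            out[n * num_ch + c] = (q7_t)acc;
        }
    }
}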

View File

@@ -0,0 +1,170 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_nt_t_s8.c
* Description: Depthwise convolution on matrices with no padding.
*
* $Date: 09. October 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M processors with MVE extension.
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* Depthwise convolution of a rhs matrix with 4 lhs matrices and no padding. Dimensions are the same for lhs and rhs.
*
* Refer to the header file for details.
*
*/
q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t input_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
const int32_t *bias = output_bias;
int32_t loop_count = (num_ch + 3) / 4;
uint32_t num_ch_to_process = num_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
{
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
int32x4_t out_2 = out_0;
int32x4_t out_3 = out_0;
bias += 4;
const int8_t *rhs_0 = rhs + offset;
const int8_t *lhs_0 = lhs + offset;
const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset;
const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset;
const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset;
int32x4_t ker_sum = vdupq_n_s32(0);
for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
{
const int32x4_t ker_0 = vldrbq_s32(rhs_0);
ker_sum = vaddq_s32(ker_sum, ker_0);
int32x4_t ip_0 = vldrbq_s32(lhs_0);
out_0 += vmulq_s32(ip_0, ker_0);
int32x4_t ip_1 = vldrbq_s32(lhs_1);
out_1 += vmulq_s32(ip_1, ker_0);
int32x4_t ip_2 = vldrbq_s32(lhs_2);
out_2 += vmulq_s32(ip_2, ker_0);
int32x4_t ip_3 = vldrbq_s32(lhs_3);
out_3 += vmulq_s32(ip_3, ker_0);
lhs_0 += num_ch;
lhs_1 += num_ch;
lhs_2 += num_ch;
lhs_3 += num_ch;
rhs_0 += num_ch;
}
ker_sum = vmulq_n_s32(ker_sum, input_offset);
out_0 = ker_sum + out_0;
out_1 = ker_sum + out_1;
out_2 = ker_sum + out_2;
out_3 = ker_sum + out_3;
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
mve_pred16_t p = vctp32q(num_ch_to_process);
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
vstrbq_p_s32(out, out_0, p);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + num_ch, out_1, p);
out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
out_2 = vaddq_n_s32(out_2, out_offset);
out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 2 * num_ch, out_2, p);
out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
out_3 = vaddq_n_s32(out_3, out_offset);
out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 3 * num_ch, out_3, p);
}
const int tail_ch = num_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
return out + (3 * num_ch);
#else
(void)lhs;
(void)rhs;
(void)input_offset;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)row_x_col;
(void)output_bias;
(void)out;
return NULL;
#endif
}
/**
* @} end of NNBasicMath group
*/
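/*
 * Design note (added commentary): unlike the padded variant, this kernel
 * factors the input offset out of the inner loop using
 *
 *   sum_i (x_i + offset) * k_i == sum_i x_i * k_i + offset * sum_i k_i,
 *
 * accumulating raw products together with a running ker_sum, then adding
 * offset * ker_sum once per block of four channels.
 */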

View File

@@ -0,0 +1,86 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mul_core_1x_s8.c
* Description: General Matrix-multiplication function
*
* $Date: 19. April 2022
* $Revision: V.1.0.3
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 matrix multiplication to process 1 row
*
* Refer to the header file for details.
*
*/
arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
const int8_t *row_base,
const int8_t *col_base,
int32_t *const sum_col,
int32_t *const output)
{
int32_t acc_n0 = 0;
int32_t sum_tmp = 0;
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
__ASM volatile(" vldrb.8 q0, [%[col]], #16 \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vaddva.s8 %[sum], q0 \n"
" vldrb.8 q1, [%[row0]], #16 \n"
" vmladava.s8 %[out0], q0, q1 \n"
" vldrb.8 q0, [%[col]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0)
: [cnt] "r"(row_elements)
: "q0", "q1", "memory", "r14");
#else
for (int i = 0; i < row_elements; i++)
{
sum_tmp += col_base[i];
acc_n0 += row_base[i] * col_base[i];
}
#endif
*sum_col = sum_tmp;
*output = acc_n0;
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
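/*
 * Usage sketch (added for illustration; values are arbitrary): one dot
 * product plus the column sum that callers use for offset correction.
 */
#include "arm_nnsupportfunctions.h"

void mat_mul_core_1x_example(void)
{
    static const int8_t row[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    static const int8_t col[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int32_t col_sum, result;
    (void)arm_nn_mat_mul_core_1x_s8(8, row, col, &col_sum, &result);
    /* col_sum == 36 and result == 36 (dot product of row and col) */
}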

View File

@@ -0,0 +1,137 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mul_core_4x_s8.c
* Description: General matrix multiplication function for MVE extension
*
* $Date: 19. April 2022
* $Revision: V.3.0.1
*
* Target Processor: Cortex-M processors
* -------------------------------------------------------------------- */
#include "arm_nn_types.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 matrix multiplication to process 4 rows and one column
*
* Refer to the header file for details.
*
*/
int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t offset,
const int8_t *row_base,
const int8_t *col_base_ref,
const int32_t out_ch,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const int32_t *bias,
int8_t *output)
{
#if defined(ARM_MATH_MVEI)
for (int i = 0; i < out_ch; i++)
{
int32_t acc_n0 = 0;
int32_t acc_n1 = 0;
int32_t acc_n2 = 0;
int32_t acc_n3 = 0;
const int8_t *ip_row_0 = row_base;
const int8_t *ip_row_1 = row_base + offset;
const int8_t *ip_row_2 = row_base + (2 * offset);
const int8_t *ip_row_3 = row_base + (3 * offset);
const int8_t *col_base = col_base_ref + i * row_elements;
int32_t sum_tmp = 0;
__ASM volatile(" vldrb.8 q0, [%[col]], #16 \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vaddva.s8 %[sum], q0 \n"
" vldrb.8 q1, [%[row0]], #16 \n"
" vmladava.s8 %[out0], q0, q1 \n"
" vldrb.8 q2, [%[row1]], #16 \n"
" vmladava.s8 %[out1], q0, q2 \n"
" vldrb.8 q3, [%[row2]], #16 \n"
" vmladava.s8 %[out2], q0, q3 \n"
" vldrb.8 q4, [%[row3]], #16 \n"
" vmladava.s8 %[out3], q0, q4 \n"
" vldrb.8 q0, [%[col]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [col] "+r"(col_base),
[sum] "+Te"(sum_tmp),
[row0] "+r"(ip_row_0),
[row1] "+r"(ip_row_1),
[row2] "+r"(ip_row_2),
[row3] "+r"(ip_row_3),
[out0] "+Te"(acc_n0),
[out1] "+Te"(acc_n1),
[out2] "+Te"(acc_n2),
[out3] "+Te"(acc_n3)
: [cnt] "r"(row_elements)
: "q0", "q1", "q2", "q3", "q4", "memory", "r14");
int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3};
sum_tmp *= conv_params->input_offset;
if (bias)
{
sum_tmp += bias[i];
}
res = vaddq_n_s32(res, sum_tmp);
res = arm_requantize_mve(res, quant_params->multiplier[i], quant_params->shift[i]);
res = vaddq_n_s32(res, conv_params->output_offset);
res = vmaxq_s32(res, vdupq_n_s32(conv_params->activation.min));
res = vminq_s32(res, vdupq_n_s32(conv_params->activation.max));
const uint32x4_t scatter_offset = {0, (uint32_t)out_ch, (uint32_t)out_ch * 2, (uint32_t)out_ch * 3};
vstrbq_scatter_offset_s32(output, scatter_offset, res);
output++;
}
return output + (3 * out_ch);
#else
(void)row_elements;
(void)offset;
(void)row_base;
(void)col_base_ref;
(void)out_ch;
(void)conv_params;
(void)quant_params;
(void)bias;
(void)output;
return NULL;
#endif
}
/**
* @} end of NNBasicMath group
*/
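/*
 * Layout note (added commentary): for each output channel i the four row
 * results are scattered to output[i], output[i + out_ch],
 * output[i + 2 * out_ch] and output[i + 3 * out_ch], i.e. the destination
 * is a row-major [4 x out_ch] tile, and the returned pointer is advanced
 * past that tile.
 */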

View File

@@ -0,0 +1,250 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s16.c
* Description: Matrix-multiplication function for convolution
*
* $Date: 12 August 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/*
* Matrix-multiplication function for convolution with per-channel requantization.
*
* Refer to the header file for details.
*
*/
q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
const q15_t *input_b,
const int32_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int16_t activation_min,
const int16_t activation_max,
const int32_t num_col_a,
const int64_t *const output_bias,
q15_t *out_0)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* set up the second output pointers */
q15_t *out_1 = out_0 + output_ch;
const int64_t *bias = output_bias;
uint16_t row_count = output_ch / 2;
const q7_t *ip_a0 = input_a;
/* loop over the rows of A */
while (row_count)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* align the second pointer for A */
const q7_t *ip_a1 = ip_a0 + num_col_a;
/* Init accumulator for channel N and N + 1 */
q31_t ch_0_out_0 = 0;
q31_t ch_0_out_1 = 0;
q31_t ch_1_out_0 = 0;
q31_t ch_1_out_1 = 0;
uint16_t col_count = num_col_a / 4;
/* accumulate over the vector */
while (col_count)
{
q31_t a01, a02, a11, a12;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad(ip_a0, &a01, &a02);
ip_a1 = read_and_pad(ip_a1, &a11, &a12);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
col_count--;
} /* while over col_count */
col_count = num_col_a & 0x3;
while (col_count)
{
q7_t a0 = *ip_a0++;
q15_t b0 = *ip_b0++;
q7_t a1 = *ip_a1++;
q15_t b1 = *ip_b1++;
ch_0_out_0 += a0 * b0;
ch_0_out_1 += a0 * b1;
ch_1_out_0 += a1 * b0;
ch_1_out_1 += a1 * b1;
col_count--;
} /* while over col_count */
if (bias)
{
q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
q63_t acc_64 = ch_0_out_0 + *bias;
ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
acc_64 = ch_0_out_1 + *bias++;
ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
out_mult++;
}
else
{
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
out_mult++;
}
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q15_t)ch_0_out_0;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q15_t)ch_0_out_1;
out_shift++;
if (bias)
{
q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
q63_t acc_64 = ch_1_out_0 + *bias;
ch_1_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
acc_64 = ch_1_out_1 + *bias++;
ch_1_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
out_mult++;
}
else
{
ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
out_mult++;
}
ch_1_out_0 = MAX(ch_1_out_0, activation_min);
ch_1_out_0 = MIN(ch_1_out_0, activation_max);
*out_0++ = (q15_t)ch_1_out_0;
ch_1_out_1 = MAX(ch_1_out_1, activation_min);
ch_1_out_1 = MIN(ch_1_out_1, activation_max);
*out_1++ = (q15_t)ch_1_out_1;
out_shift++;
/* skip the second row, already consumed through ip_a1 */
ip_a0 += num_col_a;
row_count--;
}
/* compute the last odd numbered row if any */
if (output_ch & 0x1)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
q31_t ch_0_out_0 = 0;
q31_t ch_0_out_1 = 0;
uint16_t col_count = num_col_a >> 2;
while (col_count)
{
q31_t a01, a02;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad(ip_a0, &a01, &a02);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
col_count--;
}
col_count = num_col_a & 0x3;
while (col_count)
{
q7_t a0 = *ip_a0++;
q15_t b0 = *ip_b0++;
q15_t b1 = *ip_b1++;
ch_0_out_0 += a0 * b0;
ch_0_out_1 += a0 * b1;
col_count--;
}
if (bias)
{
q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
q63_t acc_64 = ch_0_out_0 + *bias;
ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
acc_64 = ch_0_out_1 + *bias++;
ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
}
else
{
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
}
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q15_t)ch_0_out_0;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q15_t)ch_0_out_1;
out_mult++;
out_shift++;
}
out_0 += output_ch;
/* return the new output pointer with offset */
return out_0;
#else
(void)input_a;
(void)input_b;
(void)output_ch;
(void)out_shift;
(void)out_mult;
(void)activation_min;
(void)activation_max;
(void)num_col_a;
(void)output_bias;
(void)out_0;
/* To be completed */
return NULL;
#endif
}
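/*
 * Design note (added commentary): the kernel above computes a 2x2 output
 * tile per pass -- two output channels (rows of input_a) against two
 * output positions (columns input_b and input_b + num_col_a) -- so every
 * loaded operand is reused twice. With a 64-bit bias the accumulator is
 * widened and requantized through arm_nn_requantize_s64 using a reduced
 * multiplier; without a bias the 32-bit arm_nn_requantize path is taken.
 */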

View File

@@ -0,0 +1,582 @@
/*
* Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_s8_nt_t_s8
* Description: Matrix multiplication support function with the right-hand-side (rhs) matrix transposed
*
* $Date: 09. October 2020
* $Revision: V.1.0.3
*
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 matrix multiplication with the right-hand-side matrix transposed
*
* Refer to the header file for details.
*
*/
arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max)
{
#if defined(ARM_MATH_DSP)
const int32_t off0 = rhs_cols - 4;
for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
q31_t lhs_offset_contribution0 = 0;
q31_t lhs_offset_contribution1 = 0;
for (int32_t x = 0; x < rhs_cols; ++x)
{
lhs_offset_contribution0 += rhs[x];
lhs_offset_contribution1 += rhs[x + rhs_cols];
}
lhs_offset_contribution0 *= lhs_offset;
lhs_offset_contribution1 *= lhs_offset;
if (bias)
{
lhs_offset_contribution0 += bias[rhs_rows_idx];
lhs_offset_contribution1 += bias[rhs_rows_idx + 1];
}
int32_t lhs_rows_idx = lhs_rows >> 1;
while (lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
q31_t res10 = lhs_offset_contribution0;
q31_t res11 = lhs_offset_contribution1;
int32_t rhs_cols_idx = 0;
q31_t val0, val1, val2, val3, val4, val5;
for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16)
{
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res11 = __SMLAD(val0, val4, res11);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res11 = __SMLAD(val0, val4, res11);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res11 = __SMLAD(val0, val4, res11);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
res11 = __SMLAD(val0, val4, res11);
}
for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
lhs_value = lhs_ptr[rhs_cols];
res10 += lhs_value * rhs_value0;
res11 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
res10 += dst_offset;
res11 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
res10 = MAX(res10, activation_min);
res10 = MIN(res10, activation_max);
res11 = MAX(res11, activation_min);
res11 = MIN(res11, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
dst_ptr += rhs_rows;
dst_ptr[0] = (q7_t)res10;
dst_ptr[1] = (q7_t)res11;
dst_ptr += rhs_rows;
lhs_ptr += rhs_cols;
lhs_rows_idx--;
}
// Left-over rows
if (lhs_rows % 2)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
int32_t rhs_cols_idx = 0;
q31_t val0, val1, val2, val3, val4, val5;
for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16)
{
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
}
// Left-over accumulations
for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
}
rhs += 2 * rhs_cols;
dst += 2;
}
if (rhs_rows % 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = 0;
if (bias)
{
res00 = bias[rhs_rows - 1];
}
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value = rhs_ptr[0];
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr += rhs_rows;
}
}
#else
for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
q31_t lhs_offset_contribution0 = 0;
q31_t lhs_offset_contribution1 = 0;
for (int32_t x = 0; x < rhs_cols; ++x)
{
lhs_offset_contribution0 += rhs[x];
lhs_offset_contribution1 += rhs[x + rhs_cols];
}
lhs_offset_contribution0 *= lhs_offset;
lhs_offset_contribution1 *= lhs_offset;
if (bias)
{
lhs_offset_contribution0 += bias[rhs_rows_idx];
lhs_offset_contribution1 += bias[rhs_rows_idx + 1];
}
int32_t lhs_rows_idx = lhs_rows >> 1;
while (lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
q31_t res10 = lhs_offset_contribution0;
q31_t res11 = lhs_offset_contribution1;
for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
lhs_value = lhs_ptr[rhs_cols];
res10 += lhs_value * rhs_value0;
res11 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
res10 += dst_offset;
res11 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
res10 = MAX(res10, activation_min);
res10 = MIN(res10, activation_max);
res11 = MAX(res11, activation_min);
res11 = MIN(res11, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
dst_ptr += rhs_rows;
dst_ptr[0] = (q7_t)res10;
dst_ptr[1] = (q7_t)res11;
dst_ptr += rhs_rows;
lhs_ptr += rhs_cols;
lhs_rows_idx--;
}
// Left-over rows
if (lhs_rows % 2)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
}
rhs += 2 * rhs_cols;
dst += 2;
}
if (rhs_rows % 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = 0;
if (bias)
{
res00 = bias[rhs_rows - 1];
}
for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--)
{
q31_t rhs_value = rhs_ptr[0];
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr += rhs_rows;
}
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
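/*
 * Shape sketch (added for illustration; values are placeholder
 * assumptions): lhs is [lhs_rows x rhs_cols], rhs is stored transposed as
 * [rhs_rows x rhs_cols], dst is [lhs_rows x rhs_rows], and the per-channel
 * multiplier/shift arrays have rhs_rows entries.
 */
#include "arm_nnsupportfunctions.h"

void mat_mult_nt_t_s8_example(const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst)
{
    const int32_t lhs_rows = 2, rhs_rows = 4, rhs_cols = 8;
    static const int32_t mult[4] = {1073741824, 1073741824, 1073741824, 1073741824};
    static const int32_t shift[4] = {0, 0, 0, 0};
    (void)arm_nn_mat_mult_nt_t_s8(lhs, rhs, bias, dst, mult, shift,
                                  lhs_rows, rhs_rows, rhs_cols,
                                  /* lhs_offset */ 128, /* dst_offset */ -128,
                                  /* activation_min */ -128, /* activation_max */ 127);
}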

View File

@@ -0,0 +1,73 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mult_q15.c
* Description: Q15 vector multiplication with variable output shifts
*
* $Date: 20. July 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/**
* @brief Q15 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
*/
void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize)
{
uint32_t blkCnt = blockSize; /* loop counter */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
*pDst++ = (q15_t)__SSAT(((q31_t)((q31_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16);
/* Decrement the blockSize loop counter */
blkCnt--;
}
}
/**
* @} end of NNBasicMath group
*/
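/*
 * Worked example (added commentary): with out_shift = 8, NN_ROUND adds
 * 1 << 7 = 128 before the shift, so the product rounds to nearest instead
 * of truncating.
 */
#include "arm_nnsupportfunctions.h"

void mult_q15_example(void)
{
    q15_t a[1] = {3}, b[1] = {43}, out[1];
    arm_nn_mult_q15(a, b, out, 8, 1);
    /* raw product 129: (129 + 128) >> 8 == 1, while plain truncation
       129 >> 8 would give 0 */
    (void)out;
}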

View File

@@ -0,0 +1,73 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mult_q7.c
* Description: Q7 vector multiplication with variable output shifts
*
* $Date: 20. July 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/**
* @brief Q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize)
{
uint32_t blkCnt = blockSize; /* loop counter */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
*pDst++ = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
/* Decrement the blockSize loop counter */
blkCnt--;
}
}
/**
* @} end of NNBasicMath group
*/

View File

@@ -0,0 +1,211 @@
/*
* Copyright (C) 2020-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_vec_mat_mult_t_s16
* Description: s16 vector by matrix (transposed) multiplication
*
* $Date: 04. January 2022
* $Revision: V.1.2.0
*
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s16 vector (lhs) by matrix (transposed) multiplication
*
* Refer to the header file for details.
*
*/
arm_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs,
const q7_t *rhs,
const q63_t *bias,
q15_t *dst,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
const int32_t row_loop_cnt = rhs_rows / 2;
int32_t rhs_cols_fast = rhs_cols;
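/* Limit the 32-bit SMLAD accumulation to the first 512 columns to avoid
overflowing the int32 partial sums (each q15 x q7 product fits in 23 bits);
any remaining columns are handled by the 64-bit scalar tail loop below. */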
if (rhs_cols > 512)
{
rhs_cols_fast = 512;
}
for (int32_t i = 0; i < row_loop_cnt; i++)
{
q63_t acc_64_0 = 0;
q63_t acc_64_1 = 0;
int32_t acc_0 = 0;
int32_t acc_1 = 0;
const int32_t col_loop_cnt = rhs_cols_fast / 4;
const int16_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
rhs += 2 * rhs_cols;
for (int j = col_loop_cnt; j != 0; j--)
{
int32_t ker_0, ker_1, vec_part_0, vec_part_1;
vec_part_0 = arm_nn_read_q15x2_ia(&lhs_vec);
vec_part_1 = arm_nn_read_q15x2_ia(&lhs_vec);
rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1);
acc_0 = __SMLAD(ker_0, vec_part_0, acc_0);
acc_0 = __SMLAD(ker_1, vec_part_1, acc_0);
rhs_1 = read_and_pad(rhs_1, &ker_0, &ker_1);
acc_1 = __SMLAD(ker_0, vec_part_0, acc_1);
acc_1 = __SMLAD(ker_1, vec_part_1, acc_1);
}
acc_64_0 += acc_0;
acc_64_1 += acc_1;
for (int k = col_loop_cnt * 4; k < rhs_cols; k++)
{
const int32_t lhs_temp = (*lhs_vec);
lhs_vec++;
acc_64_0 += lhs_temp * (*rhs_0);
rhs_0++;
acc_64_1 += lhs_temp * (*rhs_1);
rhs_1++;
}
if (bias)
{
acc_64_0 += *bias++;
acc_64_1 += *bias++;
}
q31_t tmp;
tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift);
tmp = MAX(tmp, activation_min);
tmp = MIN(tmp, activation_max);
*dst++ = (q15_t)tmp;
tmp = arm_nn_requantize_s64(acc_64_1, dst_multiplier, dst_shift);
tmp = MAX(tmp, activation_min);
tmp = MIN(tmp, activation_max);
*dst++ = (q15_t)tmp;
}
if (rhs_rows & 0x1)
{
q63_t acc_64_0 = 0;
int32_t acc_0 = 0;
const int32_t col_loop_cnt = rhs_cols_fast / 4;
const int16_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
for (int i = col_loop_cnt; i != 0; i--)
{
int32_t ker_0, ker_1, vec;
rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1);
vec = arm_nn_read_q15x2_ia(&lhs_vec);
acc_0 = __SMLAD(ker_0, vec, acc_0);
vec = arm_nn_read_q15x2_ia(&lhs_vec);
acc_0 = __SMLAD(ker_1, vec, acc_0);
}
acc_64_0 += acc_0;
for (int j = col_loop_cnt * 4; j < rhs_cols; j++)
{
const int32_t lhs_temp = (*lhs_vec);
lhs_vec++;
acc_64_0 += lhs_temp * (*rhs_0);
rhs_0++;
}
if (bias)
{
acc_64_0 += *bias++;
}
q31_t tmp;
tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift);
tmp = MAX(tmp, activation_min);
tmp = MIN(tmp, activation_max);
*dst++ = (q15_t)tmp;
}
#else
for (int i_row_loop_cnt = 0; i_row_loop_cnt < rhs_rows; i_row_loop_cnt++)
{
const q15_t *lhs_ptr = lhs;
const q7_t *rhs_ptr_0 = &rhs[0];
q63_t result = 0;
if (bias)
{
result = *bias++;
}
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
const q63_t rhs_value0 = (int8_t)*rhs_ptr_0;
const q63_t lhs_value = *lhs_ptr;
result += lhs_value * rhs_value0;
++rhs_ptr_0;
++lhs_ptr;
}
// Quantize down
result = arm_nn_requantize_s64(result, dst_multiplier, dst_shift);
// Clamp the result
result = ((result) > (activation_min) ? (result) : (activation_min));
result = ((result) < (activation_max) ? (result) : (activation_max));
*dst++ = (q15_t)result;
rhs += rhs_cols;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/

View File

@@ -0,0 +1,402 @@
/*
* Copyright (C) 2020-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_vec_mat_mult_t_s8
* Description: s8 vector by matrix (transposed) multiplication
*
* $Date: 28 April 2022
* $Revision: V.3.0.1
*
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 vector (lhs) by matrix (transposed) multiplication
*
* Refer to the header file for details.
*
*/
arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t lhs_offset,
const int32_t rhs_offset,
const int32_t dst_offset,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max,
const int32_t address_offset)
{
(void)rhs_offset;
#if defined(ARM_MATH_MVEI)
const int32_t row_loop_cnt = rhs_rows / 3;
const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3};
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
{
int32_t acc_0 = 0;
int32_t acc_1 = 0;
int32_t acc_2 = 0;
const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
const int8_t *rhs_2 = rhs + 2 * rhs_cols;
int32_t rhs_sum_0 = 0;
int32_t rhs_sum_1 = 0;
int32_t rhs_sum_2 = 0;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
{
mve_pred16_t p = vctp8q(col_cnt);
col_cnt -= 16;
const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p);
rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p);
acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p);
const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p);
rhs_sum_1 = vaddvaq_p_s8(rhs_sum_1, ker_1, p);
acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p);
const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p);
rhs_sum_2 = vaddvaq_p_s8(rhs_sum_2, ker_2, p);
acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p);
lhs_vec += 16;
rhs_0 += 16;
rhs_1 += 16;
rhs_2 += 16;
}
rhs += 3 * rhs_cols;
int32x4_t acc = {acc_0, acc_1, acc_2, 0};
mve_pred16_t p = vctp32q(3);
if (bias)
{
int32x4_t b = vldrwq_z_s32(bias, p);
acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p);
bias += 3;
}
const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
acc += vdupq_n_s32(lhs_offset) * rhs_sum;
acc = arm_requantize_mve(acc, dst_multiplier, dst_shift);
acc = vaddq_s32(acc, vdupq_n_s32(dst_offset));
acc = vmaxq_s32(acc, vdupq_n_s32(activation_min));
acc = vminq_s32(acc, vdupq_n_s32(activation_max));
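/* With address_offset > 1 the three outputs are strided in memory, so the
int32 lanes are narrowed to bytes and written with a scatter store. */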
if (address_offset > 1L)
{
vstrbq_scatter_offset_s32(dst, address_offset_array, acc);
}
else
{
vstrbq_p_s32(dst, acc, p);
}
dst += 3 * address_offset;
}
const int loop_cnt = rhs_rows % 3;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++)
{
int32_t acc_0 = 0;
const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
int32_t rhs_sum_0 = 0;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
{
mve_pred16_t p = vctp8q(col_cnt);
col_cnt -= 16;
const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p);
rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p);
acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p);
lhs_vec += 16;
rhs_0 += 16;
}
rhs += rhs_cols;
if (bias)
{
acc_0 += *bias;
bias++;
}
const int32_t offsets = rhs_sum_0 * lhs_offset;
acc_0 += offsets;
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
acc_0 += dst_offset;
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
*dst = MIN(acc_0, activation_max);
dst += address_offset;
}
#elif defined(ARM_MATH_DSP)
const int32_t row_loop_cnt = rhs_rows / 2;
const int16_t lhs_offset_s16 = (int16_t)lhs_offset;
const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16);
for (int32_t i = 0; i < row_loop_cnt; i++)
{
int32_t acc_0 = 0;
int32_t acc_1 = 0;
if (bias)
{
acc_0 = *bias++;
acc_1 = *bias++;
}
const int32_t col_loop_cnt = rhs_cols / 4;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
rhs += 2 * rhs_cols;
for (int j = col_loop_cnt; j != 0; j--)
{
int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec);
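/* __SXTAB16 sign-extends bytes 0 and 2 of the packed word and adds the
duplicated lhs offset; the ROR-by-8 variant does the same for bytes 1 and 3. */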
int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8);
vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0);
int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0);
int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8);
ker_0 = __SXTB16(ker_0);
acc_0 = __SMLAD(ker_1, vec_1, acc_0);
acc_0 = __SMLAD(ker_0, vec_0, acc_0);
ker_0 = arm_nn_read_q7x4_ia(&rhs_1);
ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8);
ker_0 = __SXTB16(ker_0);
acc_1 = __SMLAD(ker_1, vec_1, acc_1);
acc_1 = __SMLAD(ker_0, vec_0, acc_1);
}
for (int k = col_loop_cnt * 4; k < rhs_cols; k++)
{
const int32_t lhs_temp = (*lhs_vec + lhs_offset);
lhs_vec++;
acc_0 += lhs_temp * (*rhs_0);
rhs_0++;
acc_1 += lhs_temp * (*rhs_1);
rhs_1++;
}
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift);
// Add offset
acc_0 += dst_offset;
acc_1 += dst_offset;
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
acc_1 = MAX(acc_1, activation_min);
acc_1 = MIN(acc_1, activation_max);
*dst = (int8_t)acc_0;
*(dst + address_offset) = (int8_t)acc_1;
dst += 2 * address_offset;
}
if (rhs_rows & 0x1)
{
int32_t acc_0 = 0;
if (bias)
{
acc_0 = *bias++;
}
const int32_t col_loop_cnt = rhs_cols / 4;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
for (int i = col_loop_cnt; i != 0; i--)
{
int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec);
int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8);
vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0);
int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0);
int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8);
ker_0 = __SXTB16(ker_0);
acc_0 = __SMLAD(ker_1, vec_1, acc_0);
acc_0 = __SMLAD(ker_0, vec_0, acc_0);
}
for (int j = col_loop_cnt * 4; j < rhs_cols; j++)
{
const int32_t lhs_temp = (*lhs_vec + lhs_offset);
lhs_vec++;
acc_0 += lhs_temp * (*rhs_0);
rhs_0++;
}
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
// Add offset
acc_0 += dst_offset;
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
*dst = (int8_t)acc_0;
dst += address_offset;
}
#else
const int32_t row_loop_cnt = rhs_rows / 3;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
{
const q7_t *lhs_ptr = lhs;
const q7_t *rhs_ptr_0 = &rhs[0];
const q7_t *rhs_ptr_1 = &rhs[rhs_cols];
const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2];
q31_t res00 = 0;
q31_t res01 = 0;
q31_t res02 = 0;
if (bias)
{
res00 = *bias++;
res01 = *bias++;
res02 = *bias++;
}
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
const q31_t rhs_value0 = (int8_t)*rhs_ptr_0;
const q31_t rhs_value1 = (int8_t)*rhs_ptr_1;
const q31_t rhs_value2 = (int8_t)*rhs_ptr_2;
const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset;
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
res02 += lhs_value * rhs_value2;
++rhs_ptr_0;
++rhs_ptr_1;
++rhs_ptr_2;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift);
res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
res02 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
res02 = MAX(res02, activation_min);
res02 = MIN(res02, activation_max);
*dst = (q7_t)res00;
*(dst + address_offset) = (q7_t)res01;
*(dst + 2 * address_offset) = (q7_t)res02;
dst += 3 * address_offset;
rhs += 3 * rhs_cols;
}
const int loop_cnt = rhs_rows % 3;
for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++)
{
const q7_t *lhs_ptr = &lhs[0];
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = 0;
if (bias)
{
res00 = *bias++;
}
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value0 = (int8_t)rhs_ptr[0];
q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value0;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
*dst = (int8_t)res00;
dst += address_offset;
rhs += rhs_cols;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
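The MVE path above keeps the lhs offset out of the inner loop by also accumulating each kernel row's sum, using the identity sum((x_i + off) * w_i) = sum(x_i * w_i) + off * sum(w_i). A scalar sketch of that folding for one output row (names are illustrative):

#include <stdint.h>

/* One row of an s8 vector by transposed-matrix product, applying the lhs
offset once after the loop instead of once per element. */
static int32_t dot_with_folded_offset(const int8_t *lhs, const int8_t *rhs_row,
                                      int32_t cols, int32_t lhs_offset)
{
    int32_t acc = 0;
    int32_t rhs_sum = 0;
    for (int32_t i = 0; i < cols; i++)
    {
        acc += (int32_t)lhs[i] * rhs_row[i];
        rhs_sum += rhs_row[i];
    }
    /* Equals the direct sum of (lhs[i] + lhs_offset) * rhs_row[i]. */
    return acc + lhs_offset * rhs_sum;
}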

View File

@@ -0,0 +1,341 @@
/*
* Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_vec_mat_mult_t_svdf_s8
* Description: s8 vector by matrix (transposed) multiplication with
*                 s16 output. Targeted at the SVDF operator.
*
* $Date: 15. April 2021
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 vector (lhs) by matrix (transposed) multiplication
*
* Refer to the header file for details.
*
*/
arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs,
const q7_t *rhs,
q15_t *dst,
const int32_t lhs_offset,
const int32_t rhs_offset,
const int32_t dst_offset,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max)
{
(void)rhs_offset;
if (rhs_cols < 0 || (NN_Q31_MAX - rhs_cols) < 16 || dst_offset < 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
#if defined(ARM_MATH_MVEI)
int32_t row_loop_cnt = rhs_rows / 3;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
{
int32_t acc_0 = 0;
int32_t acc_1 = 0;
int32_t acc_2 = 0;
const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
const int8_t *rhs_2 = rhs + 2 * rhs_cols;
int32_t rhs_sum_0 = 0;
int32_t rhs_sum_1 = 0;
int32_t rhs_sum_2 = 0;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
{
mve_pred16_t p = vctp8q(col_cnt);
col_cnt -= 16;
const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p);
rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p);
acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p);
const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p);
rhs_sum_1 = vaddvaq_p_s8(rhs_sum_1, ker_1, p);
acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p);
const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p);
rhs_sum_2 = vaddvaq_p_s8(rhs_sum_2, ker_2, p);
acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p);
lhs_vec += 16;
rhs_0 += 16;
rhs_1 += 16;
rhs_2 += 16;
}
rhs += 3 * rhs_cols;
int32x4_t acc = {acc_0, acc_1, acc_2, 0};
const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
acc += vdupq_n_s32(lhs_offset) * rhs_sum;
acc = arm_requantize_mve(acc, dst_multiplier, dst_shift);
acc = vmaxq_s32(acc, vdupq_n_s32(activation_min));
acc = vminq_s32(acc, vdupq_n_s32(activation_max));
*(dst) = (int16_t)acc[0];
*(dst + dst_offset) = (int16_t)acc[1];
*(dst + 2 * dst_offset) = (int16_t)acc[2];
dst += 3 * dst_offset;
}
const int loop_cnt = rhs_rows % 3;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++)
{
int32_t acc_0 = 0;
const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
int32_t rhs_sum_0 = 0;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
{
mve_pred16_t p = vctp8q(col_cnt);
col_cnt -= 16;
const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p);
rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p);
acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p);
lhs_vec += 16;
rhs_0 += 16;
}
rhs += rhs_cols;
const int32_t offsets = rhs_sum_0 * lhs_offset;
acc_0 = __QADD(acc_0, offsets);
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
*dst = (q15_t)MIN(acc_0, activation_max);
dst += dst_offset;
}
#elif defined(ARM_MATH_DSP)
int32_t row_loop_cnt = rhs_rows / 2;
const int16_t lhs_offset_s16 = lhs_offset;
const int16_t rhs_offset_s16 = rhs_offset;
const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16);
const uint32_t rhs_offset_s16x2 = __PKHBT(rhs_offset_s16, rhs_offset_s16, 16);
for (int32_t i = 0; i < row_loop_cnt; i++)
{
int32_t acc_0 = 0;
int32_t acc_1 = 0;
const int32_t col_loop_cnt = rhs_cols / 4;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
rhs += 2 * rhs_cols;
for (int j = col_loop_cnt; j != 0; j--)
{
int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec);
int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8);
vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0);
int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0);
int32_t ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8);
ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0);
acc_0 = __SMLAD(ker_1, vec_1, acc_0);
acc_0 = __SMLAD(ker_0, vec_0, acc_0);
ker_0 = arm_nn_read_q7x4_ia(&rhs_1);
ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8);
ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0);
acc_1 = __SMLAD(ker_1, vec_1, acc_1);
acc_1 = __SMLAD(ker_0, vec_0, acc_1);
}
for (int k = col_loop_cnt * 4; k < rhs_cols; k++)
{
const int32_t lhs_temp = (*lhs_vec + lhs_offset);
lhs_vec++;
acc_0 += lhs_temp * (*rhs_0 + rhs_offset);
rhs_0++;
acc_1 += lhs_temp * (*rhs_1 + rhs_offset);
rhs_1++;
}
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift);
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
acc_1 = MAX(acc_1, activation_min);
acc_1 = MIN(acc_1, activation_max);
*dst = (q15_t)acc_0;
*(dst + dst_offset) = (q15_t)acc_1;
dst += 2 * dst_offset;
}
if (rhs_rows & 0x1)
{
int32_t acc_0 = 0;
const int32_t col_loop_cnt = rhs_cols / 4;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
for (int i = col_loop_cnt; i != 0; i--)
{
int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec);
int32_t vec_1 = __SXTAB16(lhs_offset_s16x2, __ROR((uint32_t)vec_0, 8));
vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0);
int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0);
int32_t ker_1 = __SXTAB16(rhs_offset_s16x2, __ROR((uint32_t)ker_0, 8));
ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0);
acc_0 = __SMLAD(ker_1, vec_1, acc_0);
acc_0 = __SMLAD(ker_0, vec_0, acc_0);
}
for (int j = col_loop_cnt * 4; j < rhs_cols; j++)
{
const int32_t lhs_temp = (*lhs_vec + lhs_offset);
lhs_vec++;
acc_0 += lhs_temp * (*rhs_0 + rhs_offset);
rhs_0++;
}
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
*dst = (q15_t)acc_0;
dst += dst_offset;
}
#else
int32_t row_loop_cnt = rhs_rows / 3;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
{
const q7_t *lhs_ptr = lhs;
const q7_t *rhs_ptr_0 = &rhs[0];
const q7_t *rhs_ptr_1 = &rhs[rhs_cols];
const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2];
q31_t res00 = 0;
q31_t res01 = 0;
q31_t res02 = 0;
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
const q31_t rhs_value0 = (int8_t)*rhs_ptr_0;
const q31_t rhs_value1 = (int8_t)*rhs_ptr_1;
const q31_t rhs_value2 = (int8_t)*rhs_ptr_2;
const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset;
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
res02 += lhs_value * rhs_value2;
++rhs_ptr_0;
++rhs_ptr_1;
++rhs_ptr_2;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift);
res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift);
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
res02 = MAX(res02, activation_min);
res02 = MIN(res02, activation_max);
*dst = (q15_t)res00;
*(dst + dst_offset) = (q15_t)res01;
*(dst + 2 * dst_offset) = (q15_t)res02;
dst += 3 * dst_offset;
rhs += 3 * rhs_cols;
}
const int loop_cnt = rhs_rows % 3;
for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++)
{
const q7_t *lhs_ptr = &lhs[0];
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = 0;
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset;
q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value0;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
*dst = (q15_t)res00;
dst += dst_offset;
rhs += rhs_cols;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
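Note that dst_offset in this SVDF variant is an element stride between consecutive s16 outputs rather than an additive quantization offset (hence the dst_offset < 0 argument check above). A self-contained call sketch under that reading; all names and quantization values are illustrative placeholders, and arm_nnsupportfunctions.h is assumed to be on the include path:

#include "arm_nnsupportfunctions.h"

void svdf_vec_mat_call_sketch(void)
{
    static const q7_t input_q7[8] = {0};
    static const q7_t weights_q7[4 * 8] = {0};
    static q15_t state_q15[4 * 3]; /* one result written every 3 elements */

    (void)arm_nn_vec_mat_mult_t_svdf_s8(input_q7,
                                        weights_q7,
                                        state_q15,
                                        0,          /* lhs_offset */
                                        0,          /* rhs_offset (ignored) */
                                        3,          /* dst_offset used as stride */
                                        1073741824, /* dst_multiplier, ~0.5 in Q31 */
                                        1,          /* dst_shift */
                                        8,          /* rhs_cols */
                                        4,          /* rhs_rows */
                                        -32768,     /* activation_min */
                                        32767);     /* activation_max */
}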

View File

@@ -0,0 +1,203 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nntables.c
* Description:  Common tables for various activation functions
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @brief Tables for various activation functions
*
* This file includes the definitions of common tables.
* Most of them are used by the activation functions.
*
* Assumption:
* Unified table: input is in 3.x format, i.e., the range is [-8, 8)
* sigmoid(8) = 0.9996646498695336
* tanh(8) = 0.9999997749296758
* The accuracy here should be good enough
*
* 2-stage HL table:
*
* The entire input range is divided into two parts:
*
* Low range table: 0x000x xxxx or 0x111x xxxx
* The table entry is the binary number excluding the first
* two digits, i.e., 0x0x xxxx or 0x1x xxxx
*
* High range table: 0x0010 0000 -- 0x0111 1111
*                   0x1000 0000 -- 0x1101 1111
*
* For positive numbers, the table entry is
* 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000,
* i.e., 0x0000 0000 - 0x0101 1111
*
* The same applies to the negative numbers: the table entry is
* 0x1000 0000 -- 0x1101 1111 minus 0x0010 0000,
* i.e., 0x0110 0000 - 0x1011 1111
*/
const q7_t sigmoidTable_q7[256] = {
0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, 0x5e, 0x5f, 0x61,
0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x72, 0x73, 0x74, 0x74,
0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c,
0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, 0x06,
0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x11, 0x12,
0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d,
0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
};
const q15_t sigmoidTable_q15[256] = {
0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8,
0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, 0x68a6, 0x69d2, 0x6af1, 0x6c05,
0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6,
0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03,
0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f,
0x7e69, 0x7e81, 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a,
0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, 0x7faf, 0x7fb4,
0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0,
0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3,
0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016,
0x0017, 0x0019, 0x001a, 0x001c, 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034,
0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d,
0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b,
0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, 0x024d, 0x0273, 0x029a, 0x02c4,
0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671,
0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70,
0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea,
0x1f5f, 0x20e0, 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615,
0x380b, 0x3a04, 0x3c01, 0x3e00,
};
const q15_t sigmoidLTable_q15[128] = {
0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc,
0x4cd3, 0x4dc8, 0x4ebb, 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, 0x56ef, 0x57cd,
0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216,
0x62cc, 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, 0x68a6, 0x693d, 0x69d2, 0x6a63,
0x6af1, 0x6b7c, 0x6c05, 0x6c8a, 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, 0x0f42,
0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d,
0x162e, 0x16c3, 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, 0x1c81, 0x1d34, 0x1dea,
0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833,
0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, 0x3053, 0x3145, 0x3238, 0x332d, 0x3424,
0x351b, 0x3615, 0x370f, 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00,
};
const q15_t sigmoidHTable_q15[192] = {
0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0,
0x792a, 0x798f, 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0,
0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, 0x7e98, 0x7eae,
0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72,
0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5,
0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7,
0x7fe9, 0x7fea, 0x7feb, 0x7fed, 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c,
0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c,
0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043,
0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, 0x0085, 0x008e, 0x0097, 0x00a1,
0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f,
0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388,
0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828,
0x08a5, 0x092a, 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70,
};
const q7_t tanhTable_q7[256] = {
0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, 0x61, 0x65, 0x68,
0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e,
0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x81,
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, 0x85, 0x85, 0x86, 0x87,
0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9,
0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8,
};
const q15_t tanhTable_q15[256] = {
0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2,
0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, 0x73dc, 0x753a, 0x7672, 0x7788,
0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d,
0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc,
0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4,
0x7ff6, 0x7ff7, 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe,
0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001,
0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006,
0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, 0x8016, 0x8019, 0x801c, 0x8020,
0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8,
0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412,
0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9,
0x9869, 0x9b50, 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941,
0xe0a7, 0xe847, 0xf015, 0xf803,
};
const q15_t tanhLTable_q15[128] = {
0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf,
0x3151, 0x34ae, 0x37f6, 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, 0x514d, 0x53a3,
0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37,
0x6b6e, 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, 0x73dc, 0x7490, 0x753a, 0x75da,
0x7672, 0x7701, 0x7788, 0x7807, 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, 0x849b,
0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26,
0x8ac6, 0x8b70, 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, 0x936b, 0x9492, 0x95c9,
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d,
0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221,
0xd5a8, 0xd941, 0xdcec, 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00,
};
const q15_t tanhHTable_q15[192] = {
0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14,
0x7f30, 0x7f48, 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7,
0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, 0x7ff8, 0x7ff9,
0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8002,
0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009,
0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035,
0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e,
0x8156, 0x8183, 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412,
};
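A sketch of how the unified 256-entry tables above are typically indexed, assuming a q7 input in 3.4 format (range [-8, 8)): reinterpreting the signed byte as unsigned maps positive inputs to entries 0-127 and negative inputs to entries 128-255, which matches the table layout.

#include <stdint.h>

extern const int8_t sigmoidTable_q7[256];

/* Unified-table sigmoid lookup for a q7 input in 3.4 format. */
static int8_t sigmoid_q7_lookup(int8_t in_q7)
{
    /* Two's-complement wrap: negative inputs land in entries 128..255. */
    const uint8_t idx = (uint8_t)in_q7;
    return sigmoidTable_q7[idx];
}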

View File

@@ -0,0 +1,121 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_no_shift.c
* Description: Converts the elements of the Q7 vector to Q15 vector without left-shift
*
* $Date: May 29, 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
/**
* @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
*
* \par Description:
*
* The equation used for the conversion process is:
*
* <pre>
* pDst[n] = (q15_t) pSrc[n]; 0 <= n < blockSize.
* </pre>
*
*/
void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
{
const q7_t *pIn = pSrc;
uint32_t blkCnt;
#if defined(ARM_MATH_DSP)
q31_t in;
q31_t in1, in2;
q31_t out1, out2;
/* Loop unrolling */
blkCnt = blockSize >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. */
while (blkCnt > 0u)
{
in = arm_nn_read_q7x4_ia(&pIn);
/* rotate in by 8 and extend two q7_t values to q15_t values */
in1 = __SXTB16(__ROR((uint32_t)in, 8));
/* extend remaining two q7_t values to q15_t values */
in2 = __SXTB16(in);
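/* Re-pack the sign-extended half-words so pDst keeps the original sample
order; which halves pair up depends on the endianness selected below. */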
#ifndef ARM_MATH_BIG_ENDIAN
out2 = (int32_t)__PKHTB(in1, in2, 16);
out1 = (int32_t)__PKHBT(in2, in1, 16);
#else
out1 = (int32_t)__PKHTB(in1, in2, 16);
out2 = (int32_t)__PKHBT(in2, in1, 16);
#endif
arm_nn_write_q15x2_ia(&pDst, out1);
arm_nn_write_q15x2_ia(&pDst, out2);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4u;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* ARM_MATH_DSP */
while (blkCnt > 0u)
{
/* convert from q7 to q15 and then store the results in the destination buffer */
*pDst++ = (q15_t)*pIn++;
/* Decrement the loop counter */
blkCnt--;
}
}
/**
* @} end of nndata_convert group
*/

View File

@@ -0,0 +1,143 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_reordered_no_shift.c
* Description: Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
*
* $Date: July 20, 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
/**
* @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
*
* @details
*
* This function does the q7 to q15 expansion with re-ordering
*
* <pre>
* | A1 | A2 | A3 | A4 |
*
* 0 7 8 15 16 23 24 31
* </pre>
*
* is converted into:
*
* <pre>
* | A1 | A3 | and | A2 | A4 |
*
* 0 15 16 31 0 15 16 31
* </pre>
*
*
* This looks strange, but it is natural considering how sign-extension is done
* at the assembly level.
*
* The expansion of the other operand will follow the same rule so that the end
* results are the same.
*
* The tail (i.e., the last (N % 4) elements) will still be in the original order.
*
*/
void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
{
const q7_t *pIn = pSrc; /* Src pointer */
uint32_t blkCnt; /* loop counter */
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
q31_t in;
q31_t in1, in2;
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop unrolling */
blkCnt = blockSize >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0u)
{
/* C = (q15_t) A << 8 */
/* convert from q7 to q15 and then store the results in the destination buffer */
in = arm_nn_read_q7x4_ia(&pIn);
/* rotate in by 8 and extend two q7_t values to q15_t values */
in1 = __SXTB16(__ROR((uint32_t)in, 8));
/* extend remaining two q7_t values to q15_t values */
in2 = __SXTB16(in);
#ifndef ARM_MATH_BIG_ENDIAN
arm_nn_write_q7x4_ia((q7_t **)&pDst, in2);
arm_nn_write_q7x4_ia((q7_t **)&pDst, in1);
#else
arm_nn_write_q7x4_ia((q7_t **)&pDst, in1);
arm_nn_write_q7x4_ia((q7_t **)&pDst, in2);
#endif
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4u;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* ARM_MATH_DSP && !ARM_MATH_MVEI */
while (blkCnt > 0u)
{
/* C = (q15_t) A << 8 */
/* convert from q7 to q15 and then store the results in the destination buffer */
*pDst++ = (q15_t)*pIn++;
/* Decrement the loop counter */
blkCnt--;
}
}
/**
* @} end of nndata_convert group
*/
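A plain-C sketch of the reordering described above, assuming a little-endian layout: four consecutive q7 samples come out as the q15 pairs {src[0], src[2]} followed by {src[1], src[3]}, which is harmless as long as the other operand of the subsequent dot product is expanded the same way.

#include <stdint.h>

/* Expand 4 q7 values to q15 in the interleaved order produced by the
__SXTB16 / __ROR pair above. */
static void q7_to_q15_reordered_x4(const int8_t *src, int16_t *dst)
{
    dst[0] = src[0];
    dst[1] = src[2];
    dst[2] = src[1];
    dst[3] = src[3];
}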

View File

@@ -0,0 +1,100 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_reordered_with_offset.c
* Description: Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset. The re-ordering
*               is an artifact of the sign-extension intrinsics (DSP extension).
*
* $Date: May 29, 2020
* $Revision: V.2.0.3
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
/**
* @brief Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset.
*
* @note Refer to the header file for details.
*
*/
void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset)
{
#if defined(ARM_MATH_DSP)
uint32_t block_cnt;
/* Run the below code for cores that support SIMD instructions */
q31_t in_q7x4;
q31_t out_q15x2_1;
q31_t out_q15x2_2;
/* Loop unrolling */
block_cnt = block_size >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. */
const q31_t offset_q15x2 = (q31_t)__PKHBT(offset, offset, 16);
while (block_cnt > 0u)
{
/* convert from q7 to q15 and then store the results in the destination buffer */
in_q7x4 = arm_nn_read_q7x4_ia(&src);
/* Extract and sign extend each of the four q7 values to q15 */
out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8));
out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
block_cnt--;
}
/* Handle left over samples */
block_cnt = block_size % 0x4u;
while (block_cnt > 0u)
{
*dst++ = (q15_t)*src++ + offset;
/* Decrement the loop counter */
block_cnt--;
}
#else
(void)src;
(void)dst;
(void)block_size;
(void)offset;
/* Not available */
#endif
}
/**
* @} end of nndata_convert group
*/

View File

@@ -0,0 +1,114 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_with_offset.c
* Description: Converts the elements of the Q7 vector to Q15 vector with an added offset
*
* $Date: March 3, 2020
* $Revision: V.2.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset)
{
int block_cnt;
#if defined(ARM_MATH_MVEI)
int16x8_t source;
const int16x8_t source_offset = vdupq_n_s16(offset);
block_cnt = block_size / 8;
while (block_cnt > 0)
{
source = vldrbq_s16(src);
source = vaddq_s16(source, source_offset);
vstrhq_s16(dst, source);
dst += 8;
src += 8;
block_cnt--;
}
block_cnt = block_size & 0x7;
#elif defined(ARM_MATH_DSP)
/* Run the below code for cores that support SIMD instructions */
q31_t in_q7x4;
q31_t in_q15x2_1;
q31_t in_q15x2_2;
q31_t out_q15x2_1;
q31_t out_q15x2_2;
/* Loop unrolling */
block_cnt = block_size >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. */
const q31_t offset_q15x2 = __PKHBT(offset, offset, 16);
while (block_cnt > 0)
{
/* convert from q7 to q15 and then store the results in the destination buffer */
in_q7x4 = arm_nn_read_q7x4_ia(&src);
/* Extract and sign extend each of the four q7 values to q15 */
in_q15x2_1 = __SXTAB16(offset_q15x2, __ROR(in_q7x4, 8));
in_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
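/* __PKHTB / __PKHBT re-pack the half-words so the q15 output keeps the
original q7 sample order (contrast with the "reordered" variants). */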
out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16);
out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16);
arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
block_cnt--;
}
/* Handle left over samples */
block_cnt = block_size % 0x4;
#else
/* Run the below code for Cortex-M0 */
/* Loop over block_size number of values */
block_cnt = block_size;
#endif
while (block_cnt > 0)
{
*dst++ = (q15_t)*src++ + offset;
/* Decrement the loop counter */
block_cnt--;
}
}
/**
* @} end of nndata_convert group
*/

View File

@@ -0,0 +1,24 @@
#
# Copyright (c) 2019-2022 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8.c")
file(GLOB SRC_S16 "./*_s16.c")
target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16})

View File

@@ -0,0 +1,128 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_avgpool_s16.c
* Description: Pooling function implementations
*
* $Date: 3. February 2022
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Pooling
* @{
*/
/*
* s16 average pooling function
*
* Refer to header file for details.
*
*/
arm_status arm_avgpool_s16(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const q15_t *src,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
q15_t *dst)
{
(void)ctx;
const int32_t input_y = input_dims->h;
const int32_t input_x = input_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_x = output_dims->w;
const int32_t stride_y = pool_params->stride.h;
const int32_t stride_x = pool_params->stride.w;
const int32_t kernel_y = filter_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t pad_y = pool_params->padding.h;
const int32_t pad_x = pool_params->padding.w;
const int32_t act_min = pool_params->activation.min;
const int32_t act_max = pool_params->activation.max;
const int32_t ch_src = input_dims->c;
/* Reference C code adapted from CMSIS-NN arm_avgpool_s8.c. */
for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
{
for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
{
/* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
const int32_t ker_y_start = MAX(0, -base_idx_y);
const int32_t ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y);
const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x);
for (int i_ch_in = 0; i_ch_in < ch_src; i_ch_in++)
{
int sum = 0;
int count = 0;
for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++)
{
for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++)
{
sum += src[i_ch_in + ch_src * (k_x + base_idx_x + (k_y + base_idx_y) * input_x)];
count++;
}
}
// Prevent static code issue DIVIDE_BY_ZERO.
if (count == 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
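// Round to nearest, half away from zero: bias the sum by half the
// divisor before the integer division.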
sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count;
sum = MAX(sum, act_min);
sum = MIN(sum, act_max);
dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum;
}
}
}
return ARM_MATH_SUCCESS;
}
int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src)
{
(void)output_x;
(void)ch_src;
return 0;
}
/**
* @} end of Pooling group
*/

View File

@@ -0,0 +1,401 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_avgpool_s8.c
* Description: Pooling function implementations
*
* $Date: 01. March 2021
* $Revision: V.2.0.4
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
static void scale_q31_to_q7_and_clamp(const q31_t *buffer,
q7_t *target,
int32_t length,
const int32_t count,
const int act_min,
const int act_max)
{
const int half_count = count / 2;
// Prevent static code issue DIVIDE_BY_ZERO.
if (count == 0)
{
return;
}
for (int i = 0; i < length; i++)
{
int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count);
sum = sum / count;
sum = MAX(sum, act_min);
sum = MIN(sum, act_max);
target[i] = (q7_t)sum;
}
}
#endif
/**
* @ingroup groupNN
*/
/**
* @addtogroup Pooling
* @{
*/
/*
* s8 average pooling function
*
* Refer to header file for details.
*
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const q7_t *src,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
q7_t *dst)
{
(void)ctx;
const int32_t input_y = input_dims->h;
const int32_t input_x = input_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_x = output_dims->w;
const int32_t stride_y = pool_params->stride.h;
const int32_t stride_x = pool_params->stride.w;
const int32_t kernel_y = filter_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t pad_y = pool_params->padding.h;
const int32_t pad_x = pool_params->padding.w;
const int32_t act_min = pool_params->activation.min;
const int32_t act_max = pool_params->activation.max;
const int32_t ch_src = input_dims->c;
int32_t i_x, i_y;
int32_t k_x, k_y;
for (i_y = 0; i_y < output_y; i_y++)
{
for (i_x = 0; i_x < output_x; i_x++)
{
int32_t k_y_start, k_y_end;
int32_t k_x_start, k_x_end;
int32_t chCnt;
const int8_t *pTmp, *pTmpInner;
int8_t *pDst;
k_y_start = MAX(0, i_y * stride_y - pad_y);
k_y_end = MIN(i_y * stride_y - pad_y + kernel_y, input_y);
k_x_start = MAX(0, i_x * stride_x - pad_x);
k_x_end = MIN(i_x * stride_x - pad_x + kernel_x, input_x);
pTmp = src;
pDst = &dst[ch_src * (i_x + i_y * output_x)];
chCnt = ch_src >> 4;
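/* Process 16 channels per iteration: widen the int8 lanes through int16 to
four int32x4 accumulators so the pooling-window sums cannot overflow. */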
while (chCnt > 0)
{
int32x4_t sumV1, sumV2, sumV3, sumV4;
int8x16_t tempV;
int16x8_t tempVLO, tempVHI;
int32x4_t tempVLOLO, tempVLOHI, tempVHILO, tempVHIHI;
int32_t count = 0;
sumV1 = vdupq_n_s32(0);
sumV2 = vdupq_n_s32(0);
sumV3 = vdupq_n_s32(0);
sumV4 = vdupq_n_s32(0);
for (k_y = k_y_start; k_y < k_y_end; k_y++)
{
for (k_x = k_x_start; k_x < k_x_end; k_x++)
{
pTmpInner = pTmp + (ch_src * (k_x + k_y * input_x));
tempV = vldrbq_s8(pTmpInner);
tempVLO = vmovlbq_s8(tempV);
tempVHI = vmovltq_s8(tempV);
tempVLOLO = vmovlbq_s16(tempVLO);
tempVLOHI = vmovltq_s16(tempVLO);
tempVHILO = vmovlbq_s16(tempVHI);
tempVHIHI = vmovltq_s16(tempVHI);
sumV1 = vaddq_s32(sumV1, tempVLOLO);
sumV2 = vaddq_s32(sumV2, tempVLOHI);
sumV3 = vaddq_s32(sumV3, tempVHILO);
sumV4 = vaddq_s32(sumV4, tempVHIHI);
count++;
}
}
// Prevent static code issue DIVIDE_BY_ZERO.
if (count == 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
sumV1[0] = sumV1[0] > 0 ? (sumV1[0] + count / 2) / count : (sumV1[0] - count / 2) / count;
sumV1[1] = sumV1[1] > 0 ? (sumV1[1] + count / 2) / count : (sumV1[1] - count / 2) / count;
sumV1[2] = sumV1[2] > 0 ? (sumV1[2] + count / 2) / count : (sumV1[2] - count / 2) / count;
sumV1[3] = sumV1[3] > 0 ? (sumV1[3] + count / 2) / count : (sumV1[3] - count / 2) / count;
sumV2[0] = sumV2[0] > 0 ? (sumV2[0] + count / 2) / count : (sumV2[0] - count / 2) / count;
sumV2[1] = sumV2[1] > 0 ? (sumV2[1] + count / 2) / count : (sumV2[1] - count / 2) / count;
sumV2[2] = sumV2[2] > 0 ? (sumV2[2] + count / 2) / count : (sumV2[2] - count / 2) / count;
sumV2[3] = sumV2[3] > 0 ? (sumV2[3] + count / 2) / count : (sumV2[3] - count / 2) / count;
sumV3[0] = sumV3[0] > 0 ? (sumV3[0] + count / 2) / count : (sumV3[0] - count / 2) / count;
sumV3[1] = sumV3[1] > 0 ? (sumV3[1] + count / 2) / count : (sumV3[1] - count / 2) / count;
sumV3[2] = sumV3[2] > 0 ? (sumV3[2] + count / 2) / count : (sumV3[2] - count / 2) / count;
sumV3[3] = sumV3[3] > 0 ? (sumV3[3] + count / 2) / count : (sumV3[3] - count / 2) / count;
sumV4[0] = sumV4[0] > 0 ? (sumV4[0] + count / 2) / count : (sumV4[0] - count / 2) / count;
sumV4[1] = sumV4[1] > 0 ? (sumV4[1] + count / 2) / count : (sumV4[1] - count / 2) / count;
sumV4[2] = sumV4[2] > 0 ? (sumV4[2] + count / 2) / count : (sumV4[2] - count / 2) / count;
sumV4[3] = sumV4[3] > 0 ? (sumV4[3] + count / 2) / count : (sumV4[3] - count / 2) / count;
sumV1 = vmaxq_s32(sumV1, vdupq_n_s32(act_min));
sumV1 = vminq_s32(sumV1, vdupq_n_s32(act_max));
sumV2 = vmaxq_s32(sumV2, vdupq_n_s32(act_min));
sumV2 = vminq_s32(sumV2, vdupq_n_s32(act_max));
sumV3 = vmaxq_s32(sumV3, vdupq_n_s32(act_min));
sumV3 = vminq_s32(sumV3, vdupq_n_s32(act_max));
sumV4 = vmaxq_s32(sumV4, vdupq_n_s32(act_min));
sumV4 = vminq_s32(sumV4, vdupq_n_s32(act_max));
tempVLO = vmovnbq_s32(tempVLO, sumV1);
tempVLO = vmovntq_s32(tempVLO, sumV2);
tempVHI = vmovnbq_s32(tempVHI, sumV3);
tempVHI = vmovntq_s32(tempVHI, sumV4);
tempV = vmovnbq_s16(tempV, tempVLO);
tempV = vmovntq_s16(tempV, tempVHI);
vstrbq_s8(pDst, tempV);
pDst += 16;
chCnt--;
pTmp += 16;
}
chCnt = ch_src & 0xF;
while (chCnt > 0)
{
int32_t sum = 0;
int32_t count = 0;
for (k_y = k_y_start; k_y < k_y_end; k_y++)
{
for (k_x = k_x_start; k_x < k_x_end; k_x++)
{
sum += pTmp[ch_src * (k_x + k_y * input_x)];
count++;
}
}
// Prevent static code issue DIVIDE_BY_ZERO.
if (count == 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count;
sum = MAX(sum, act_min);
sum = MIN(sum, act_max);
*pDst++ = sum;
chCnt--;
pTmp++;
}
}
}
return ARM_MATH_SUCCESS;
}
#else
arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const q7_t *src,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
q7_t *dst)
{
const int32_t input_y = input_dims->h;
const int32_t input_x = input_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_x = output_dims->w;
const int32_t stride_y = pool_params->stride.h;
const int32_t stride_x = pool_params->stride.w;
const int32_t kernel_y = filter_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t pad_y = pool_params->padding.h;
const int32_t pad_x = pool_params->padding.w;
const int32_t act_min = pool_params->activation.min;
const int32_t act_max = pool_params->activation.max;
const int32_t ch_src = input_dims->c;
if (ctx->buf == NULL && arm_avgpool_s8_get_buffer_size(output_dims->w, input_dims->c))
{
return ARM_MATH_ARGUMENT_ERROR;
}
q31_t *buffer = (q31_t *)ctx->buf;
#if defined(ARM_MATH_DSP)
    /* Run the following code for CPUs with DSP extension */
for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++)
{
for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++)
{
/* Condition for kernel start dimension:
(base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
const int32_t kernel_y_start = MAX(0, -idx_y);
const int32_t kernel_x_start = MAX(0, -idx_x);
/* Condition for kernel end dimension:
(base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y);
const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x);
int count = 0;
for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++)
{
for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++)
{
const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x);
if (count == 0)
{
for (int i = 0; i < ch_src; i++)
{
buffer[i] = start[i];
}
}
else
{
for (int i = 0; i < ch_src; i++)
{
buffer[i] = __QADD(start[i], buffer[i]);
}
}
count++;
}
}
// Prevent static code issue DIVIDE_BY_ZERO.
if (count == 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
scale_q31_to_q7_and_clamp(buffer, dst, ch_src, count, act_min, act_max);
dst += ch_src;
}
}
#else
/* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC.
*/
(void)buffer;
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
for (i_y = 0; i_y < output_y; i_y++)
{
for (i_x = 0; i_x < output_x; i_x++)
{
for (i_ch_in = 0; i_ch_in < ch_src; i_ch_in++)
{
int sum = 0;
int count = 0;
for (k_y = i_y * stride_y - pad_y; k_y < i_y * stride_y - pad_y + kernel_y; k_y++)
{
for (k_x = i_x * stride_x - pad_x; k_x < i_x * stride_x - pad_x + kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < input_y && k_x < input_x)
{
sum += src[i_ch_in + ch_src * (k_x + k_y * input_x)];
count++;
}
}
}
// Prevent static code issue DIVIDE_BY_ZERO.
if (count == 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count;
sum = MAX(sum, act_min);
sum = MIN(sum, act_max);
dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum;
}
}
}
#endif
return ARM_MATH_SUCCESS;
}
#endif /* ARM_MATH_MVEI */
int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src)
{
(void)output_x;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
return (ch_src * sizeof(int32_t));
#else
(void)ch_src;
return 0;
#endif
}
/**
* @} end of Pooling group
*/
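/*
 * Usage sketch (not part of the original file): one way a caller could wire
 * up arm_avgpool_s8. All dimension values below are hypothetical; on
 * DSP-extension targets ctx.buf must provide at least
 * arm_avgpool_s8_get_buffer_size() bytes of scratch.
 */
static arm_status example_avgpool_s8_usage(const q7_t *input, q7_t *output)
{
    /* 1x4x4x8 input pooled with a 2x2 kernel at stride 2 -> 1x2x2x8 output. */
    const cmsis_nn_dims input_dims = {.n = 1, .h = 4, .w = 4, .c = 8};
    const cmsis_nn_dims filter_dims = {.h = 2, .w = 2};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 2, .w = 2, .c = 8};
    const cmsis_nn_pool_params pool_params = {.stride = {.w = 2, .h = 2},
                                              .padding = {.w = 0, .h = 0},
                                              .activation = {.min = -128, .max = 127}};
    static int32_t scratch[8]; /* 8 channels -> 8 q31 accumulators on DSP targets */
    const cmsis_nn_context ctx = {.buf = scratch,
                                  .size = arm_avgpool_s8_get_buffer_size(output_dims.w, input_dims.c)};
    return arm_avgpool_s8(&ctx, &pool_params, &input_dims, input, &filter_dims, &output_dims, output);
}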

View File

@@ -0,0 +1,180 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_max_pool_s16.c
* Description: Pooling function implementations
*
* $Date: 24. January 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, int32_t length)
{
q15_t *dst = base;
const q15_t *src = target;
union arm_nnword ref_max;
union arm_nnword comp_max;
int32_t cnt = length >> 1;
while (cnt > 0l)
{
ref_max.word = arm_nn_read_q15x2(dst);
comp_max.word = arm_nn_read_q15x2_ia(&src);
if (comp_max.half_words[0] > ref_max.half_words[0])
{
ref_max.half_words[0] = comp_max.half_words[0];
}
if (comp_max.half_words[1] > ref_max.half_words[1])
{
ref_max.half_words[1] = comp_max.half_words[1];
}
arm_nn_write_q15x2_ia(&dst, ref_max.word);
cnt--;
}
if (length & 0x1)
{
if (*src > *dst)
{
*dst = *src;
}
}
}
static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, const int16_t act_max)
{
union arm_nnword in;
int32_t cnt = length >> 1;
while (cnt > 0l)
{
in.word = arm_nn_read_q15x2(source);
in.half_words[0] = MAX(in.half_words[0], act_min);
in.half_words[0] = MIN(in.half_words[0], act_max);
in.half_words[1] = MAX(in.half_words[1], act_min);
in.half_words[1] = MIN(in.half_words[1], act_max);
arm_nn_write_q15x2_ia(&source, in.word);
cnt--;
}
if (length & 0x1)
{
int16_t comp = *source;
comp = MAX(comp, act_min);
comp = MIN(comp, act_max);
*source = comp;
}
}
/**
* @ingroup groupNN
*/
/**
* @addtogroup Pooling
* @{
*/
/*
* Optimized s16 max pooling function
*
* Refer to header file for details.
*
*/
arm_status arm_max_pool_s16(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const int16_t *src,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
int16_t *dst)
{
const int32_t input_y = input_dims->h;
const int32_t input_x = input_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_x = output_dims->w;
const int32_t stride_y = pool_params->stride.h;
const int32_t stride_x = pool_params->stride.w;
const int32_t kernel_y = filter_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t pad_y = pool_params->padding.h;
const int32_t pad_x = pool_params->padding.w;
const int16_t act_min = pool_params->activation.min;
const int16_t act_max = pool_params->activation.max;
const int32_t channel_in = input_dims->c;
(void)ctx;
int16_t *dst_base = dst;
for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
{
for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
{
/* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
const int32_t ker_y_start = MAX(0, -base_idx_y);
const int32_t ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y);
const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x);
int count = 0;
for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++)
{
for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++)
{
const int16_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x);
if (count == 0)
{
memcpy(dst, start, channel_in * sizeof(int16_t));
count++;
}
else
{
compare_and_replace_if_larger(dst, start, channel_in);
}
}
}
/* 'count' is expected to be non-zero here. */
dst += channel_in;
}
}
clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max);
return ARM_MATH_SUCCESS;
}
/**
* @} end of Pooling group
*/
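/*
 * Usage sketch (not part of the original file): arm_max_pool_s16 needs no
 * scratch buffer (ctx is unused), so a caller only fills in the dimension
 * structs. The values below are hypothetical.
 */
static arm_status example_max_pool_s16_usage(const int16_t *input, int16_t *output)
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 4, .w = 4, .c = 2};
    const cmsis_nn_dims filter_dims = {.h = 2, .w = 2};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 2, .w = 2, .c = 2};
    const cmsis_nn_pool_params pool_params = {.stride = {.w = 2, .h = 2},
                                              .padding = {.w = 0, .h = 0},
                                              .activation = {.min = -32768, .max = 32767}};
    const cmsis_nn_context ctx = {.buf = NULL, .size = 0};
    return arm_max_pool_s16(&ctx, &pool_params, &input_dims, input, &filter_dims, &output_dims, output);
}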

View File

@@ -0,0 +1,229 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_max_pool_s8.c
* Description: Pooling function implementations
*
* $Date: 20. July 2021
* $Revision: V.2.0.3
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length)
{
#if defined(ARM_MATH_MVEI)
int32_t loop_count = (length + 15) / 16;
for (int i = 0; i < loop_count; i++)
{
mve_pred16_t p = vctp8q((uint32_t)length);
const int8x16_t op_1 = vldrbq_z_s8(base, p);
const int8x16_t op_2 = vldrbq_z_s8(target, p);
const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p);
vstrbq_p_s8(base, max, p);
base += 16;
target += 16;
length -= 16;
}
#else
q7_t *dst = base;
const q7_t *src = target;
union arm_nnword ref_max;
union arm_nnword comp_max;
int32_t cnt = length >> 2;
while (cnt > 0l)
{
ref_max.word = arm_nn_read_q7x4(dst);
comp_max.word = arm_nn_read_q7x4_ia(&src);
if (comp_max.bytes[0] > ref_max.bytes[0])
{
ref_max.bytes[0] = comp_max.bytes[0];
}
if (comp_max.bytes[1] > ref_max.bytes[1])
{
ref_max.bytes[1] = comp_max.bytes[1];
}
if (comp_max.bytes[2] > ref_max.bytes[2])
{
ref_max.bytes[2] = comp_max.bytes[2];
}
if (comp_max.bytes[3] > ref_max.bytes[3])
{
ref_max.bytes[3] = comp_max.bytes[3];
}
arm_nn_write_q7x4_ia(&dst, ref_max.word);
cnt--;
}
cnt = length & 0x3;
while (cnt > 0l)
{
if (*src > *dst)
{
*dst = *src;
}
dst++;
src++;
cnt--;
}
#endif
}
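/*
 * Note on the MVE path above (added for clarity): vctp8q(length) builds a
 * lane predicate from the remaining element count, so the final iteration
 * transparently handles a partial vector. E.g. with length = 20, the first
 * iteration processes all 16 lanes and the second only the remaining 4.
 */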
static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max)
{
#if defined(ARM_MATH_MVEI)
int32_t loop_count = (length + 15) / 16;
for (int i = 0; i < loop_count; i++)
{
mve_pred16_t p = vctp8q((uint32_t)length);
length -= 16;
const int8x16_t src = vldrbq_z_s8(source, p);
const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p);
const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p);
int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p);
res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p);
vstrbq_p_s8(source, res, p);
source += 16;
}
#else
union arm_nnword in;
int32_t cnt = length >> 2;
while (cnt > 0l)
{
in.word = arm_nn_read_q7x4(source);
in.bytes[0] = MAX(in.bytes[0], act_min);
in.bytes[0] = MIN(in.bytes[0], act_max);
in.bytes[1] = MAX(in.bytes[1], act_min);
in.bytes[1] = MIN(in.bytes[1], act_max);
in.bytes[2] = MAX(in.bytes[2], act_min);
in.bytes[2] = MIN(in.bytes[2], act_max);
in.bytes[3] = MAX(in.bytes[3], act_min);
in.bytes[3] = MIN(in.bytes[3], act_max);
arm_nn_write_q7x4_ia(&source, in.word);
cnt--;
}
cnt = length & 0x3;
while (cnt > 0l)
{
int32_t comp = *source;
comp = MAX(comp, act_min);
comp = MIN(comp, act_max);
*source++ = (int8_t)comp;
cnt--;
}
#endif
}
/**
* @ingroup groupNN
*/
/**
* @addtogroup Pooling
* @{
*/
/*
* Optimized s8 max pooling function
*
* Refer to header file for details.
*
*/
arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const q7_t *src,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
q7_t *dst)
{
const int32_t input_y = input_dims->h;
const int32_t input_x = input_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_x = output_dims->w;
const int32_t stride_y = pool_params->stride.h;
const int32_t stride_x = pool_params->stride.w;
const int32_t kernel_y = filter_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t pad_y = pool_params->padding.h;
const int32_t pad_x = pool_params->padding.w;
const int32_t act_min = pool_params->activation.min;
const int32_t act_max = pool_params->activation.max;
const int32_t channel_in = input_dims->c;
(void)ctx;
q7_t *dst_base = dst;
for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
{
for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
{
/* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
const int32_t ker_y_start = MAX(0, -base_idx_y);
const int32_t ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y);
const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x);
int count = 0;
for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++)
{
for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++)
{
const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x);
if (count == 0)
{
arm_memcpy_q7(dst, start, channel_in);
count++;
}
else
{
compare_and_replace_if_larger_q7(dst, start, channel_in);
}
}
}
/* 'count' is expected to be non-zero here. */
dst += channel_in;
}
}
clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max);
return ARM_MATH_SUCCESS;
}
/**
* @} end of Pooling group
*/

View File

@@ -0,0 +1,464 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_pool_q7_HWC.c
* Description: Pooling function implementations
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/**
* @brief A few utility functions used by pooling functions
*
*
*/
static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale)
{
int i;
for (i = 0; i < length; i++)
{
target[i] = (q7_t)(buffer[i] / scale);
}
}
static void compare_and_replace_if_larger_q7(q7_t *base, // base data
const q7_t *target, // compare target
const uint16_t length // data length
)
{
q7_t *pIn = base;
const q7_t *pCom = target;
union arm_nnword in;
union arm_nnword com;
uint16_t cnt = length >> 2;
while (cnt > 0u)
{
in.word = arm_nn_read_q7x4((const q7_t *)pIn);
com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom);
        // if-based byte-wise max selection
if (com.bytes[0] > in.bytes[0])
in.bytes[0] = com.bytes[0];
if (com.bytes[1] > in.bytes[1])
in.bytes[1] = com.bytes[1];
if (com.bytes[2] > in.bytes[2])
in.bytes[2] = com.bytes[2];
if (com.bytes[3] > in.bytes[3])
in.bytes[3] = com.bytes[3];
arm_nn_write_q7x4_ia(&pIn, in.word);
cnt--;
}
cnt = length & 0x3;
while (cnt > 0u)
{
if (*pCom > *pIn)
{
*pIn = *pCom;
}
pIn++;
pCom++;
cnt--;
}
}
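/*
 * The helper below widens four q7 values to q15 and accumulates them with
 * saturation: __SXTB16 sign-extends bytes 0 and 2 of a word into two
 * halfwords, __SXTB16(__ROR(value, 8)) does the same for bytes 1 and 3, and
 * __PKHBT/__PKHTB re-interleave the halfwords into their original order
 * before the dual 16-bit saturating add (__QADD16).
 */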
static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length)
{
q15_t *pCnt = base;
q7_t *pV = target;
q31_t v1, v2, vo1, vo2;
uint16_t cnt = length >> 2;
q31_t in;
while (cnt > 0u)
{
q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV);
v1 = __SXTB16(__ROR(value, 8));
v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN
vo2 = __PKHTB(v1, v2, 16);
vo1 = __PKHBT(v2, v1, 16);
#else
vo1 = __PKHTB(v1, v2, 16);
vo2 = __PKHBT(v2, v1, 16);
#endif
in = arm_nn_read_q15x2(pCnt);
arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));
in = arm_nn_read_q15x2(pCnt);
arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
cnt--;
}
cnt = length & 0x3;
while (cnt > 0u)
{
*pCnt++ += *pV++;
cnt--;
}
}
#endif // ARM_MATH_DSP
/**
* @ingroup groupNN
*/
/**
* @addtogroup Pooling
* @{
*/
/**
* @brief Q7 max pooling function
* @param[in, out] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA Not used
* @param[in,out] Im_out pointer to output tensor
*
* @details
*
* The pooling function is implemented as split x-pooling then
* y-pooling.
*
* This pooling function is input-destructive. Input data is undefined
* after calling this function.
*
*/
void arm_maxpool_q7_HWC(q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t *bufferA,
q7_t *Im_out)
{
(void)bufferA;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_x, i_y;
/* first does the pooling along x axis */
for (i_y = 0; i_y < dim_im_in; i_y++)
{
for (i_x = 0; i_x < dim_im_out; i_x++)
{
/* for each output pixel */
q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
q7_t *win_start;
q7_t *win_stop;
if (i_x * stride - padding < 0)
{
win_start = target;
}
else
{
win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
}
if (i_x * stride - padding + dim_kernel >= dim_im_in)
{
win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
}
else
{
win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
}
/* first step is to copy over initial data */
/* arm_copy_q7(win_start, target, ch_im_in); */
memmove(target, win_start, ch_im_in);
/* start the max operation from the second part */
win_start += ch_im_in;
for (; win_start < win_stop; win_start += ch_im_in)
{
compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
}
}
}
/* then does the pooling along y axis */
for (i_y = 0; i_y < dim_im_out; i_y++)
{
/* for each output row */
q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
q7_t *row_start;
q7_t *row_end;
/* setting the starting row */
if (i_y * stride - padding < 0)
{
row_start = Im_in;
}
else
{
row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
}
/* setting the stopping row */
if (i_y * stride - padding + dim_kernel >= dim_im_in)
{
row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
}
else
{
row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
}
/* copy over the first row */
/* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
memmove(target, row_start, dim_im_out * ch_im_in);
/* move over to next row */
row_start += ch_im_in * dim_im_in;
for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
{
compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for (i_y = 0; i_y < dim_im_out; i_y++)
{
for (i_x = 0; i_x < dim_im_out; i_x++)
{
int max = -129;
for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
{
for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
{
if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
{
max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
}
}
}
}
Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
}
}
}
#endif /* ARM_MATH_DSP */
}
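/*
 * Usage sketch (not part of the original file): because the in-place x/y
 * pooling above is input-destructive, a caller that still needs the input
 * afterwards must pool from a scratch copy. Dimension values are
 * hypothetical.
 */
static void example_maxpool_q7_usage(q7_t *input_copy, q7_t *output)
{
    /* input_copy holds a disposable copy of an 8x8x4 HWC input tensor. */
    arm_maxpool_q7_HWC(input_copy,
                       8,    /* dim_im_in */
                       4,    /* ch_im_in */
                       2,    /* dim_kernel */
                       0,    /* padding */
                       2,    /* stride */
                       4,    /* dim_im_out */
                       NULL, /* bufferA is not used */
                       output);
}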
/**
* @brief Q7 average pooling function
* @param[in,out] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] Im_out pointer to output tensor
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*dim_im_out*ch_im_in
*
* The pooling function is implemented as split x-pooling then
* y-pooling.
*
* This pooling function is input-destructive. Input data is undefined
* after calling this function.
*
*/
void arm_avepool_q7_HWC(q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t *bufferA,
q7_t *Im_out)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
q15_t *buffer = (q15_t *)bufferA;
int16_t i_x, i_y;
int16_t count = 0;
/* first does the pooling along x axis */
for (i_y = 0; i_y < dim_im_in; i_y++)
{
for (i_x = 0; i_x < dim_im_out; i_x++)
{
/* for each output pixel */
q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
q7_t *win_start;
q7_t *win_stop;
if (i_x * stride - padding < 0)
{
win_start = target;
}
else
{
win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
}
if (i_x * stride - padding + dim_kernel >= dim_im_in)
{
win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
}
else
{
win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
}
/* first step is to copy over initial data */
arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
count = 1;
            /* start the accumulation from the second part */
win_start += ch_im_in;
for (; win_start < win_stop; win_start += ch_im_in)
{
accumulate_q7_to_q15(buffer, win_start, ch_im_in);
count++;
}
buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
}
}
/* then does the pooling along y axis */
for (i_y = 0; i_y < dim_im_out; i_y++)
{
/* for each output row */
q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
q7_t *row_start;
q7_t *row_end;
/* setting the starting row */
if (i_y * stride - padding < 0)
{
row_start = Im_in;
}
else
{
row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
}
/* setting the stopping row */
if (i_y * stride - padding + dim_kernel >= dim_im_in)
{
row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
}
else
{
row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
}
/* copy over the first row */
arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
count = 1;
/* move over to next row */
row_start += ch_im_in * dim_im_in;
for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
{
accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
count++;
}
buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)bufferA;
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for (i_y = 0; i_y < dim_im_out; i_y++)
{
for (i_x = 0; i_x < dim_im_out; i_x++)
{
int sum = 0;
int count = 0;
for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
{
for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
{
sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
count++;
}
}
}
Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
}
}
}
#endif /* ARM_MATH_DSP */
}
/**
* @} end of Pooling group
*/
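/*
 * Usage sketch (not part of the original file): on DSP-extension targets
 * arm_avepool_q7_HWC needs bufferA of 2 * dim_im_out * ch_im_in bytes
 * (dim_im_out * ch_im_in q15 accumulators), as documented above. Values
 * are hypothetical.
 */
static void example_avepool_q7_usage(q7_t *input_copy, q7_t *output)
{
    static q7_t buffer_a[2 * 4 * 4]; /* 2 * dim_im_out * ch_im_in */
    arm_avepool_q7_HWC(input_copy, 8, 4, 2, 0, 2, 4, buffer_a, output);
}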

View File

@@ -0,0 +1,20 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_*.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@@ -0,0 +1,57 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_reshape_s8.c
* Description: Reshape a s8 vector
*
* $Date: September 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Reshape
* @{
*/
/**
* Basic s8 reshape function.
*
 * Refer to the header file for details.
*
*/
void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size)
{
arm_memcpy_q7(output, input, total_size);
}
/**
* @} end of Reshape group
*/

View File

@@ -0,0 +1,20 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@@ -0,0 +1,271 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_svdf_s8.c
* Description: S8 basic SVDF layer function
*
* $Date: 28 April 2022
* $Revision: V.3.0.1
*
* Target Processor: Cortex-M processors
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup SVDF
* @{
*/
/*
* S8 SVDF layer function for TensorFlow Lite with 8 bit state tensor
*
* Refer to header file for details.
*
*/
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
const cmsis_nn_per_tensor_quant_params *output_quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *state_dims,
q7_t *state_data,
const cmsis_nn_dims *weights_feature_dims,
const q7_t *weights_feature_data,
const cmsis_nn_dims *weights_time_dims,
const q7_t *weights_time_data,
const cmsis_nn_dims *bias_dims,
const q31_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
(void)state_dims;
(void)output_dims;
const q31_t multiplier_in = input_quant_params->multiplier;
const q31_t shift_in = input_quant_params->shift;
const q31_t multiplier_out = output_quant_params->multiplier;
const q31_t shift_2 = output_quant_params->shift;
const int32_t zp_in = svdf_params->input_offset;
const int32_t zp_out = svdf_params->output_offset;
const int32_t in_activation_min = svdf_params->input_activation.min;
const int32_t in_activation_max = svdf_params->input_activation.max;
const int32_t out_activation_min = svdf_params->output_activation.min;
const int32_t out_activation_max = svdf_params->output_activation.max;
const int16_t rank = svdf_params->rank;
const int32_t input_batches = input_dims->n;
const int32_t input_height = input_dims->h;
const int32_t feature_batches = weights_feature_dims->n;
const int32_t time_batches = weights_time_dims->h;
const int32_t unit_count = feature_batches / rank;
if (input_ctx->buf == NULL)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q31_t *buffer_a = (q31_t *)input_ctx->buf;
if (output_ctx->buf == NULL)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q31_t *buffer_b = (q31_t *)output_ctx->buf;
// Left shift state
memmove((int8_t *)state_data,
(int8_t *)state_data + 1,
(size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int8_t)));
// Matrix multiplication input * feature weight
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
q7_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
const q7_t *weight = weights_feature_data;
const q7_t *input = input_data + i_batch * input_height;
arm_status res = arm_nn_vec_mat_mult_t_s8(input,
weight,
NULL,
res_ptr,
-zp_in,
0,
0,
multiplier_in,
shift_in,
input_height,
feature_batches,
in_activation_min,
in_activation_max,
time_batches);
if (res != ARM_MATH_SUCCESS)
{
return res;
}
}
    // Matrix multiplication time weight * state tensors
{
q31_t *ptr_a = buffer_a;
const int8_t *v2 = state_data;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
const int8_t *v1 = weights_time_data;
for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
{
*ptr_a = 0;
int32_t sum = 0;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
// Perform matrix multiplication in blocks of four
int j = 0;
int32_t block_count = time_batches >> 2;
for (int i = 0; i < block_count; i++)
{
j += 4;
q31_t r1_1, r1_2, r2_1, r2_2;
v1 = read_and_pad_reordered(v1, &r1_1, &r1_2);
v2 = read_and_pad_reordered(v2, &r2_1, &r2_2);
sum = __SMLAD(r1_1, r2_1, sum);
sum = __SMLAD(r1_2, r2_2, sum);
}
// Process the remaining data
for (; j < time_batches; j++)
{
sum += *v1 * *v2;
v1++;
v2++;
}
#else
for (int j = 0; j < time_batches; j++)
{
sum += *v1 * *v2;
v1++;
v2++;
}
#endif
*ptr_a = sum;
ptr_a++;
}
}
}
if (bias_data)
{
if (unit_count == feature_batches)
{
for (int i = 0; i < input_batches; i++)
{
q31_t *output_temp = buffer_b + i * feature_batches;
const q31_t *ptr_a = buffer_a + i * feature_batches;
const int32_t *bi = bias_data;
for (int j = 0; j < feature_batches; j++)
{
output_temp[j] = ptr_a[j] + bi[j];
}
}
}
else
{
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
q31_t *output_data_temp = buffer_b + i_batch * unit_count;
q31_t *ptr_a = buffer_a + i_batch * feature_batches;
for (int i = 0; i < unit_count; i++)
{
int32_t sum = bias_data[i];
for (int j = 0; j < rank; j++)
{
sum += *ptr_a;
ptr_a++;
}
output_data_temp[i] = sum;
}
}
}
}
else
{
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
q31_t *output_data_temp = buffer_b + i_batch * unit_count;
q31_t *ptr_a = buffer_a + i_batch * feature_batches;
for (int i = 0; i < unit_count; i++)
{
int32_t sum = 0;
for (int j = 0; j < rank; j++)
{
sum += *ptr_a;
ptr_a++;
}
output_data_temp[i] = sum;
}
}
}
#if defined(ARM_MATH_MVEI)
int32_t num_elements = input_batches * unit_count;
const int32_t loop_count = (num_elements + 3) / 4;
for (int i_op = 0; i_op < loop_count; i_op++)
{
mve_pred16_t p = vctp32q((uint32_t)num_elements);
int32x4_t op = vldrwq_z_s32(buffer_b, p);
op = arm_requantize_mve(op, multiplier_out, shift_2);
op = vaddq_n_s32(op, zp_out);
const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
op = vmaxq_s32(op, min_vec);
op = vminq_s32(op, max_vec);
vstrbq_p_s32(output_data, op, p);
output_data += 4;
buffer_b += 4;
num_elements -= 4;
}
#else
for (int i = 0; i < input_batches * unit_count; i++)
{
output_data[i] = (q7_t)CLAMP(
arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);
}
#endif
return (ARM_MATH_SUCCESS);
}
/**
* @} end of SVDF group
*/
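/*
 * Scratch sizing note (added for clarity, derived from the indexing above):
 * input_ctx->buf (buffer_a) holds one q31 accumulator per input batch and
 * feature batch, i.e. input_batches * feature_batches * sizeof(q31_t) bytes.
 * output_ctx->buf (buffer_b) is indexed the same way in the worst case
 * (unit_count == feature_batches), so the same size is a safe upper bound.
 */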

View File

@@ -0,0 +1,267 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title:        arm_svdf_state_s16_s8.c
* Description: S8 basic SVDF layer function with s16 state tensor
*
* $Date: 28 April 2022
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M processors
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup SVDF
* @{
*/
/*
* S8 SVDF layer function for TensorFlow Lite with 16 bit state tensor
*
* Refer to header file for details.
*
*/
arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
const cmsis_nn_per_tensor_quant_params *output_quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *state_dims,
q15_t *state_data,
const cmsis_nn_dims *weights_feature_dims,
const q7_t *weights_feature_data,
const cmsis_nn_dims *weights_time_dims,
const q15_t *weights_time_data,
const cmsis_nn_dims *bias_dims,
const q31_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
(void)state_dims;
(void)output_dims;
const q31_t multiplier_in = input_quant_params->multiplier;
const q31_t shift_in = input_quant_params->shift;
const q31_t multiplier_out = output_quant_params->multiplier;
const q31_t shift_2 = output_quant_params->shift;
const int32_t zp_in = svdf_params->input_offset;
const int32_t zp_out = svdf_params->output_offset;
const int32_t in_activation_min = svdf_params->input_activation.min;
const int32_t in_activation_max = svdf_params->input_activation.max;
const int32_t out_activation_min = svdf_params->output_activation.min;
const int32_t out_activation_max = svdf_params->output_activation.max;
const int16_t rank = svdf_params->rank;
const int32_t input_batches = input_dims->n;
const int32_t input_height = input_dims->h;
const int32_t feature_batches = weights_feature_dims->n;
const int32_t time_batches = weights_time_dims->h;
const int32_t unit_count = feature_batches / rank;
if (input_ctx->buf == NULL)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q31_t *buffer_a = (q31_t *)input_ctx->buf;
if (output_ctx->buf == NULL)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q31_t *buffer_b = (q31_t *)output_ctx->buf;
// Left shift state
memmove((q15_t *)state_data,
(q15_t *)state_data + 1,
(size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t)));
// Matrix multiplication input * feature weight
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
const q7_t *weight = weights_feature_data;
const q7_t *input = input_data + i_batch * input_height;
arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
weight,
res_ptr,
-zp_in,
0,
time_batches,
multiplier_in,
shift_in,
input_height,
feature_batches,
in_activation_min,
in_activation_max);
if (res != ARM_MATH_SUCCESS)
{
return res;
}
}
{
// Matrix multiplication time weight * state tensors
q31_t *ptr_a = buffer_a;
const q15_t *v2 = state_data;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
const q15_t *v1 = weights_time_data;
for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
{
*ptr_a = 0;
int32_t sum = 0;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
// Perform matrix multiplication in blocks of two
int j = 0;
int32_t block_count = time_batches >> 1;
for (int i = 0; i < block_count; i++)
{
j += 2;
q31_t r1 = arm_nn_read_q15x2_ia(&v1);
q31_t r2 = arm_nn_read_q15x2_ia(&v2);
sum = __SMLAD(r1, r2, sum);
}
// Process the remaining data
for (; j < time_batches; j++)
{
sum += *v1 * *v2;
v1++;
v2++;
}
#else
for (int j = 0; j < time_batches; j++)
{
sum += *v1 * *v2;
v1++;
v2++;
}
#endif
*ptr_a = sum;
ptr_a++;
}
}
}
if (bias_data)
{
if (unit_count == feature_batches)
{
for (int i = 0; i < input_batches; i++)
{
q31_t *output_temp = buffer_b + i * feature_batches;
const q31_t *ptr_a = buffer_a + i * feature_batches;
const int32_t *bi = bias_data;
for (int j = 0; j < feature_batches; j++)
{
output_temp[j] = ptr_a[j] + bi[j];
}
}
}
else
{
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
q31_t *output_data_temp = buffer_b + i_batch * unit_count;
q31_t *ptr_a = buffer_a + i_batch * feature_batches;
for (int i = 0; i < unit_count; i++)
{
int32_t sum = bias_data[i];
for (int j = 0; j < rank; j++)
{
sum += *ptr_a;
ptr_a++;
}
output_data_temp[i] = sum;
}
}
}
}
else
{
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
q31_t *output_data_temp = buffer_b + i_batch * unit_count;
q31_t *ptr_a = buffer_a + i_batch * feature_batches;
for (int i = 0; i < unit_count; i++)
{
int32_t sum = 0;
for (int j = 0; j < rank; j++)
{
sum += *ptr_a;
ptr_a++;
}
output_data_temp[i] = sum;
}
}
}
#if defined(ARM_MATH_MVEI)
int32_t num_elements = input_batches * unit_count;
const int32_t loop_count = (num_elements + 3) / 4;
for (int i_op = 0; i_op < loop_count; i_op++)
{
mve_pred16_t p = vctp32q((uint32_t)num_elements);
int32x4_t op = vldrwq_z_s32(buffer_b, p);
op = arm_requantize_mve(op, multiplier_out, shift_2);
op = vaddq_n_s32(op, zp_out);
const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
op = vmaxq_s32(op, min_vec);
op = vminq_s32(op, max_vec);
vstrbq_p_s32(output_data, op, p);
output_data += 4;
buffer_b += 4;
num_elements -= 4;
}
#else
for (int i = 0; i < input_batches * unit_count; i++)
{
output_data[i] = (q7_t)CLAMP(
arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);
}
#endif
return (ARM_MATH_SUCCESS);
}
/**
* @} end of SVDF group
*/
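/*
 * State layout note (added for clarity): state_data is laid out as
 * [input_batches][feature_batches][time_batches]. The memmove above shifts
 * every time series left by one sample, and the per-batch matrix
 * multiplication then writes the newest sample at offset (time_batches - 1)
 * of each series.
 */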

View File

@@ -0,0 +1,22 @@
#
# Copyright (c) 2019-2022 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8.c")
target_sources(cmsis-nn PRIVATE ${SRC} arm_softmax_s8_s16.c
arm_softmax_s16.c
arm_nn_softmax_common_s8.c)

View File

@@ -0,0 +1,141 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_softmax_common_s8.c
* Description: Softmax with s8 input and output of s8 or s16.
*
* $Date: 17 March 2022
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M processors
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
#define ACCUM_BITS 12
/**
* @ingroup groupSupport
*/
/**
* @addtogroup Softmax
* @{
*/
/*
* Softmax function with s8 input and output of s8 or s16.
*
 * Refer to the header file for details.
*
*/
void arm_nn_softmax_common_s8(const int8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
const bool int16_output,
void *output)
{
const int32_t mask = (1 << shift);
int32_t col = 0;
int32_t row_idx;
for (row_idx = 0; row_idx < num_rows; ++row_idx)
{
// Find the maximum value in order to ensure numerical stability
int8_t max = *input;
for (col = 1; col < row_size; ++col)
{
max = MAX(max, input[col]);
}
int32_t diff = 0;
int32_t sum = 0;
for (col = 0; col < row_size; ++col)
{
diff = input[col] - max;
if (diff >= diff_min)
{
sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS);
}
}
const int32_t headroom = __CLZ(sum);
const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (1 << 31));
int32_t bits_over_unit;
if (int16_output)
{
int16_t *output_s16 = (int16_t *)output + row_idx * row_size;
bits_over_unit = ACCUM_BITS - headroom + 15;
for (col = 0; col < row_size; ++col)
{
diff = input[col] - max;
if (diff >= diff_min)
{
const int32_t res =
DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) +
NN_Q15_MIN;
output_s16[col] = (int16_t)CLAMP(res, (int32_t)NN_Q15_MAX, (int32_t)NN_Q15_MIN);
}
else
{
output_s16[col] = NN_Q15_MIN;
}
}
}
else
{
int8_t *output_s8 = (int8_t *)output + row_idx * row_size;
bits_over_unit = ACCUM_BITS - headroom + 23;
for (col = 0; col < row_size; ++col)
{
diff = input[col] - max;
if (diff >= diff_min)
{
const int32_t res =
DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) +
NN_Q7_MIN;
output_s8[col] = (int8_t)CLAMP(res, (int32_t)NN_Q7_MAX, (int32_t)NN_Q7_MIN);
}
else
{
output_s8[col] = NN_Q7_MIN;
}
}
}
input += row_size;
}
}
/**
 * @} end of Softmax group
*/

View File

@@ -0,0 +1,118 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_softmax_q15.c
* Description: Q15 softmax function
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Softmax
* @{
*/
/**
* @brief Q15 softmax function
* @param[in] vec_in pointer to input vector
 * @param[in] dim_vec input vector dimension
* @param[out] p_out pointer to output vector
*
* @details
*
 * Here, instead of the typical e-based softmax, we use a
 * 2-based softmax, i.e.:
 *
 *  y_i = 2^(x_i) / sum(2^x_j)
 *
 * The relative outputs differ from an e-based softmax,
 * but mathematically the gradient is the same,
 * up to a log(2) scaling factor.
*
*/
void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out)
{
q31_t sum;
int16_t i;
uint8_t shift;
q31_t base;
base = -1 * 0x100000;
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
base = vec_in[i];
}
}
    /* We ignore really small values:
     * they will be 0 after shrinking
     * to q15_t anyway.
     */
base = base - 16;
sum = 0;
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
shift = (uint8_t)__USAT(vec_in[i] - base, 5);
sum += 0x1 << shift;
}
}
/* This is effectively (0x1 << 32) / sum */
int64_t div_base = 0x100000000LL;
int output_base = (int32_t)(div_base / sum);
/* Final confidence will be output_base >> ( 17 - (vec_in[i] - base) )
* so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16
* and vec_in[i]-base = 16
*/
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
            /* Here the minimum value of 17 + base - vec_in[i] will be 1 */
shift = (uint8_t)__USAT(17 + base - vec_in[i], 5);
p_out[i] = (q15_t)__SSAT((output_base >> shift), 16);
}
else
{
p_out[i] = 0;
}
}
}
/**
* @} end of Softmax group
*/
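/*
 * Worked example (added for clarity): suppose the maximum element satisfies
 * vec_in[i] - base = 16 after the base = max - 16 adjustment. Its shift is
 * __USAT(16, 5) = 16, contributing 0x1 << 16 to sum. If it is the only
 * significant element, output_base = (0x1 << 32) / (0x1 << 16) = 0x1 << 16,
 * and its output is __SSAT(output_base >> (17 - 16), 16) = __SSAT(0x8000, 16)
 * = 0x7FFF, i.e. ~100% confidence, matching the comment above.
 */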

View File

@@ -0,0 +1,107 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_softmax_q7.c
* Description: Q7 softmax function
*
* $Date: 09. October 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Softmax
* @{
*/
/**
* @brief Q7 softmax function
* @param[in] vec_in pointer to input vector
 * @param[in] dim_vec input vector dimension
* @param[out] p_out pointer to output vector
*
* @details
*
 * Here, instead of the typical natural-logarithm (e-based) softmax, we use
 * a 2-based softmax, i.e.:
 *
 *  y_i = 2^(x_i) / sum(2^x_j)
 *
 * The relative outputs differ from an e-based softmax,
 * but mathematically the gradient is the same,
 * up to a log(2) scaling factor.
*
*/
void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out)
{
q31_t sum;
int16_t i;
uint8_t shift;
q15_t base;
base = -128;
/* We first search for the maximum */
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
base = vec_in[i];
}
}
    /*
     * The base is set to max - 8, meaning
     * that we ignore really small values:
     * they will be 0 after shrinking to q7_t anyway.
     */
base = base - (1 << 3);
sum = 0;
for (i = 0; i < dim_vec; i++)
{
shift = (uint8_t)__USAT(vec_in[i] - base, 3);
sum += 0x1 << shift;
}
/* This is effectively (0x1 << 20) / sum */
int output_base = (1 << 20) / sum;
for (i = 0; i < dim_vec; i++)
{
        /* Here the minimum value of 13 + base - vec_in[i] will be 5 */
shift = (uint8_t)__USAT(13 + base - vec_in[i], 5);
p_out[i] = (q7_t)__SSAT((output_base >> shift), 8);
}
}
/**
* @} end of Softmax group
*/
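/*
 * Worked example (added for clarity): with base = max - 8, the maximum
 * element has vec_in[i] - base = 8, so shift = __USAT(8, 3) = 7 and it
 * contributes 0x1 << 7 = 128 to sum. If sum == 128, then output_base =
 * (1 << 20) / 128 = 8192, the element's output shift is
 * __USAT(13 + base - vec_in[i], 5) = 5, and p_out[i] =
 * __SSAT(8192 >> 5, 8) = __SSAT(256, 8) = 127, i.e. full confidence.
 */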

View File

@@ -0,0 +1,122 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_softmax_s16.c
* Description: S16 softmax function
*
* $Date: 9 March 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @addtogroup Softmax
* @{
*/
arm_status arm_softmax_s16(const int16_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const cmsis_nn_softmax_lut_s16 *softmax_params,
int16_t *output)
{
int32_t col = 0;
int32_t row_idx;
if (softmax_params->exp_lut == NULL || softmax_params->one_by_one_lut == NULL)
{
return ARM_MATH_ARGUMENT_ERROR;
}
for (row_idx = 0; row_idx < num_rows; ++row_idx)
{
// Find the maximum value in order to ensure numerical stability
int16_t max = *input;
for (col = 1; col < row_size; ++col)
{
max = MAX(max, input[col]);
}
int32_t diff = 0;
int32_t sum = 0;
int16_t *cached_exp_results = output;
for (col = 0; col < row_size; ++col)
{
diff = input[col] - max;
const int32_t scaled_diff = arm_nn_requantize(diff, mult, shift);
const int32_t symmetric_scaled_diff = scaled_diff + NN_Q15_MAX;
const int16_t saturated_symmetric_scaled_diff = MIN(MAX(symmetric_scaled_diff, NN_Q15_MIN), NN_Q15_MAX);
// Lookup from exp table and cache result for next step
const int16_t index = (256 + (saturated_symmetric_scaled_diff >> 7));
const int16_t offset = saturated_symmetric_scaled_diff & 0x7f;
const int16_t base = softmax_params->exp_lut[index];
const int16_t slope = softmax_params->exp_lut[index + 1] - softmax_params->exp_lut[index];
const int16_t delta = (slope * offset + 64) >> 7;
const int16_t result = (base + delta);
cached_exp_results[col] = result;
sum += cached_exp_results[col];
}
const int32_t headroom = __CLZ(sum);
// Compute the reciprocal 1/sum
const int32_t shifted_sum = (((sum) << (headroom - 1)) + (1 << 13)) >> 14;
        // The LUT computes 1/(1 + x), so subtract 1 from the shifted sum (in Q16: -65536).
        // The LUT also expects a symmetric input, so recenter from [UINT16_MIN, UINT16_MAX]
        // to [INT16_MIN, INT16_MAX] (-32768). In total: -65536 - 32768 = -98304.
const int16_t symmetric_shifted_sum = shifted_sum - 98304;
// Lookup from one by one table
const int16_t index = (256 + (symmetric_shifted_sum >> 7));
const int16_t offset = symmetric_shifted_sum & 0x7f;
const int16_t base = softmax_params->one_by_one_lut[index];
const int16_t slope = softmax_params->one_by_one_lut[index + 1] - softmax_params->one_by_one_lut[index];
const int16_t delta = (slope * offset + 64) >> 7;
const int16_t one_by_one_result = (base + delta);
for (col = 0; col < row_size; ++col)
{
const int16_t right_shift = 30 - headroom;
int32_t result = (cached_exp_results[col] * one_by_one_result) >> right_shift;
            result = (result + 1) >> 1; // Final shift with round-to-nearest
output[col] = (int16_t)result;
}
output += row_size;
input += row_size;
}
return ARM_MATH_SUCCESS;
}
/**
* @} end of Softmax group
*/
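/*
 * LUT interpolation sketch (not part of the original file): both lookups
 * above share one pattern, a 513-entry table indexed by the top bits of a
 * q15 value with linear interpolation on the low 7 bits. A standalone
 * version, assuming the same table layout:
 */
static inline int16_t example_lut_interp_q15(const int16_t *lut_513, int16_t x)
{
    const int16_t index = (int16_t)(256 + (x >> 7)); /* maps [-32768, 32767] to [0, 511] */
    const int16_t offset = x & 0x7f;                 /* fractional position between entries */
    const int16_t base = lut_513[index];
    const int16_t slope = (int16_t)(lut_513[index + 1] - lut_513[index]);
    return (int16_t)(base + ((slope * offset + 64) >> 7)); /* round-to-nearest */
}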

View File

@@ -0,0 +1,215 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_softmax_s8.c
* Description: S8 softmax function
*
* $Date: 9 March 2022
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#define ACCUM_BITS 12
#ifdef ARM_MATH_MVEI
static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val)
{
#define SHIFT_START (24)
int32_t shift = SHIFT_START;
int32x4_t mask;
const int32x4_t val_mod_minus_quarter =
vandq_s32(val, vdupq_n_s32((1 << SHIFT_START) - 1)) - vdupq_n_s32(1 << SHIFT_START);
const int32x4_t remainder = vsubq_s32(val_mod_minus_quarter, val);
const int32x4_t x = vaddq_n_s32(val_mod_minus_quarter << 5, 1 << 28);
const int32x4_t x2 = MUL_SAT_MVE(x, x);
const int32x4_t op_1 = DIV_POW2_MVE(MUL_SAT_MVE(x2, x2), 2) + MUL_SAT_MVE(x2, x);
const int32x4_t op_2 = x + DIV_POW2_MVE(MUL_SAT_MVE(op_1, vdupq_n_s32(715827883)) + x2, 1);
int32x4_t result = vdupq_n_s32(1895147668) + MUL_SAT_MVE(vdupq_n_s32(1895147668), op_2);
#define SELECT_IF_NON_ZERO(x) \
{ \
mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \
mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \
result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \
}
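// Each constant below is exp(-2^k) in Q31; the corresponding factor is multiplied in
// when the matching bit of the remainder is set:
// exp(-1/4), exp(-1/2), exp(-1), exp(-2), exp(-4), exp(-8), exp(-16).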
SELECT_IF_NON_ZERO(1672461947)
SELECT_IF_NON_ZERO(1302514674)
SELECT_IF_NON_ZERO(790015084)
SELECT_IF_NON_ZERO(290630308)
SELECT_IF_NON_ZERO(39332535)
SELECT_IF_NON_ZERO(720401)
SELECT_IF_NON_ZERO(242)
#undef SELECT_IF_NON_ZERO
mve_pred16_t p = vcmpeqq_n_s32(val, 0);
mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p);
result = SELECT_USING_MASK(mask, vdupq_n_s32(NN_Q31_MAX), result);
return result;
}
#endif
/**
* @ingroup groupNN
*/
/**
* @addtogroup Softmax
* @{
*/
void arm_softmax_s8(const int8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int8_t *output)
{
#ifdef ARM_MATH_MVEI
#define ACT_MIN ((int8_t)NN_Q7_MIN)
#define ACT_MAX ((int8_t)NN_Q7_MAX)
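// mult and (1 << shift) together form the fixed-point multiplier that rescales the
// max-subtracted difference (roughly input_scale * beta) before the saturating Q31
// multiply below.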
const int32_t mask = (1 << shift);
for (int i_num_rows = 0; i_num_rows < num_rows; ++i_num_rows)
{
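// Pass 1: find the row maximum; the predicated tail load handles rows whose
// length is not a multiple of 16.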
int8_t max = ACT_MIN;
int32_t vec_count = (row_size + 15) / 16;
uint32_t r_count = (uint32_t)row_size;
for (int i = 0; i < vec_count; i++)
{
mve_pred16_t p = vctp8q(r_count);
const int8x16_t ip = vldrbq_z_s8(&input[i * 16], p);
max = vmaxvq_p_s8(max, ip, p);
r_count -= 16;
}
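// Pass 2: accumulate the sum of exponentials of (input - max), skipping lanes
// whose difference falls below diff_min.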
vec_count = row_size / 4;
int32_t idx = 0;
int32_t sum = 0;
while (vec_count)
{
int32x4_t ip = vldrbq_s32(&input[idx * 4]);
ip = vsubq_n_s32(ip, max);
mve_pred16_t p = vcmpgeq_n_s32(ip, diff_min);
if (p != 0)
{
ip = vmulq_n_s32(ip, mask);
int32x4_t res = MUL_SAT_MVE(ip, vdupq_n_s32(mult));
res = arm_exp_on_negative_values_mve_32x4(res);
res = DIV_POW2_MVE(res, ACCUM_BITS);
res = vpselq_s32(res, vdupq_n_s32(0), p);
sum += vaddvq_s32(res);
}
vec_count--;
idx++;
}
const int32_t tail_idx = row_size & ~3;
for (int i = 0; i < (row_size & 3); i++)
{
const int32_t diff = input[tail_idx + i] - max;
if (diff >= diff_min)
{
sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS);
}
}
const int32_t headroom = __CLZ((uint32_t)sum);
const int32_t bits_over_unit = ACCUM_BITS - headroom + 23;
const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (int32_t)(1U << 31));
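// The sum is normalised so its leading one sits at bit 31, and ONE_OVER1 returns
// 1/(1 + x) in Q31 for the fraction x below that bit; bits_over_unit folds the
// remaining normalisation plus the Q31 -> 8-bit output rescale into one right shift.
// Pass 3: recompute each exponential, scale it by 1/sum and quantise to s8
// (output offset -128).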
vec_count = row_size / 4;
idx = 0;
while (vec_count)
{
int32x4_t ip = vldrbq_s32(&input[idx]);
ip = vsubq_n_s32(ip, max);
mve_pred16_t p = vcmpgeq_n_s32(ip, diff_min);
int32x4_t tmp_res;
if (p != 0)
{
ip = vmulq_n_s32(ip, mask);
tmp_res = MUL_SAT_MVE(ip, vdupq_n_s32(mult));
tmp_res = arm_exp_on_negative_values_mve_32x4(tmp_res);
tmp_res = MUL_SAT_MVE(vdupq_n_s32(shifted_scale), tmp_res);
tmp_res = DIV_POW2_MVE(tmp_res, bits_over_unit);
tmp_res += vdupq_n_s32(ACT_MIN);
tmp_res = vmaxq_s32(tmp_res, vdupq_n_s32(ACT_MIN));
tmp_res = vminq_s32(tmp_res, vdupq_n_s32(ACT_MAX));
tmp_res = vpselq_s32(tmp_res, vdupq_n_s32(ACT_MIN), p);
}
else
{
tmp_res = vdupq_n_s32(ACT_MIN);
}
vstrbq_s32(&output[idx], tmp_res);
vec_count--;
idx += 4;
}
for (int i = 0; i < (row_size & 3); i++)
{
int32_t diff = input[tail_idx + i] - max;
if (diff >= diff_min)
{
const int32_t res =
DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) +
NN_Q7_MIN;
output[tail_idx + i] = (int8_t)CLAMP(res, (int32_t)ACT_MAX, (int32_t)ACT_MIN);
}
else
{
output[tail_idx + i] = ACT_MIN;
}
}
input += row_size;
output += row_size;
}
#else
arm_nn_softmax_common_s8(input, num_rows, row_size, mult, shift, diff_min, false, (void *)output);
#endif
}
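/*
 * Usage sketch (illustrative only; mult, shift and diff_min are placeholders that
 * a framework would normally derive offline from the input scale and softmax beta):
 *
 *     int8_t logits[2 * 10]; // two rows of ten logits each
 *     int8_t probs[2 * 10];
 *     arm_softmax_s8(logits, 2, 10, mult, shift, diff_min, probs);
 */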
/**
* @} end of Softmax group
*/

View File

@@ -0,0 +1,55 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title:        arm_softmax_s8_s16.c
 * Description:  s8 to s16 softmax function
*
* $Date: 7 January 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Softmax
* @{
*/
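/*
 * s8-input softmax producing int16 output: it delegates to the common s8 softmax
 * implementation with the int16_output flag set.
 */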
void arm_softmax_s8_s16(const int8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int16_t *output)
{
arm_nn_softmax_common_s8(input, num_rows, row_size, mult, shift, diff_min, true, (void *)output);
}
/**
* @} end of Softmax group
*/

Some files were not shown because too many files have changed in this diff.