init commit

This commit is contained in:
2025-09-05 10:30:26 +02:00
commit c144fcd356
1526 changed files with 1115326 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
cmake_minimum_required (VERSION 3.14)
project(CMSISDSPMatrix)
include(configLib)
include(configDsp)
file(GLOB SRCF64 "./*_f64.c")
file(GLOB SRCF32 "./*_f32.c")
file(GLOB SRCF16 "./*_f16.c")
file(GLOB SRCQ31 "./*_q31.c")
file(GLOB SRCQ15 "./*_q15.c")
file(GLOB SRCQ7 "./*_q7.c")
file(GLOB SRCU32 "./*_u32.c")
file(GLOB SRCU16 "./*_u16.c")
file(GLOB SRCU8 "./*_u8.c")
add_library(CMSISDSPMatrix STATIC ${SRCF64})
target_sources(CMSISDSPMatrix PRIVATE ${SRCF32})
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
target_sources(CMSISDSPMatrix PRIVATE ${SRCF16})
endif()
target_sources(CMSISDSPMatrix PRIVATE ${SRCQ31})
target_sources(CMSISDSPMatrix PRIVATE ${SRCQ15})
target_sources(CMSISDSPMatrix PRIVATE ${SRCQ7})
target_sources(CMSISDSPMatrix PRIVATE ${SRCU32})
target_sources(CMSISDSPMatrix PRIVATE ${SRCU16})
target_sources(CMSISDSPMatrix PRIVATE ${SRCU8})
configLib(CMSISDSPMatrix ${ROOT})
configDsp(CMSISDSPMatrix ${ROOT})
### Includes
target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/Include")
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
target_sources(CMSISDSPMatrix PRIVATE arm_mat_add_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_sub_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_trans_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_scale_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_mult_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_vec_mult_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_cmplx_trans_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_cmplx_mult_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_inverse_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_init_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_cholesky_f16.c)
endif()

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f32.c"
#include "arm_mat_add_q15.c"
#include "arm_mat_add_q31.c"
#include "arm_mat_cmplx_mult_f32.c"
#include "arm_mat_cmplx_mult_q15.c"
#include "arm_mat_cmplx_mult_q31.c"
#include "arm_mat_init_f32.c"
#include "arm_mat_init_q15.c"
#include "arm_mat_init_q31.c"
#include "arm_mat_inverse_f32.c"
#include "arm_mat_inverse_f64.c"
#include "arm_mat_mult_f64.c"
#include "arm_mat_mult_f32.c"
#include "arm_mat_mult_fast_q15.c"
#include "arm_mat_mult_fast_q31.c"
#include "arm_mat_mult_q7.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
#include "arm_mat_mult_opt_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"
#include "arm_mat_sub_f64.c"
#include "arm_mat_sub_f32.c"
#include "arm_mat_sub_q15.c"
#include "arm_mat_sub_q31.c"
#include "arm_mat_trans_f32.c"
#include "arm_mat_trans_f64.c"
#include "arm_mat_trans_q7.c"
#include "arm_mat_trans_q15.c"
#include "arm_mat_trans_q31.c"
#include "arm_mat_vec_mult_f32.c"
#include "arm_mat_vec_mult_q31.c"
#include "arm_mat_vec_mult_q15.c"
#include "arm_mat_vec_mult_q7.c"
#include "arm_mat_cmplx_trans_f32.c"
#include "arm_mat_cmplx_trans_q31.c"
#include "arm_mat_cmplx_trans_q15.c"
#include "arm_mat_cholesky_f64.c"
#include "arm_mat_cholesky_f32.c"
#include "arm_mat_solve_upper_triangular_f32.c"
#include "arm_mat_solve_lower_triangular_f32.c"
#include "arm_mat_solve_upper_triangular_f64.c"
#include "arm_mat_solve_lower_triangular_f64.c"
#include "arm_mat_ldlt_f32.c"
#include "arm_mat_ldlt_f64.c"

View File

@@ -0,0 +1,41 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function f16 source files.
*
* $Date: 18. March 2020
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f16.c"
#include "arm_mat_sub_f16.c"
#include "arm_mat_trans_f16.c"
#include "arm_mat_scale_f16.c"
#include "arm_mat_mult_f16.c"
#include "arm_mat_vec_mult_f16.c"
#include "arm_mat_cmplx_trans_f16.c"
#include "arm_mat_cmplx_mult_f16.c"
#include "arm_mat_inverse_f16.c"
#include "arm_mat_init_f16.c"
#include "arm_mat_cholesky_f16.c"
#include "arm_mat_solve_upper_triangular_f16.c"
#include "arm_mat_solve_lower_triangular_f16.c"

View File

@@ -0,0 +1,217 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_f16.c
* Description: Floating-point matrix addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
arm_status status;
uint32_t numSamples; /* total number of elements in the matrix */
float16_t *pDataA, *pDataB, *pDataDst;
f16x8_t vecA, vecB, vecDst;
float16_t const *pSrcAVec;
float16_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float16_t const *) pDataA;
pSrcBVec = (float16_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
vecDst = vaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
return (status);
}
#else
arm_status arm_mat_add_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
*pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
*pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
*pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixAdd group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,304 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_f32.c
* Description: Floating-point matrix addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixAdd Matrix Addition
Adds two matrices.
\image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
arm_status status;
uint32_t numSamples; /* total number of elements in the matrix */
float32_t *pDataA, *pDataB, *pDataDst;
f32x4_t vecA, vecB, vecDst;
float32_t const *pSrcAVec;
float32_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float32_t const *) pDataA;
pSrcBVec = (float32_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
return (status);
}
#else
#if defined(ARM_MATH_NEON)
/*
Neon version is assuming the matrix is small enough.
So no blocking is used for taking into account cache effects.
For big matrix, there exist better libraries for Neon.
*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
*pOut++ = (*pIn1++) + (*pIn2++);
/* Decrement the loop counter */
blkCnt--;
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixAdd group
*/

View File

@@ -0,0 +1,227 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_q15.c
* Description: Q15 matrix addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Q15 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
uint32_t numSamples; /* total number of elements in the matrix */
q15_t *pDataA, *pDataB, *pDataDst;
q15x8_t vecA, vecB, vecDst;
q15_t const *pSrcAVec;
q15_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q15_t const *) pDataA;
pSrcBVec = (q15_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqaddq(vecA, vecB);
vst1q(pDataDst, vecDst); pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixAdd group
*/

View File

@@ -0,0 +1,216 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_q31.c
* Description: Q31 matrix addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Q31 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
arm_status status; /* status of matrix addition */
uint32_t numSamples; /* total number of elements in the matrix */
q31_t *pDataA, *pDataB, *pDataDst;
q31x4_t vecA, vecB, vecDst;
q31_t const *pSrcAVec;
q31_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q31_t const *) pDataA;
pSrcBVec = (q31_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixAdd group
*/

View File

@@ -0,0 +1,256 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cholesky_f16.c
* Description: Floating-point Cholesky decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point Cholesky decomposition of positive-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pDst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
* The decomposition of A is returning a lower triangular matrix U such that A = U U^t
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cholesky_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
_Float16 invSqrtVj;
float16_t *pA,*pG;
int kCnt;
mve_pred16_t p0;
f16x8_t acc, acc0, acc1, acc2, acc3;
f16x8_t vecGi;
f16x8_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
for(j=i ; j+3 < n ; j+=4)
{
acc0 = vdupq_n_f16(0.0f16);
acc0[0]=pA[(j + 0) * n + i];
acc1 = vdupq_n_f16(0.0f16);
acc1[0]=pA[(j + 1) * n + i];
acc2 = vdupq_n_f16(0.0f16);
acc2[0]=pA[(j + 2) * n + i];
acc3 = vdupq_n_f16(0.0f16);
acc3[0]=pA[(j + 3) * n + i];
kCnt = i;
for(k=0; k < i ; k+=8)
{
p0 = vctp16q(kCnt);
vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
vecGj0=vldrhq_z_f16(&pG[(j + 0) * n + k],p0);
vecGj1=vldrhq_z_f16(&pG[(j + 1) * n + k],p0);
vecGj2=vldrhq_z_f16(&pG[(j + 2) * n + k],p0);
vecGj3=vldrhq_z_f16(&pG[(j + 3) * n + k],p0);
acc0 = vfmsq_m(acc0, vecGi, vecGj0, p0);
acc1 = vfmsq_m(acc1, vecGi, vecGj1, p0);
acc2 = vfmsq_m(acc2, vecGi, vecGj2, p0);
acc3 = vfmsq_m(acc3, vecGi, vecGj3, p0);
kCnt -= 8;
}
pG[(j + 0) * n + i] = vecAddAcrossF16Mve(acc0);
pG[(j + 1) * n + i] = vecAddAcrossF16Mve(acc1);
pG[(j + 2) * n + i] = vecAddAcrossF16Mve(acc2);
pG[(j + 3) * n + i] = vecAddAcrossF16Mve(acc3);
}
for(; j < n ; j++)
{
kCnt = i;
acc = vdupq_n_f16(0.0f16);
acc[0] = pA[j * n + i];
for(k=0; k < i ; k+=8)
{
p0 = vctp16q(kCnt);
vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
vecGj=vldrhq_z_f16(&pG[j * n + k],p0);
acc = vfmsq_m(acc, vecGi, vecGj,p0);
kCnt -= 8;
}
pG[j * n + i] = vecAddAcrossF16Mve(acc);
}
if ((_Float16)pG[i * n + i] <= 0.0f16)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0f16/(_Float16)sqrtf((float32_t)pG[i * n + i]);
for(j=i; j < n ; j++)
{
pG[j * n + i] = (_Float16)pG[j * n + i] * (_Float16)invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cholesky_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float16_t invSqrtVj;
float16_t *pA,*pG;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ; i < n ; i++)
{
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
for(k=0; k < i ; k++)
{
pG[j * n + i] = (_Float16)pG[j * n + i] - (_Float16)pG[i * n + k] * (_Float16)pG[j * n + k];
}
}
if ((_Float16)pG[i * n + i] <= 0.0f16)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
/* The division is done in float32 for accuracy reason and
because doing it in f16 would not have any impact on the performances.
*/
invSqrtVj = 1.0f/sqrtf((float32_t)pG[i * n + i]);
for(j=i ; j < n ; j++)
{
pG[j * n + i] = (_Float16)pG[j * n + i] * (_Float16)invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixChol group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,438 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cholesky_f32.c
* Description: Floating-point Cholesky decomposition
*
* $Date: 05 October 2021
* $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixChol Cholesky and LDLT decompositions
Computes the Cholesky or LDL^t decomposition of a matrix.
If the input matrix does not have a decomposition, then the
algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point Cholesky decomposition of positive-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pDst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
* The decomposition of A is returning a lower triangular matrix U such that A = U U^t
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cholesky_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float32_t invSqrtVj;
float32_t *pA,*pG;
int kCnt;
mve_pred16_t p0;
f32x4_t acc, acc0, acc1, acc2, acc3;
f32x4_t vecGi;
f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
for(j=i ; j+3 < n ; j+=4)
{
pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
kCnt = i;
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
acc2 = vdupq_n_f32(0.0f);
acc3 = vdupq_n_f32(0.0f);
for(k=0; k < i ; k+=4)
{
p0 = vctp32q(kCnt);
vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
vecGj0=vldrwq_z_f32(&pG[(j + 0) * n + k],p0);
vecGj1=vldrwq_z_f32(&pG[(j + 1) * n + k],p0);
vecGj2=vldrwq_z_f32(&pG[(j + 2) * n + k],p0);
vecGj3=vldrwq_z_f32(&pG[(j + 3) * n + k],p0);
acc0 = vfmaq_m(acc0, vecGi, vecGj0, p0);
acc1 = vfmaq_m(acc1, vecGi, vecGj1, p0);
acc2 = vfmaq_m(acc2, vecGi, vecGj2, p0);
acc3 = vfmaq_m(acc3, vecGi, vecGj3, p0);
kCnt -= 4;
}
pG[(j + 0) * n + i] -= vecAddAcrossF32Mve(acc0);
pG[(j + 1) * n + i] -= vecAddAcrossF32Mve(acc1);
pG[(j + 2) * n + i] -= vecAddAcrossF32Mve(acc2);
pG[(j + 3) * n + i] -= vecAddAcrossF32Mve(acc3);
}
for(; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
kCnt = i;
acc = vdupq_n_f32(0.0f);
for(k=0; k < i ; k+=4)
{
p0 = vctp32q(kCnt);
vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
vecGj=vldrwq_z_f32(&pG[j * n + k],p0);
acc = vfmaq_m(acc, vecGi, vecGj,p0);
kCnt -= 4;
}
pG[j * n + i] -= vecAddAcrossF32Mve(acc);
}
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
for(j=i; j < n ; j++)
{
pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_cholesky_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float32_t invSqrtVj;
float32_t *pA,*pG;
int kCnt;
f32x4_t acc, acc0, acc1, acc2, acc3;
f32x4_t vecGi;
f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
#if !defined(__aarch64__)
f32x2_t tmp = vdup_n_f32(0);
#endif
float32_t sum=0.0f;
float32_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
for(j=i ; j+3 < n ; j+=4)
{
pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
acc2 = vdupq_n_f32(0.0f);
acc3 = vdupq_n_f32(0.0f);
kCnt = i >> 2;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f32(&pG[i * n + k]);
vecGj0=vld1q_f32(&pG[(j + 0) * n + k]);
vecGj1=vld1q_f32(&pG[(j + 1) * n + k]);
vecGj2=vld1q_f32(&pG[(j + 2) * n + k]);
vecGj3=vld1q_f32(&pG[(j + 3) * n + k]);
acc0 = vfmaq_f32(acc0, vecGi, vecGj0);
acc1 = vfmaq_f32(acc1, vecGi, vecGj1);
acc2 = vfmaq_f32(acc2, vecGi, vecGj2);
acc3 = vfmaq_f32(acc3, vecGi, vecGj3);
kCnt--;
k+=4;
}
#if defined(__aarch64__)
sum0 = vpadds_f32(vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)));
sum1 = vpadds_f32(vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)));
sum2 = vpadds_f32(vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)));
sum3 = vpadds_f32(vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)));
#else
tmp = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum0 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
tmp = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
sum1 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
tmp = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
sum2 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
tmp = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
sum3 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
#endif
kCnt = i & 3;
while(kCnt > 0)
{
sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
kCnt--;
k++;
}
pG[(j + 0) * n + i] -= sum0;
pG[(j + 1) * n + i] -= sum1;
pG[(j + 2) * n + i] -= sum2;
pG[(j + 3) * n + i] -= sum3;
}
for(; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
acc = vdupq_n_f32(0.0f);
kCnt = i >> 2;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f32(&pG[i * n + k]);
vecGj=vld1q_f32(&pG[j * n + k]);
acc = vfmaq_f32(acc, vecGi, vecGj);
kCnt--;
k+=4;
}
#if defined(__aarch64__)
sum = vpadds_f32(vpadd_f32(vget_low_f32(acc), vget_high_f32(acc)));
#else
tmp = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
#endif
kCnt = i & 3;
while(kCnt > 0)
{
sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
kCnt--;
k++;
}
pG[j * n + i] -= sum;
}
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
for(j=i; j < n ; j++)
{
pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cholesky_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float32_t invSqrtVj;
float32_t *pA,*pG;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ; i < n ; i++)
{
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
for(k=0; k < i ; k++)
{
pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
}
}
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cholesky_f64.c
* Description: Floating-point Cholesky decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point Cholesky decomposition of positive-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pDst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
* The decomposition of A is returning a lower triangular matrix U such that A = U U^t
*/
arm_status arm_mat_cholesky_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float64_t invSqrtVj;
float64_t *pA,*pG;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ; i < n ; i++)
{
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
for(k=0; k < i ; k++)
{
pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
}
}
if (pG[i * n + i] <= 0.0)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0/sqrt(pG[i * n + i]);
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,935 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_mult_f16.c
* Description: Floating-point matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Floating-point Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
#endif
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
#include "arm_helium_utils.h"
#define DONTCARE 0 /* inactive lane content */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_2x2_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
#define MATRIX_DIM 2
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0,vecColAOffs0,vecColAOffs1;
float16_t *pInA0 = pInA;
f16x8_t acc0, acc1;
f16x8_t vecB, vecA0, vecA1;
f16x8_t vecTmp;
uint16_t tmp;
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2, 3,
MATRIX_DIM * CMPLX_DIM + 2 , MATRIX_DIM * CMPLX_DIM + 3,
};
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
tmp = 0;
vecColAOffs0 = viwdupq_u16(tmp, 4, 1);
tmp = (CMPLX_DIM * MATRIX_DIM);
vecColAOffs1 = vecColAOffs0 + (uint16_t)(CMPLX_DIM * MATRIX_DIM);
pInB = (float16_t const *)pSrcB->pData;
vecA0 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs0);
vecA1 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs1);
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
/*
* Compute
* re0+re1 | im0+im1 | re0+re1 | im0+im1
* re2+re3 | im2+im3 | re2+re3 | im2+im3
*/
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc0);
vecTmp = vaddq(vecTmp, acc0);
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc1);
vecTmp = vaddq(vecTmp, acc1);
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
#undef MATRIX_DIM
}
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_3x3_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
#define MATRIX_DIM 3
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0;
float16_t *pInA0 = pInA;
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
f16x8_t acc0, acc1, acc2;
f16x8_t vecB, vecA0, vecA1, vecA2;
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
DONTCARE, DONTCARE
};
/* enable predication to disable upper half complex vector element */
mve_pred16_t p0 = vctp16q(MATRIX_DIM * CMPLX_DIM);
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
pInB = (float16_t const *)pSrcB->pData;
vecA0 = vldrhq_f16(pInA0);
vecA1 = vldrhq_f16(pInA1);
vecA2 = vldrhq_f16(pInA2);
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
* move to next B column
*/
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
* move to next B column
*/
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
#undef MATRIX_DIM
}
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_4x4_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
#define MATRIX_DIM 4
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0;
float16_t *pInA0 = pInA;
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM;
f16x8_t acc0, acc1, acc2, acc3;
f16x8_t vecB, vecA;
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
3 * MATRIX_DIM * CMPLX_DIM, 3 * MATRIX_DIM * CMPLX_DIM + 1
};
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
pInB = (float16_t const *)pSrcB->pData;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
* move to next B column
*/
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
* move to next B column
*/
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
* move to next B column
*/
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
#undef MATRIX_DIM
}
arm_status arm_mat_cmplx_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t const *pInB = (float16_t const *) pSrcB->pData; /* input data matrix pointer B */
float16_t const *pInA = (float16_t const *) pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint16x8_t vecOffs, vecColBOffs;
uint32_t blkCnt,rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* small squared matrix specialized routines
*/
if (numRowsA == numColsB && numColsB == numColsA)
{
if (numRowsA == 1)
{
pOut[0] = (_Float16)pInA[0] * (_Float16)pInB[0] - (_Float16)pInA[1] * (_Float16)pInB[1];
pOut[1] = (_Float16)pInA[0] * (_Float16)pInB[1] + (_Float16)pInA[1] * (_Float16)pInB[0];
return (ARM_MATH_SUCCESS);
}
else if (numRowsA == 2)
return arm_mat_cmplx_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 3)
return arm_mat_cmplx_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_cmplx_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
vecColBOffs[4] = 2*numColsB * CMPLX_DIM;
vecColBOffs[5] = 2*(numColsB * CMPLX_DIM) + 1;
vecColBOffs[6] = 3*numColsB * CMPLX_DIM;
vecColBOffs[7] = 3*(numColsB * CMPLX_DIM) + 1;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 2;
while (rowCnt > 0u)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i * CMPLX_DIM;
i = i + 4 * numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (float16_t const *) pSrcB->pData;
/*
* column loop
*/
while (col > 0u)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
float16_t const *pInA0 = pInA;
float16_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
float16_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
float16_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
f16x8_t acc0, acc1, acc2, acc3;
acc0 = vdupq_n_f16(0.0f16);
acc1 = vdupq_n_f16(0.0f16);
acc2 = vdupq_n_f16(0.0f16);
acc3 = vdupq_n_f16(0.0f16);
pSrcA0Vec = (float16_t const *) pInA0;
pSrcA1Vec = (float16_t const *) pInA1;
pSrcA2Vec = (float16_t const *) pInA2;
pSrcA3Vec = (float16_t const *) pInA3;
vecOffs = vecColBOffs;
/*
* process 1 x 4 block output
*/
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset_f16(pInB, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs , (uint16_t) (numColsB * 4 * CMPLX_DIM));
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
acc1 = vcmlaq(acc1, vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
acc2 = vcmlaq(acc2, vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
acc3 = vcmlaq(acc3, vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
blkCnt--;
}
/*
* Unsupported addressing mode compiler crash
*/
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset_z_f16(pInB, vecOffs, p0);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
vecA = vld1q(pSrcA0Vec);
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec);
acc1 = vcmlaq(acc1, vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec);
acc2 = vcmlaq(acc2, vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec);
acc3 = vcmlaq(acc3, vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
}
mve_cmplx_sum_intra_vec_f16(acc0, &px[0 * CMPLX_DIM * numColsB + 0]);
mve_cmplx_sum_intra_vec_f16(acc1, &px[1 * CMPLX_DIM * numColsB + 0]);
mve_cmplx_sum_intra_vec_f16(acc2, &px[2 * CMPLX_DIM * numColsB + 0]);
mve_cmplx_sum_intra_vec_f16(acc3, &px[3 * CMPLX_DIM * numColsB + 0]);
px += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 4) * CMPLX_DIM;
/*
* Decrement the row loop counter
*/
rowCnt --;
}
rowCnt = row & 3;
while (rowCnt > 0u)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i * CMPLX_DIM;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (float16_t const *) pSrcB->pData;
/*
* column loop
*/
while (col > 0u)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
float16_t const *pSrcA0Vec;
float16_t const *pInA0 = pInA;
f16x8_t acc0;
acc0 = vdupq_n_f16(0.0f16);
pSrcA0Vec = (float16_t const *) pInA0;
vecOffs = vecColBOffs;
/*
* process 1 x 4 block output
*/
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (4*numColsB * CMPLX_DIM));
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 8;
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset_z(pInB, vecOffs, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
}
mve_cmplx_sum_intra_vec_f16(acc0, &px[0]);
px += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += numColsA * CMPLX_DIM;
rowCnt--;
}
/*
* set status as ARM_MATH_SUCCESS
*/
status = ARM_MATH_SUCCESS;
}
/*
* Return to application
*/
return (status);
}
#else
arm_status arm_mat_cmplx_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float16_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
_Float16 sumReal, sumImag; /* Accumulator */
_Float16 a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
_Float16 a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0f16;
sumImag = 0.0f16;
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop count */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sumReal;
*px++ = sumImag;
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,595 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mat_mult_q15.c
* Description: Q15 complex matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Q15 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@param[in] pScratch points to an array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Conditions for optimum performance
Input, output and state buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
q15_t const *pInA = (q15_t const *) pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t const *pInB = (q15_t const *) pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t const *pInB2;
q15_t *px; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
uint32_t blkCnt; /* loop counters */
uint16x8_t vecOffs, vecColBOffs;
arm_status status; /* Status of matrix multiplication */
(void)pScratch;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
vecColBOffs[4] = 2 * numColsB * CMPLX_DIM;
vecColBOffs[5] = 2 * (numColsB * CMPLX_DIM) + 1;
vecColBOffs[6] = 3 * numColsB * CMPLX_DIM;
vecColBOffs[7] = 3 * (numColsB * CMPLX_DIM) + 1;
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA;
px = pDst->pData;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
while (row > 0u)
{
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB >> 1;
j = 0;
/*
* column loop
*/
while (col > 0u)
{
q15_t const *pSrcAVec;
//, *pSrcBVec, *pSrcB2Vec;
q15x8_t vecA, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInB2 = pInB + CMPLX_DIM;
j += 2 * CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Initiate the pointers
* - current Matrix A rows
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pSrcAVec = (q15_t const *) pInA;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
vecOffs = vecColBOffs;
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
vecA = vldrhq_z_s16(pSrcAVec, p0);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
}
/*
* Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
*/
*px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc2, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc3, 15);
}
col = numColsB & 1;
/*
* column loop
*/
while (col > 0u)
{
q15_t const *pSrcAVec;
//, *pSrcBVec, *pSrcB2Vec;
q15x8_t vecA, vecB;
q63_t acc0, acc1;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
j += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Initiate the pointers
* - current Matrix A rows
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pSrcAVec = (q15_t const *) pInA;
acc0 = 0LL;
acc1 = 0LL;
vecOffs = vecColBOffs;
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
vecA = vldrhq_z_s16(pSrcAVec, p0);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
}
/*
* Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
*/
*px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
}
i = i + numColsA * CMPLX_DIM;
/*
* Decrement the row loop counter
*/
row--;
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
q63_t sumReal, sumImag; /* accumulator */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#else
q15_t a, b, c, d;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
#else
/* Initialize blkCnt with number of samples */
col = numColsB;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement column loop counter */
col--;
}
i = i + 2U;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set variable sum, that acts as accumulator, to zero */
sumReal = 0;
sumImag = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i * 2;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia (&pInA);
pSourceB = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia (&pInA);
pSourceB = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA buffer */
a = *pInA;
b = *(pInA + 1U);
/* read real and imag values from pSrcB buffer */
c = *pInB;
d = *(pInB + 1U);
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
/* read next real and imag values from pSrcA buffer */
a = *(pInA + 2U);
b = *(pInA + 3U);
/* read next real and imag values from pSrcB buffer */
c = *(pInB + 2U);
d = *(pInB + 3U);
/* update pointer */
pInA += 4U;
/* Multiply and Accumlates */
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
/* update pointer */
pInB += 4U;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
if ((numColsA & 0x1U) > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia (&pInA);
pSourceB = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
a = *pInA++;
b = *pInA++;
c = *pInB++;
d = *pInB++;
/* Multiply and Accumlates */
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
#endif /* #if defined (ARM_MATH_DSP) */
}
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,131 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_f16.c
* Description: Floating-point complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_f16(const arm_matrix_instance_f16 * pSrc, arm_matrix_instance_f16 * pDst)
{
return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,
pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);
}
#else
arm_status arm_mat_cmplx_trans_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t col, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* The pointer px is set to starting address of the column being processed */
px = pOut + CMPLX_DIM * i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U)
{
/* Read and store the input element in the destination */
px[0] = *pIn++; // real
px[1] = *pIn++; // imag
/* Update the pointer px to point to the next row of the transposed matrix */
px += CMPLX_DIM * nRows;
/* Decrement the column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixTrans group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,133 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_f32.c
* Description: Floating-point complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixComplexTrans Complex Matrix Transpose
Tranposes a complex matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_f32(const arm_matrix_instance_f32 * pSrc, arm_matrix_instance_f32 * pDst)
{
return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, (uint32_t *) pSrc->pData,
pDst->numRows, pDst->numCols, (uint32_t *) pDst->pData);
}
#else
arm_status arm_mat_cmplx_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t col, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* The pointer px is set to starting address of the column being processed */
px = pOut + CMPLX_DIM * i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U)
{
/* Read and store the input element in the destination */
px[0] = *pIn++; // real
px[1] = *pIn++; // imag
/* Update the pointer px to point to the next row of the transposed matrix */
px += CMPLX_DIM * nRows;
/* Decrement the column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixTrans group
*/

View File

@@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_q31.c
* Description: Q15 complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
@brief Q15 complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_q15(const arm_matrix_instance_q15 * pSrc, arm_matrix_instance_q15 * pDst)
{
return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,
pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);
}
#else
arm_status arm_mat_cmplx_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
{
q15_t *pSrcA = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of nRows */
uint16_t nColumns = pSrc->numCols; /* number of nColumns */
uint16_t col, row = nRows, i = 0U; /* row and column loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* The pointer pOut is set to starting address of the column being processed */
pOut = pDst->pData + CMPLX_DIM * i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U)
{
/* Read and store the input element in the destination */
pOut[0] = *pSrcA++; //real
pOut[1] = *pSrcA++; //imag
/* Update the pointer pOut to point to the next row of the transposed matrix */
pOut += CMPLX_DIM *nRows;
/* Decrement the column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixTrans group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_q31.c
* Description: Q31 complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
@brief Q31 complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_q31(const arm_matrix_instance_q31 * pSrc, arm_matrix_instance_q31 * pDst)
{
return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, (uint32_t *) pSrc->pData,
pDst->numRows, pDst->numCols, (uint32_t *) pDst->pData);
}
#else
arm_status arm_mat_cmplx_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of nRows */
uint16_t nColumns = pSrc->numCols; /* number of nColumns */
uint16_t col, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* The pointer px is set to starting address of the column being processed */
px = pOut + CMPLX_DIM * i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U)
{
/* Read and store the input element in the destination */
px[0] = *pIn++; // real
px[1] = *pIn++; // imag
/* Update the pointer px to point to the next row of the transposed matrix */
px += CMPLX_DIM * nRows;
/* Decrement the column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
row--;
}
while (row > 0U); /* row loop end */
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixTrans group
*/

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_f16.c
* Description: Floating-point matrix initialization
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f16(
arm_matrix_instance_f16 * S,
uint16_t nRows,
uint16_t nColumns,
float16_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,76 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_f32.c
* Description: Floating-point matrix initialization
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixInit Matrix Initialization
Initializes the underlying matrix data structure.
The functions set the <code>numRows</code>,
<code>numCols</code>, and <code>pData</code> fields
of the matrix data structure.
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
uint16_t nRows,
uint16_t nColumns,
float32_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View File

@@ -0,0 +1,67 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_q15.c
* Description: Q15 matrix initialization
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Q15 matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
uint16_t nRows,
uint16_t nColumns,
q15_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View File

@@ -0,0 +1,72 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_q31.c
* Description: Q31 matrix initialization
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixInit Matrix Initialization
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Q31 matrix initialization.
@param[in,out] S points to an instance of the Q31 matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
uint16_t nRows,
uint16_t nColumns,
q31_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View File

@@ -0,0 +1,891 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_inverse_f16.c
* Description: Floating-point matrix inverse
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
@brief Floating-point matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_inverse_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t *pTmpA, *pTmpB;
_Float16 in = 0.0f16; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
arm_status status; /* status of matrix inverse */
uint32_t blkCnt;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/*
* Working pointer for destination matrix
*/
pOutT1 = pOut;
/*
* Loop over the number of rows
*/
rowCnt = numRows;
/*
* Making the destination matrix as identity matrix
*/
while (rowCnt > 0U)
{
/*
* Writing all zeroes in lower triangle of the destination matrix
*/
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/*
* Writing all ones in the diagonal of the destination matrix
*/
*pOutT1++ = 1.0f16;
/*
* Writing all zeroes in upper triangle of the destination matrix
*/
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/*
* Decrement the loop counter
*/
rowCnt--;
}
/*
* Loop over the number of columns of the input matrix.
* All the elements in each column are processed by the row operations
*/
loopCnt = numCols;
/*
* Index modifier to navigate through the columns
*/
l = 0U;
while (loopCnt > 0U)
{
/*
* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular.
*/
/*
* Working pointer for the input matrix that points
* * to the pivot element of the particular row
*/
pInT1 = pIn + (l * numCols);
/*
* Working pointer for the destination matrix that points
* * to the pivot element of the particular row
*/
pOutT1 = pOut + (l * numCols);
/*
* Temporary variable to hold the pivot value
*/
in = *pInT1;
/*
* Check if the pivot element is zero
*/
if ((_Float16)*pInT1 == 0.0f16)
{
/*
* Loop over the number rows present below
*/
for (i = 1U; i < numRows-l; i++)
{
/*
* Update the input and destination pointers
*/
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/*
* Check if there is a non zero pivot element to
* * replace in the rows below
*/
if ((_Float16)*pInT2 != 0.0f16)
{
f16x8_t vecA, vecB;
/*
* Loop over number of columns
* * to the right of the pilot element
*/
pTmpA = pInT1;
pTmpB = pInT2;
blkCnt = (numCols - l) >> 3;
while (blkCnt > 0U)
{
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_f16(pTmpB, vecA);
vstrhq_f16(pTmpA, vecB);
pTmpA += 8;
pTmpB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numCols - l) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_p_f16(pTmpB, vecA, p0);
vstrhq_p_f16(pTmpA, vecB, p0);
}
pInT1 += numCols - l;
pInT2 += numCols - l;
pTmpA = pOutT1;
pTmpB = pOutT2;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_f16(pTmpB, vecA);
vstrhq_f16(pTmpA, vecB);
pTmpA += 8;
pTmpB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_p_f16(pTmpB, vecA, p0);
vstrhq_p_f16(pTmpA, vecB, p0);
}
pOutT1 += numCols;
pOutT2 += numCols;
/*
* Flag to indicate whether exchange is done or not
*/
flag = 1U;
/*
* Break after exchange is done
*/
break;
}
}
}
/*
* Update the status if the matrix is singular
*/
if ((flag != 1U) && (in == 0.0f16))
{
return ARM_MATH_SINGULAR;
}
/*
* Points to the pivot row of input and destination matrices
*/
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/*
* Temporary pointers to the pivot row pointers
*/
pInT1 = pPivotRowIn;
pOutT1 = pPivotRowDst;
/*
* Pivot element of the row
*/
in = *(pIn + (l * numCols));
pTmpA = pInT1;
f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
blkCnt = (numCols - l) >> 3;
f16x8_t vecA;
while (blkCnt > 0U)
{
*(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
pTmpA += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = (numCols - l) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecA = vecA * invIn;
vstrhq_p_f16(pTmpA, vecA, p0);
}
pInT1 += numCols - l;
/*
* Loop over number of columns
* * to the right of the pilot element
*/
pTmpA = pOutT1;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
*(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
pTmpA += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecA = vecA * invIn;
vstrhq_p_f16(pTmpA, vecA, p0);
}
pOutT1 += numCols;
/*
* Replace the rows with the sum of that row and a multiple of row i
* * so that each new element in column i above row i is zero.
*/
/*
* Temporary pointers for input and destination matrices
*/
pInT1 = pIn;
pOutT1 = pOut;
for (i = 0U; i < numRows; i++)
{
/*
* Check for the pivot element
*/
if (i == l)
{
/*
* If the processing element is the pivot element,
* only the columns to the right are to be processed
*/
pInT1 += numCols - l;
pOutT1 += numCols;
}
else
{
/*
* Element of the reference row
*/
/*
* Working pointers for input and destination pivot rows
*/
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/*
* Loop over the number of columns to the right of the pivot element,
* to replace the elements in the input matrix
*/
in = *pInT1;
f16x8_t tmpV = vdupq_n_f16(in);
blkCnt = (numCols - l) >> 3;
while (blkCnt > 0U)
{
f16x8_t vec1, vec2;
/*
* Replace the element by the sum of that row
* and a multiple of the reference row
*/
vec1 = vldrhq_f16(pInT1);
vec2 = vldrhq_f16(pPRT_in);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_f16(pInT1, vec1);
pPRT_in += 8;
pInT1 += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numCols - l) & 7;
if (blkCnt > 0U)
{
f16x8_t vec1, vec2;
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vldrhq_f16(pInT1);
vec2 = vldrhq_f16(pPRT_in);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_p_f16(pInT1, vec1, p0);
pInT1 += blkCnt;
}
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
f16x8_t vec1, vec2;
/*
* Replace the element by the sum of that row
* and a multiple of the reference row
*/
vec1 = vldrhq_f16(pOutT1);
vec2 = vldrhq_f16(pPRT_pDst);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_f16(pOutT1, vec1);
pPRT_pDst += 8;
pOutT1 += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
f16x8_t vec1, vec2;
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vldrhq_f16(pOutT1);
vec2 = vldrhq_f16(pPRT_pDst);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_p_f16(pOutT1, vec1, p0);
pInT2 += blkCnt;
pOutT1 += blkCnt;
}
}
/*
* Increment the temporary input pointer
*/
pInT1 = pInT1 + l;
}
/*
* Increment the input pointer
*/
pIn++;
/*
* Decrement the loop counter
*/
loopCnt--;
/*
* Increment the index modifier
*/
l++;
}
/*
* Set status as ARM_MATH_SUCCESS
*/
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f16))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if ((_Float16)pIn[i] != 0.0f16)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_inverse_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
_Float16 Xchg, in = 0.0f16, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f16;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Check if the pivot element is zero */
if ((_Float16)*pInT1 == 0.0f16)
{
/* Loop over the number rows present below */
for (i = 1U; i < numRows-l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if ((_Float16)*pInT2 != 0.0f16)
{
/* Loop over number of columns
* to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f16))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
/* Loop over number of columns
* to the right of the pilot element */
j = (numCols - l);
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
j = (numCols - l);
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
j = numCols;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_pDst++);
/* Decrement loop counter */
j--;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement loop counter */
k--;
/* Increment pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && ((_Float16)in == 0.0f16))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if ((_Float16)pIn[i] != 0.0f16)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,644 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_inverse_f64.c
* Description: Floating-point matrix inverse
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
@brief Floating-point (64 bit) matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float64_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float64_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float64_t Xchg, in = 0.0, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = 1U; i < numRows - l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Decrement loop counter */
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
/* Loop over number of columns
* to the right of the pilot element */
j = (numCols - l);
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
j = (numCols - l);
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = in1 - (in * *pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
j = numCols;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement loop counter */
j--;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement loop counter */
k--;
/* Increment pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#else
float64_t Xchg, in = 0.0; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = 1U; i < numRows-l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
for (j = 0U; j < (numCols - l); j++)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
}
for (j = 0U; j < numCols; j++)
{
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pOutT1 = pPivotRowDst;
/* Pivot element of the row */
in = *(pIn + (l * numCols));
/* Loop over number of columns
* to the right of the pilot element */
for (j = 0U; j < (numCols - l); j++)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
*pInT1 = *pInT1 / in;
pInT1++;
}
for (j = 0U; j < numCols; j++)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
*pOutT1 = *pOutT1 / in;
pOutT1++;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pOutT1 = pOut;
for (i = 0U; i < numRows; i++)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pOutT1 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
for (j = 0U; j < (numCols - l); j++)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
*pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
pOutT1++;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,509 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_ldl_f32.c
* Description: Floating-point LDL decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/// @private
#define SWAP_ROWS_F32(A,i,j) \
{ \
int cnt = n; \
\
for(int w=0;w < n; w+=4) \
{ \
f32x4_t tmpa,tmpb; \
mve_pred16_t p0 = vctp32q(cnt); \
\
tmpa=vldrwq_z_f32(&A[i*n + w],p0);\
tmpb=vldrwq_z_f32(&A[j*n + w],p0);\
\
vstrwq_p(&A[i*n + w], tmpb, p0); \
vstrwq_p(&A[j*n + w], tmpa, p0); \
\
cnt -= 4; \
} \
}
/// @private
#define SWAP_COLS_F32(A,i,j) \
for(int w=0;w < n; w++) \
{ \
float32_t tmp; \
tmp = A[w*n + i]; \
A[w*n + i] = A[w*n + j];\
A[w*n + j] = tmp; \
}
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pl points to the instance of the output floating-point triangular matrix structure.
* @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
* @param[out] pp points to the instance of the output floating-point permutation vector.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
*/
arm_status arm_mat_ldlt_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pl,
arm_matrix_instance_f32 * pd,
uint16_t * pp)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pl->numRows != pl->numCols) ||
(pd->numRows != pd->numCols) ||
(pl->numRows != pd->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
const int n=pSrc->numRows;
int fullRank = 1, diag,k;
float32_t *pA;
memset(pd->pData,0,sizeof(float32_t)*n*n);
memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
pA = pl->pData;
int cnt = n;
uint16x8_t vecP;
for(int k=0;k < n; k+=8)
{
mve_pred16_t p0;
p0 = vctp16q(cnt);
vecP = vidupq_u16((uint16_t)k, 1);
vstrhq_p(&pp[k], vecP, p0);
cnt -= 8;
}
for(k=0;k < n; k++)
{
/* Find pivot */
float32_t m=F32_MIN,a;
int j=k;
for(int r=k;r<n;r++)
{
if (pA[r*n+r] > m)
{
m = pA[r*n+r];
j = r;
}
}
if(j != k)
{
SWAP_ROWS_F32(pA,k,j);
SWAP_COLS_F32(pA,k,j);
}
pp[k] = j;
a = pA[k*n+k];
if (fabsf(a) < 1.0e-8f)
{
fullRank = 0;
break;
}
float32_t invA;
invA = 1.0f / a;
int32x4_t vecOffs;
int w;
vecOffs = vidupq_u32((uint32_t)0, 1);
vecOffs = vmulq_n_s32(vecOffs,n);
for(w=k+1; w<n; w+=4)
{
int cnt = n - k - 1;
f32x4_t vecX;
f32x4_t vecA;
f32x4_t vecW0,vecW1, vecW2, vecW3;
mve_pred16_t p0;
vecW0 = vdupq_n_f32(pA[(w + 0)*n+k]);
vecW1 = vdupq_n_f32(pA[(w + 1)*n+k]);
vecW2 = vdupq_n_f32(pA[(w + 2)*n+k]);
vecW3 = vdupq_n_f32(pA[(w + 3)*n+k]);
for(int x=k+1;x<n;x += 4)
{
p0 = vctp32q(cnt);
//pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
vecA = vldrwq_z_f32(&pA[(w + 0)*n+x],p0);
vecA = vfmsq_m(vecA, vecW0, vecX, p0);
vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);
vecA = vldrwq_z_f32(&pA[(w + 1)*n+x],p0);
vecA = vfmsq_m(vecA, vecW1, vecX, p0);
vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);
vecA = vldrwq_z_f32(&pA[(w + 2)*n+x],p0);
vecA = vfmsq_m(vecA, vecW2, vecX, p0);
vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);
vecA = vldrwq_z_f32(&pA[(w + 3)*n+x],p0);
vecA = vfmsq_m(vecA, vecW3, vecX, p0);
vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);
cnt -= 4;
}
}
for(; w<n; w++)
{
int cnt = n - k - 1;
f32x4_t vecA,vecX,vecW;
mve_pred16_t p0;
vecW = vdupq_n_f32(pA[w*n+k]);
for(int x=k+1;x<n;x += 4)
{
p0 = vctp32q(cnt);
//pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
vecA = vldrwq_z_f32(&pA[w*n+x],p0);
vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
vecA = vfmsq_m(vecA, vecW, vecX, p0);
vstrwq_p(&pA[w*n+x], vecA, p0);
cnt -= 4;
}
}
for(int w=k+1;w<n;w++)
{
pA[w*n+k] = pA[w*n+k] * invA;
}
}
diag=k;
if (!fullRank)
{
diag--;
for(int row=0; row < n;row++)
{
mve_pred16_t p0;
int cnt= n-k;
f32x4_t zero=vdupq_n_f32(0.0f);
for(int col=k; col < n;col += 4)
{
p0 = vctp32q(cnt);
vstrwq_p(&pl->pData[row*n+col], zero, p0);
cnt -= 4;
}
}
}
for(int row=0; row < n;row++)
{
mve_pred16_t p0;
int cnt= n-row-1;
f32x4_t zero=vdupq_n_f32(0.0f);
for(int col=row+1; col < n;col+=4)
{
p0 = vctp32q(cnt);
vstrwq_p(&pl->pData[row*n+col], zero, p0);
cnt -= 4;
}
}
for(int d=0; d < diag;d++)
{
pd->pData[d*n+d] = pl->pData[d*n+d];
pl->pData[d*n+d] = 1.0;
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/// @private
#define SWAP_ROWS_F32(A,i,j) \
for(w=0;w < n; w++) \
{ \
float32_t tmp; \
tmp = A[i*n + w]; \
A[i*n + w] = A[j*n + w];\
A[j*n + w] = tmp; \
}
/// @private
#define SWAP_COLS_F32(A,i,j) \
for(w=0;w < n; w++) \
{ \
float32_t tmp; \
tmp = A[w*n + i]; \
A[w*n + i] = A[w*n + j];\
A[w*n + j] = tmp; \
}
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pl points to the instance of the output floating-point triangular matrix structure.
* @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
* @param[out] pp points to the instance of the output floating-point permutation vector.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
*/
arm_status arm_mat_ldlt_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pl,
arm_matrix_instance_f32 * pd,
uint16_t * pp)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pl->numRows != pl->numCols) ||
(pd->numRows != pd->numCols) ||
(pl->numRows != pd->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
const int n=pSrc->numRows;
int fullRank = 1, diag,k;
float32_t *pA;
int row,d;
memset(pd->pData,0,sizeof(float32_t)*n*n);
memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
pA = pl->pData;
for(k=0;k < n; k++)
{
pp[k] = k;
}
for(k=0;k < n; k++)
{
/* Find pivot */
float32_t m=F32_MIN,a;
int j=k;
int r;
int w;
for(r=k;r<n;r++)
{
if (pA[r*n+r] > m)
{
m = pA[r*n+r];
j = r;
}
}
if(j != k)
{
SWAP_ROWS_F32(pA,k,j);
SWAP_COLS_F32(pA,k,j);
}
pp[k] = j;
a = pA[k*n+k];
if (fabsf(a) < 1.0e-8f)
{
fullRank = 0;
break;
}
for(w=k+1;w<n;w++)
{
int x;
for(x=k+1;x<n;x++)
{
pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] / a;
}
}
for(w=k+1;w<n;w++)
{
pA[w*n+k] = pA[w*n+k] / a;
}
}
diag=k;
if (!fullRank)
{
diag--;
for(row=0; row < n;row++)
{
int col;
for(col=k; col < n;col++)
{
pl->pData[row*n+col]=0.0;
}
}
}
for(row=0; row < n;row++)
{
int col;
for(col=row+1; col < n;col++)
{
pl->pData[row*n+col] = 0.0;
}
}
for(d=0; d < diag;d++)
{
pd->pData[d*n+d] = pl->pData[d*n+d];
pl->pData[d*n+d] = 1.0;
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,229 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_ldl_f64.c
* Description: Floating-point LDL decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
#include <math.h>
/// @private
#define SWAP_ROWS_F64(A,i,j) \
{ \
int w; \
for(w=0;w < n; w++) \
{ \
float64_t tmp; \
tmp = A[i*n + w]; \
A[i*n + w] = A[j*n + w];\
A[j*n + w] = tmp; \
} \
}
/// @private
#define SWAP_COLS_F64(A,i,j) \
{ \
int w; \
for(w=0;w < n; w++) \
{ \
float64_t tmp; \
tmp = A[w*n + i]; \
A[w*n + i] = A[w*n + j];\
A[w*n + j] = tmp; \
} \
}
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pl points to the instance of the output floating-point triangular matrix structure.
* @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
* @param[out] pp points to the instance of the output floating-point permutation vector.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
*/
arm_status arm_mat_ldlt_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pl,
arm_matrix_instance_f64 * pd,
uint16_t * pp)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pl->numRows != pl->numCols) ||
(pd->numRows != pd->numCols) ||
(pl->numRows != pd->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
const int n=pSrc->numRows;
int fullRank = 1, diag,k;
float64_t *pA;
memset(pd->pData,0,sizeof(float64_t)*n*n);
memcpy(pl->pData,pSrc->pData,n*n*sizeof(float64_t));
pA = pl->pData;
for(k=0;k < n; k++)
{
pp[k] = k;
}
for(k=0;k < n; k++)
{
/* Find pivot */
float64_t m=F64_MIN,a;
int w,r,j=k;
for(r=k;r<n;r++)
{
if (pA[r*n+r] > m)
{
m = pA[r*n+r];
j = r;
}
}
if(j != k)
{
SWAP_ROWS_F64(pA,k,j);
SWAP_COLS_F64(pA,k,j);
}
pp[k] = j;
a = pA[k*n+k];
if (fabs(a) < 1.0e-18)
{
fullRank = 0;
break;
}
for(w=k+1;w<n;w++)
{
int x;
for(x=k+1;x<n;x++)
{
pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] / a;
}
}
for(w=k+1;w<n;w++)
{
pA[w*n+k] = pA[w*n+k] / a;
}
}
diag=k;
if (!fullRank)
{
diag--;
{
int row;
for(row=0; row < n;row++)
{
int col;
for(col=k; col < n;col++)
{
pl->pData[row*n+col]=0.0;
}
}
}
}
{
int row;
for(row=0; row < n;row++)
{
int col;
for(col=row+1; col < n;col++)
{
pl->pData[row*n+col] = 0.0;
}
}
}
{
int d;
for(d=0; d < diag;d++)
{
pd->pData[d*n+d] = pl->pData[d*n+d];
pl->pData[d*n+d] = 1.0;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,763 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f16.c
* Description: Floating-point matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
const arm_matrix_instance_f16 *pSrcA,
const arm_matrix_instance_f16 *pSrcB,
arm_matrix_instance_f16 *pDst)
{
static const uint16_t offsetA[8] = { 0, 0, 2, 2, 0, 0, 2, 2 };
/* offsetB allows to read and duplicate 1 row of B */
static const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 };
uint16x8_t vecOffsA, vecOffsB;
f16x8_t vecInA, vecInB, vecDst;
float16_t *pOut = pDst->pData; /* output data matrix pointer */
/*
* load initial offsets
*/
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
/*
* load {a00 a00 a10 a10 x x x x }
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* load {b00 b01 b00 b01 x x x x }
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 a00 b01
* a10 b00 a10 b01
* x x
* x x }
*/
vecDst = vmulq(vecInA, vecInB);
/*
* move to 2nd column of matrix A
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
/*
* load {a01 a01 a11 a11 x x x x}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* move to next B row
*/
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
/*
* load {b10, b11, b10, b11, x x x x }
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 + a01 b10 a00 b01 + a01 b11
* a10 b00 + a11 b10 a10 b01 + a11 b11
* x x
* x x }
*/
vecDst = vfmaq(vecDst, vecInA, vecInB);
mve_pred16_t p0 = vctp16q(2*2);
/*
* Store the result in the destination buffer
* (lower half of the vector)
*/
vstrhq_p(pOut, vecDst, p0);
return (ARM_MATH_SUCCESS);
}
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
const arm_matrix_instance_f16 *pSrcA,
const arm_matrix_instance_f16 *pSrcB,
arm_matrix_instance_f16 *pDst)
{
static const uint16_t offsetA[8] = { 0, 0, 0, 3, 3, 3, 6, 6 };
/* offsetB allows to read and duplicate 1 row of B */
static const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };
uint16x8_t vecOffsA, vecOffsB;
f16x8_t vecInA, vecInB, vecDst;
float16_t *pOut = pDst->pData; /* output data matrix pointer */
/*
* load initial offsets
*/
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
/*
* load {a00 a00 a00 a10 a10 a10 a20 a20}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* load {b00 b01 b02 b00 b01 b02 b00 b01}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 a00 b01 a00 b02
* a10 b00 a10 b01 a10 b02
* a20 b00 a20 b01}
*/
vecDst = vmulq(vecInA, vecInB);
/*
* move to 2nd column of matrix A
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
/*
* load {a01 a01 a01 a11 a11 a11 a21 a21}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* move to next B row
*/
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
/*
* load {b10, b11, b12, b10, b11, b12, b10, b11}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12
* a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12
* a20 b00 + a21 b10 a20 b01 + a21 b11 }
*/
vecDst = vfmaq(vecDst, vecInA, vecInB);
/*
* move to 3rd column of matrix A
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
/*
* load {a02 a02 a02 a12 a12 a12 a22 a22}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* move to next B row
*/
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
/*
* load {b20, b21, b22, b20, b21, b22, b20, b21}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* {a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22},
* a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22},
* a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 }
*/
vecDst = vfmaq(vecDst, vecInA, vecInB);
/*
* Store the result in the destination buffer
*/
vst1q(pOut, vecDst); pOut += 8;
/* last element computed in scalar mode
* a20 b02 + a21 b12 + a22 b22
*/
_Float16 * pA = (_Float16 *)pSrcA->pData;
_Float16 * pB = (_Float16 *)pSrcB->pData;
*pOut = pA[2*3] * pB[2] + pA[2*3+1] * pB[3+2] + pA[2*3+2] * pB[2*3+2];
return (ARM_MATH_SUCCESS);
}
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
const arm_matrix_instance_f16 *pSrcA,
const arm_matrix_instance_f16 *pSrcB,
arm_matrix_instance_f16 *pDst)
{
/* offsetA allows to read and duplicate 2 successive column elements of A */
static const uint16_t offsetA[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };
/* offsetB allows to read and duplicate 1 row of B */
static const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };
uint16x8_t vecOffsA, vecOffsB;
f16x8_t vecInA, vecInB, vecDst0, vecDst1;
float16_t *pOut = pDst->pData; /* output data matrix pointer */
/*
* load initial offsets
*/
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
/*
* load {a00 a00 a00 a00 a10 a10 a10 a10}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* load {b00 b01 b02 b03 b00 b01 b02 b03}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 a00 b01 a00 b02 a00 b03
* a10 b00 a10 b01 a10 b02 a10 b03 }
*/
vecDst0 = vmulq(vecInA, vecInB);
/*
* jump 2 x A rows (2nd half of matrix)
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a20 a20 a20 a20 a30 a30 a30 a30}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* { a20 b00 a20 b01 a20 b02 a20 b03
* a30 b00 a30 b01 a30 b02 + a31 b12 }
*/
vecDst1 = vmulq(vecInA, vecInB);
/*
* rewind back to top half of the A matrix (2nd column)
*/
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
/*
* load {a01 a01 a01 a01 a11 a11 a11 a11}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* move to next B row
*/
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
/*
* load {b10, b11, b12, b13, b10, b11, b12, b13}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12 a00 b03 + a01 b13
* a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12 a10 b03 + a11 b13 }
*/
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
/*
* jump 2 x A rows (2nd half of matrix)
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a21 a21 a21 a21 a31 a31 a31 a31}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* {a20 b00 + a21 b10 a20 b01 + a21 b11 a20 b02 + a21 b12 a20 b03 + a21 b13
* a30 b00 + a31 b10 a30 b01 + a31 b11 a30 b02 + a31 b12 a30 b03 + a31 b13 }
*/
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
/*
* rewind back to top half of the A matrix (3rd column)
*/
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
/*
* load {a02 a02 a02 a02 a12 a12 a12 a12}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* move to next B row
*/
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
/*
* load {b20, b21, b22, b23, b20, b21, b22, b23}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22 a00 b03 + a01 b13 + a02 b23
* a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22 a10 b03 + a11 b13 + a12 b23 }
*/
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
/*
* jump 2 x A rows
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a22 a22 a22 a22 a32 a32 a32 a32}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* {a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 a20 b02 + a21 b12 + a22 b22 a20 b03 + a21 b13 + a22 b23
* a30 b00 + a31 b10 + a32 b20 a30 b01 + a31 b11 + a32 b21 a30 b02 + a31 b12 + a32 b22 a30 b03 + a31 b13 + a32 b23 }
*/
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
/*
* rewind back to top half of the A matrix (4th column)
*/
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
/*
* load {a03 a03 a03 a03 a13 a13 a13 a13}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* move to next B row
*/
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
/*
* load {b30, b31, b32, b33, b30, b31, b32, b33}
*/
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
* { a00 b00 +...+ a03 b30, a00 b01 +...+ a03 b31, a00 b02 +...+ a03 b32, a00 b03 +...+ a03 b33
* a10 b00 +...+ a13 b30, a10 b01 +...+ a13 b31, a10 b02 +...+ a13 b32, a10 b03 +...+ a13 b33 }
*/
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
/*
* jump 2 x A rows
*/
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a23 a23 a23 a23 a33 a33 a33 a33}
*/
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
* {a20 b00 +...+ a23 b30, a20 b01 +...+ a23 b31, a20 b02 +...+ a23 b32, a20 b03 +...+ a23 b33
* a30 b00 +...+ a33 b30, a30 b01 +...+ a33 b31, a30 b02 +...+ a33 b32, a30 b03 +...+ a33 b33 }
*/
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
/*
* Store the result in the destination buffer
*/
vst1q(pOut, vecDst0); pOut += 8;
vst1q(pOut, vecDst1);
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
int numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
int numColsB = pSrcB->numCols; /* number of columns of input matrix B */
int numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t blkCnt; /* loop counters */
int i;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
return(ARM_MATH_SIZE_MISMATCH);
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if(numRowsA == 2)
return arm_mat_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 4)
return arm_mat_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
}
/* main loop process 4 rows */
i = numRowsA / 4;
while(i > 0)
{
float16_t *pInA0, *pInA1, *pInA2, *pInA3;
float16_t *pInB0;
float16_t *pOut0, *pOut1, *pOut2, *pOut3;
f16x8_t vecMac0, vecMac1, vecMac2, vecMac3;
f16x8_t vecInB;
/* pointers to 4 consecutive output rows */
pOut0 = pOut;
pOut1 = pOut0 + numColsB;
pOut2 = pOut1 + numColsB;
pOut3 = pOut2 + numColsB;
pInB0 = pInB;
int k = numColsB >> 3;
while(k > 0)
{
/* pointers to 4 consecutive Matrix A rows */
pInA0 = pInA;
pInA1 = pInA0 + numColsA;
pInA2 = pInA1 + numColsA;
pInA3 = pInA2 + numColsA;
vecMac0 = vdupq_n_f16(0.0f16);
vecMac1 = vdupq_n_f16(0.0f16);
vecMac2 = vdupq_n_f16(0.0f16);
vecMac3 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3..., bi,4n+7}
*/
vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (4 x 8 block) in the destination buffer */
vst1q(pOut0, vecMac0); pOut0 += 8;
vst1q(pOut1, vecMac1); pOut1 += 8;
vst1q(pOut2, vecMac2); pOut2 += 8;
vst1q(pOut3, vecMac3); pOut3 += 8;
/*
* rewind
*/
pInB0 -= (numColsB * numColsA) - 8;
k--;
}
int colBLeft = numColsB & 7;
if (colBLeft)
{
pInA0 = pInA;
pInA1 = pInA0 + numColsA;
pInA2 = pInA1 + numColsA;
pInA3 = pInA2 + numColsA;
mve_pred16_t p0 = vctp16q(colBLeft);
vecMac0 = vdupq_n_f16(0.0f16);
vecMac1 = vdupq_n_f16(0.0f16);
vecMac2 = vdupq_n_f16(0.0f16);
vecMac3 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, ..bi,4n+colBLeft-1, 0, ..}
*/
vecInB = vldrhq_z_f16(pInB0, p0);
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (4 x colBLeft block) in the destination buffer */
vstrhq_p_f16(pOut0, vecMac0, p0);
vstrhq_p_f16(pOut1, vecMac1, p0);
vstrhq_p_f16(pOut2, vecMac2, p0);
vstrhq_p_f16(pOut3, vecMac3, p0);
}
pInA += 4 * numColsA;
pOut += 4 * numColsB;
i--;
}
/*
* non multiple of 4 rows for Matrix A
* process single row
*/
if (numRowsA & 3)
{
i = numRowsA & 3;
do
{
float16_t *pInA0;
float16_t *pInB0;
float16_t *pOut0;
f16x8_t vecInB;
f16x8_t vecMac0;
pOut0 = pOut;
pInB0 = pInB;
int k = numColsB >> 3;
while(k > 0)
{
pInA0 = pInA;
vecMac0 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3, ...bi,4n+7}
*/
vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (1 x 8 block) in the destination buffer */
vst1q(pOut0, vecMac0); pOut0 += 8;
/*
* rewind
*/
pInB0 -= (numColsB * numColsA) - 8;
k--;
}
int colBLeft = numColsB & 7;
if (colBLeft)
{
pInA0 = pInA;
mve_pred16_t p0 = vctp16q(colBLeft);
vecMac0 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, ..., bi,4n+colBLeft, 0, ...}
*/
vecInB = vldrhq_z_f16(pInB0, p0);
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (1 x colBLeft block) in the destination buffer */
vstrhq_p_f16(pOut0, vecMac0, p0);
}
pInA += 1 * numColsA;
pOut += 1 * numColsB;
}
while (--i);
}
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
}
#else
arm_status arm_mat_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float16_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float16_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
_Float16 sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f16;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (_Float16)*pIn1++ * (_Float16)*pIn2;
pIn2 += numColsB;
sum += (_Float16)*pIn1++ * (_Float16)*pIn2;
pIn2 += numColsB;
sum += (_Float16)*pIn1++ * (_Float16)*pIn2;
pIn2 += numColsB;
sum += (_Float16)*pIn1++ * (_Float16)*pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (_Float16)*pIn1++ * (_Float16)*pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sum;
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,202 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f64.c
* Description: Floating-point matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
*/
/**
* @defgroup MatrixMult Matrix Multiplication
*
* Multiplies two matrices.
*
* \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
* Matrix multiplication is only defined if the number of columns of the
* first matrix equals the number of rows of the second matrix.
* Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
* in an <code>M x P</code> matrix.
* When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
* <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
* matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_f64(
const arm_matrix_instance_f64 * pSrcA,
const arm_matrix_instance_f64 * pSrcB,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float64_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float64_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float64_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float64_t *pOut = pDst->pData; /* Output data matrix pointer */
float64_t *px; /* Temporary output data matrix pointer */
float64_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint64_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sum;
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
* @} end of MatrixMult group
*/

View File

@@ -0,0 +1,483 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_fast_q15.c
* Description: Q15 matrix multiplication (fast variant)
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q15 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.15 x 1.15 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.15 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 16 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q31_t sum; /* Accumulator */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix B */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#if defined (ARM_MATH_DSP)
/* Read two elements from row */
in = read_q15x2_ia (&pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
in = read_q15x2_ia (&pInB);
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#else /* #if defined (ARM_MATH_DSP) */
/* Read one element from row */
in = *pInB++;
/* Store one element in destination */
*px = in;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pInB++;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
#if defined (ARM_MATH_DSP)
/* Process two rows from matrix A at a time and output two rows at a time */
row = row >> 1U;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
#if defined (ARM_MATH_DSP)
/* Process two (transposed) columns from matrix B at a time */
col = col >> 1U;
j = 0;
#endif
/* column loop */
while (col > 0U)
{
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
#if defined (ARM_MATH_DSP)
sum2 = 0;
sum3 = 0;
sum4 = 0;
pInB = pSrcBT + j;
pInA2 = pInA + numColsA;
pInB2 = pInB + numRowsB;
/* Read in two elements at once - allows dual MAC instruction */
colCnt = numColsA >> 1U;
#else
colCnt = numColsA >> 2U;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inA2 = read_q15x2_ia (&pInA2);
inB2 = read_q15x2_ia (&pInB2);
/* Multiply and Accumulates */
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
#else
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumulates */
sum += inA1 * inB1;
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
inA1 = *pInA++;
inB1 = *pInB++;
sum += inA1 * inB1;
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
#if defined (ARM_MATH_DSP)
if (numColsA & 1U) {
inA1 = *pInA++;
inB1 = *pInB++;
inA2 = *pInA2++;
inB2 = *pInB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
}
#else
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += (q31_t) *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
#if defined (ARM_MATH_DSP)
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcBT + numRowsB * (numColsB-1);
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = read_q15x2_ia (&pInA);
inA2 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inB2 = read_q15x2_ia (&pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store result in destination buffer */
*px = (q15_t) (sum >> 15);
px += numColsB;
/* Decrement row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData + (numColsB) * (numRowsA-1);
pInB = pSrcBT;
col = numColsB;
i = 0U;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = read_q15x2_ia (&pInA);
inA2 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inB2 = read_q15x2_ia (&pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA % 4U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Decrement column loop counter */
col--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,374 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_fast_q31.c
* Description: Q31 matrix multiplication (fast variant)
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.31 x 1.31 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.31 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 32 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2;
q31_t sum1, sum2, sum3, sum4; /* Accumulator */
q31_t inA1, inA2, inB1, inB2;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
px = pDst->pData;
row = row >> 1U;
px2 = px + numColsB;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pInB = pSrcB->pData;
j = 0U;
col = col >> 1U;
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sum1 = 0;
sum2 = 0;
sum3 = 0;
sum4 = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInA2 = pInA + numColsA;
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement loop counter */
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px++ = sum1 << 1;
*px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
j += 2;
/* Decrement column loop counter */
col--;
}
i = i + (numColsA << 1U);
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~1U);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px = sum1 << 1;
px += numColsB;
/* Decrement row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData + (numColsB) * (numRowsA-1);
col = numColsB;
i = 0U;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
pInB = pSrcB->pData + i;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum1 << 1;
i++;
/* Decrement col loop counter */
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,784 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_opt_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 3 Nov 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
@remark
This function is a faster implementation of arm_mat_mult_q31 for MVE but it is requiring
additional storage for intermediate results.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_2x2_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM2;
q63_t acc0, acc1;
q31x4_t vecB, vecA0, vecA1;
/* enable predication to disable half of vector elements */
mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM2;
pInB = pSrcB->pData;
/* load 1st B column (partial load) */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
/* load A rows */
vecA0 = vldrwq_s32(pInA0);
vecA1 = vldrwq_s32(pInA1);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_3x3_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM3;
q31_t *pInA2 = pInA1 + MATRIX_DIM3;
q63_t acc0, acc1, acc2;
q31x4_t vecB, vecA;
/* enable predication to disable last (4th) vector element */
mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_4x4_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM4;
q31_t *pInA2 = pInA1 + MATRIX_DIM4;
q31_t *pInA3 = pInA2 + MATRIX_DIM4;
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecA;
vecColBOffs = vidupq_u32((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_opt_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst,
q31_t *pState)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA2;
q31_t *pInB2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q31_t *pSrcBT = pState; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
arm_matrix_instance_q31 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
} else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
pDst->pData[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_opt_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_opt_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_opt_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
* Matrix transpose
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q31(pSrcB, &BT);
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
/*
* main loop
* compute 2 x 2 output blocks
* with dot products (Matrix A rows * Transposed MAtrix B rows)
*/
while (row > 0u) {
/*
* For every row wise process, the column loop counter is to be initiated
* Compute 2 columns and 2 rows in parrallel
*/
col = numColsB >> 1;
j = 0;
/*
* column pair loop
*/
while (col > 0u) {
q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q31x4_t vecA, vecA2, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointers
* - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q31_t const *) pInA;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcBVec = (q31_t const *) pInB;
pSrcB2Vec = (q31_t const *) pInB2;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
/* load scheduling */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
}
/* Convert to 1.31 */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
/* Store the results (2 x 2 block) in the destination buffer */
*px++ = (q31_t) acc0;
*px++ = (q31_t) acc2;
*px2++ = (q31_t) acc1;
*px2++ = (q31_t) acc3;
j += numRowsB * 2;
/*
* Decrement the column pair loop counter
*/
col--;
}
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
* Decrement the row pair loop counter
*/
row--;
}
/*
* Compute remaining row and/or column below
*/
if (numColsB & 1u) {
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
* row loop
*/
while (row > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
/* single dot-product */
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
acc0 = asrl(acc0, 23);
*px = (q31_t) acc0;
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
}
}
if (numRowsA & 1u) {
col = numColsB;
i = 0u;
/*
* point to last row in output matrix
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
* col loop
*/
while (col > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Set the variable sum, that acts as accumulator, to zero
*/
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
acc0 = asrl(acc0, 23);
*px++ = (q31_t) acc0;
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/*
* Return to application
*/
return (status);
}
#else
arm_status arm_mat_mult_opt_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst,
q31_t *pState)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
(void)pState;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,843 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q15.c
* Description: Q15 matrix multiplication
*
* $Date: 3 Nov 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q15 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow.
The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
and then saturated to 1.15 format.
@par
Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_q15_2x2_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + MATRIX_DIM2;
q63_t acc0, acc1;
q15x8_t vecB, vecA0, vecA1;
mve_pred16_t p0 = vctp16q(MATRIX_DIM2);
vecColBOffs = vidupq_u16((uint32_t)0, 2); /* MATRIX_DIM2 */
pInB = pSrcB->pData;
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q15_3x3_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + MATRIX_DIM3;
q15_t *pInA2 = pInA1 + MATRIX_DIM3;
q63_t acc0, acc1, acc2;
q15x8_t vecB, vecA0, vecA1, vecA2;
mve_pred16_t p0 = vctp16q(MATRIX_DIM3);
vecColBOffs = vidupq_u16((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
vecA2 = vldrhq_s16(pInA2);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q15_4x4_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + MATRIX_DIM4;
q15_t *pInA2 = pInA1 + MATRIX_DIM4;
q15_t *pInA3 = pInA2 + MATRIX_DIM4;
q63_t acc0, acc1, acc2, acc3;
q15x8_t vecB, vecA0, vecA1, vecA2, vecA3;
mve_pred16_t p0 = vctp16q(MATRIX_DIM4);
vecColBOffs = vidupq_u16((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
vecA2 = vldrhq_s16(pInA2);
vecA3 = vldrhq_s16(pInA3);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA2;
q15_t *pInB2;
q15_t *px; /* Temporary output data matrix pointer */
q15_t *px2; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
arm_matrix_instance_q15 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/* small squared matrix specialized routines */
if (numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1) {
q63_t sum;
sum = pInA[0] * pInB[0];
pDst->pData[0] = (q15_t) __SSAT((sum >> 15), 16);
return (ARM_MATH_SUCCESS);
} else if (numRowsA == 2)
return arm_mat_mult_q15_2x2_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 3)
return arm_mat_mult_q15_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q15_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
* Matrix transpose
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q15(pSrcB, &BT);
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
while (row > 0u) {
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB >> 1;
/*
* For every row wise process, the pIn2 pointer is set
* to the starting address of the transposed pSrcB data
*/
pInB = pSrcBT;
pInB2 = pInB + numRowsB;
j = 0;
/*
* column loop
*/
while (col > 0u) {
q15_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q15x8_t vecA, vecA2, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q15_t const *) pInA;
pSrcA2Vec = (q15_t const *) pInA2;
pSrcBVec = (q15_t const *) pInB;
pSrcB2Vec = (q15_t const *) pInB2;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
blkCnt = numColsA / 8;
while (blkCnt > 0U) {
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 8;
acc1 = vmlaldavaq(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 8;
acc2 = vmlaldavaq(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
acc3 = vmlaldavaq(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail
*/
blkCnt = numColsA & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vmlaldavaq_p(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vmlaldavaq_p(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vmlaldavaq_p(acc3, vecA2, vecB2, p0);
}
*px++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
*px2++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
*px2++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
j += numRowsB * 2;
/*
* Decrement the column loop counter
*/
col--;
}
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
* Decrement the row loop counter
*/
row--;
}
/*
* Compute remaining row and/or column below
*/
if (numColsB & 1u) {
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
* row loop
*/
while (row > 0) {
q15_t const *pSrcAVec, *pSrcBVec;
q15x8_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q15_t const *) pInA;
pSrcBVec = (q15_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA) / 8;
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA & 7);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
}
*px = (q15_t) MVE_ASRL_SAT16(acc0, 15);
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
}
}
if (numRowsA & 1u) {
col = numColsB;
i = 0u;
/*
* point to last row in output matrix
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
* col loop
*/
while (col > 0) {
q15_t const *pSrcAVec, *pSrcBVec;
q15x8_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Set the variable sum, that acts as accumulator, to zero
*/
pSrcAVec = (q15_t const *) pInA;
pSrcBVec = (q15_t const *) pInB;
acc0 = 0LL;
blkCnt = ((numColsA) / 8);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA & 7);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
}
*px++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q63_t sum; /* Accumulator */
#if defined (ARM_MATH_DSP) /* != CM0 */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix B */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
q31_t inA1, inB1, inA2, inB2;
arm_matrix_instance_q15 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q15(pSrcB,&BT);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inA2 = read_q15x2_ia (&pInA);
inB2 = read_q15x2_ia (&pInB);
/* Multiply and Accumulates */
sum = __SMLALD(inA1, inB1, sum);
sum = __SMLALD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
/* process remaining column samples */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
/* Saturate and store result in destination buffer */
*px = (q15_t) (__SSAT((sum >> 15), 16));
px++;
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
#else /* #if defined (ARM_MATH_DSP) */
q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
(void)pState;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pIn1 to point to starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform multiply-accumulates */
sum += (q31_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) __SSAT((sum >> 15), 16);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pSrcA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,761 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_q31_2x2_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM2;
q63_t acc0, acc1;
q31x4_t vecB, vecA0, vecA1;
/* enable predication to disable half of vector elements */
mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM2;
pInB = pSrcB->pData;
/* load 1st B column (partial load) */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
/* load A rows */
vecA0 = vldrwq_s32(pInA0);
vecA1 = vldrwq_s32(pInA1);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q31_3x3_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM3;
q31_t *pInA2 = pInA1 + MATRIX_DIM3;
q63_t acc0, acc1, acc2;
q31x4_t vecB, vecA;
/* enable predication to disable last (4th) vector element */
mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM4;
q31_t *pInA2 = pInA1 + MATRIX_DIM4;
q31_t *pInA3 = pInA2 + MATRIX_DIM4;
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecA;
vecColBOffs = vidupq_u32((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint32x4_t vecOffs, vecColBOffs;
uint32_t blkCnt, rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
pOut[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * (uint32_t) (numColsB);
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 2;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + 4 * numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
q31_t const *pInA0 = pInA;
q31_t const *pInA1 = pInA0 + numColsA;
q31_t const *pInA2 = pInA1 + numColsA;
q31_t const *pInA3 = pInA2 + numColsA;
q63_t acc0, acc1, acc2, acc3;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
pSrcA1Vec = (q31_t const *) pInA1;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcA3Vec = (q31_t const *) pInA3;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
}
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
px[0] = (q31_t) acc0;
px[1 * numColsB] = (q31_t) acc1;
px[2 * numColsB] = (q31_t) acc2;
px[3 * numColsB] = (q31_t) acc3;
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 4);
/*
* Decrement the row loop counter
*/
rowCnt --;
}
rowCnt = row & 3;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
q31_t const *pSrcA0Vec;
q31_t const *pInA0 = pInA;
q63_t acc0;
acc0 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
}
acc0 = asrl(acc0, 23);
px[0] = (q31_t) acc0;
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += numColsA;
/*
* Decrement the row loop counter
*/
rowCnt--;
}
/*
* set status as ARM_MATH_SUCCESS
*/
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,678 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q7.c
* Description: Q15 matrix multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
* @brief Q7 matrix multiplication
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @param[in] *pState points to the array for storing intermediate results (Unused in some versions)
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
*
*
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst)
{
const uint32_t MATRIX_DIM = 2;
q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint8x16_t vecColBOffs;
q7_t *pInA0 = pInA;
q7_t *pInA1 = pInA0 + MATRIX_DIM;
q31_t acc0, acc1;
q7x16_t vecB, vecA0, vecA1;
mve_pred16_t p0 = vctp8q(MATRIX_DIM);
vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
pInB = pSrcB->pData;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
vecA0 = vldrbq_s8(pInA0);
vecA1 = vldrbq_s8(pInA1);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst)
{
const uint8_t MATRIX_DIM = 3;
q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint8x16_t vecColBOffs;
q7_t *pInA0 = pInA;
q7_t *pInA1 = pInA0 + MATRIX_DIM;
q7_t *pInA2 = pInA1 + MATRIX_DIM;
q31_t acc0, acc1, acc2;
q7x16_t vecB, vecA0, vecA1, vecA2;
mve_pred16_t p0 = vctp8q(MATRIX_DIM);
vecColBOffs = vidupq_u8((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM;
pInB = pSrcB->pData;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
vecA0 = vldrbq_s8(pInA0);
vecA1 = vldrbq_s8(pInA1);
vecA2 = vldrbq_s8(pInA2);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst)
{
const uint32_t MATRIX_DIM = 4;
q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint8x16_t vecColBOffs;
q7_t *pInA0 = pInA;
q7_t *pInA1 = pInA0 + MATRIX_DIM;
q7_t *pInA2 = pInA1 + MATRIX_DIM;
q7_t *pInA3 = pInA2 + MATRIX_DIM;
q31_t acc0, acc1, acc2, acc3;
q7x16_t vecB, vecA0, vecA1, vecA2, vecA3;
mve_pred16_t p0 = vctp8q(MATRIX_DIM);
vecColBOffs = vidupq_u8((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
vecA0 = vldrbq_s8(pInA0);
vecA1 = vldrbq_s8(pInA1);
vecA2 = vldrbq_s8(pInA2);
vecA3 = vldrbq_s8(pInA3);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q7(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst,
q7_t * pState)
{
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
q7_t *pInA2;
q7_t *pInB2;
q7_t *px; /* Temporary output data matrix pointer */
q7_t *px2; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q7_t *pSrcBT = pState; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
arm_matrix_instance_q7 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if(numRowsA == 2)
return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
* Matrix transpose
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q7(pSrcB, &BT);
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
while (row > 0u)
{
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB >> 1;
/*
* For every row wise process, the pIn2 pointer is set
* to the starting address of the transposed pSrcB data
*/
pInB = pSrcBT;
pInB2 = pInB + numRowsB;
j = 0;
/*
* column loop
*/
while (col > 0u)
{
q7_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q7x16_t vecA, vecA2, vecB, vecB2;
q31_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q7_t const *) pInA;
pSrcA2Vec = (q7_t const *)pInA2;
pSrcBVec = (q7_t const *) pInB;
pSrcB2Vec = (q7_t const *)pInB2;
acc0 = 0L;
acc1 = 0L;
acc2 = 0L;
acc3 = 0L;
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
blkCnt = numColsA >> 4;
while (blkCnt > 0U)
{
vecB = vld1q(pSrcBVec);
pSrcBVec += 16;
acc0 = vmladavaq_s8(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 16;
acc1 = vmladavaq_s8(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 16;
acc2 = vmladavaq_s8(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
}
*px++ = (q7_t) __SSAT(acc0 >> 7, 8);
*px++ = (q7_t) __SSAT(acc2 >> 7, 8);
*px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
*px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
j += numRowsB * 2;
/*
* Decrement the column loop counter
*/
col--;
}
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
* Decrement the row loop counter
*/
row--;
}
/*
* Compute remaining row and/or column below
*/
if (numColsB & 1u)
{
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
* row loop
*/
while (row > 0)
{
q7_t const *pSrcAVec, *pSrcBVec;
q7x16_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q7_t const *) pInA;
pSrcBVec = (q7_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA) >> 4;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
vecB = vld1q(pSrcBVec);
pSrcBVec += 16;
acc0 = vmladavaq_s8(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
}
*px = (q7_t) __SSAT(acc0 >> 7, 8);
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
}
}
if (numRowsA & 1u)
{
col = numColsB;
i = 0u;
/*
* point to last row in output matrix
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
* col loop
*/
while (col > 0)
{
q7_t const *pSrcAVec, *pSrcBVec;
q7x16_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Set the variable sum, that acts as accumulator, to zero
*/
pSrcAVec = (q7_t const *) pInA;
pSrcBVec = (q7_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA) >> 4;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
vecB = vld1q(pSrcBVec);
pSrcBVec += 16;
acc0 = vmladavaq_s8(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
}
*px++ = (q7_t) __SSAT(acc0 >> 7, 8);
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
}
/*
* Return to application
*/
status = ARM_MATH_SUCCESS;
}
return(status);
}
#else
arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
{
q31_t sum; /* accumulator */
q7_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q7_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
q7_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
(void)pState;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do {
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do {
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U) {
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
sum += (q31_t)*pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
/* Saturate and store the result in the destination buffer */
*px++ = (q7_t)__SSAT((sum >> 7), 8);
/* Decrement the column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update the pointer pSrcA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,208 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_f16.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_f16(
const arm_matrix_instance_f16 * pSrc,
float16_t scale,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
f16x8_t vecIn, vecOut, vecScale;
float16_t const *pInVec;
pInVec = (float16_t const *) pIn;
vecScale = vdupq_n_f16(scale);
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec);
pInVec += 8;
vecOut = vmulq_f16(vecIn, vecScale);
vst1q(pOut, vecOut);
pOut += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vld1q(pInVec);
vecOut = vecIn * scale;
vstrhq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_f16(
const arm_matrix_instance_f16 * pSrc,
float16_t scale,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* Input data matrix pointer */
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counters */
arm_status status; /* Status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
*pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
*pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
*pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixScale group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,287 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_f32.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixScale Matrix Scale
Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
matrix by the scalar. For example:
\image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
The function checks to make sure that the input and output matrices are of the same size.
In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
f32x4_t vecIn, vecOut;
float32_t const *pInVec;
pInVec = (float32_t const *) pIn;
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec);
pInVec += 4;
vecOut = vecIn * scale;
vst1q(pOut, vecOut);
pOut += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vld1q(pInVec);
vecOut = vecIn * scale;
vstrwq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scaling and results are stored in the destination buffer. */
vec1 = vld1q_f32(pIn);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pOut, res);
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
/* Decrement the numSamples loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* The results are stored in the destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement the loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* Input data matrix pointer */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counters */
arm_status status; /* Status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixScale group
*/

View File

@@ -0,0 +1,249 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_q15.c
* Description: Multiplies a Q15 matrix by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Q15 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
arm_status status; /* Status of matrix scaling */
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
q15x8_t vecIn, vecOut;
q15_t const *pInVec;
int32_t totShift = shift + 1; /* shift to apply after scaling */
pInVec = (q15_t const *) pIn;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec); pInVec += 8;
/* multiply input with scaler value */
vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
/* apply shifting */
vecOut = vqshlq_r(vecOut, totShift);
vst1q(pOut, vecOut); pOut += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vld1q(pInVec); pInVec += 8;
vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
vecOut = vqshlq_r(vecOut, totShift);
vstrhq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* Input data matrix pointer */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = 15 - shift; /* Total shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia (&pIn);
inA2 = read_q15x2_ia (&pIn);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
/* apply shifting */
out1 = out1 >> kShift;
out2 = out2 >> kShift;
out3 = out3 >> kShift;
out4 = out4 >> kShift;
/* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
/* store result to destination */
write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
#else
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixScale group
*/

View File

@@ -0,0 +1,242 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_q31.c
* Description: Multiplies a Q31 matrix by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Q31 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
q31x4_t vecIn, vecOut;
q31_t const *pInVec;
int32_t totShift = shift + 1; /* shift to apply after scaling */
arm_status status; /* Status of matrix scaling */
pInVec = (q31_t const *) pIn;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec);
pInVec += 4;
/* multiply input with scaler value */
vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
/* apply shifting */
vecOut = vqshlq_r(vecOut, totShift);
vst1q(pOut, vecOut);
pOut += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vld1q(pInVec);
pInVec += 4;
vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
vecOut = vqshlq_r(vecOut, totShift);
vstrwq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* Input data matrix pointer */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = shift + 1; /* Shift to apply after scaling */
q31_t in, out; /* Temporary variabels */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
in = *pIn++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the results. */
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out; /* Store result destination */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixScale group
*/

View File

@@ -0,0 +1,234 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_lower_triangular_f16.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve LT . X = A where LT is a lower triangular matrix
* @param[in] lt The lower triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_lower_triangular_f16(
const arm_matrix_instance_f16 * lt,
const arm_matrix_instance_f16 * a,
arm_matrix_instance_f16 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float16_t *pX = dst->pData;
float16_t *pLT = lt->pData;
float16_t *pA = a->pData;
float16_t *lt_row;
float16_t *a_col;
_Float16 invLT;
f16x8_t vecA;
f16x8_t vecX;
for(i=0; i < n ; i++)
{
for(j=0; j+7 < cols; j += 8)
{
vecA = vld1q_f16(&pA[i * cols + j]);
for(k=0; k < i; k++)
{
vecX = vld1q_f16(&pX[cols*k+j]);
vecA = vfmsq(vecA,vdupq_n_f16(pLT[n*i + k]),vecX);
}
if ((_Float16)pLT[n*i + i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
invLT = 1.0f16 / (_Float16)pLT[n*i + i];
vecA = vmulq(vecA,vdupq_n_f16(invLT));
vst1q(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
lt_row = &pLT[n*i];
_Float16 tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= (_Float16)lt_row[k] * (_Float16)pX[cols*k+j];
}
if ((_Float16)lt_row[i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / (_Float16)lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_solve_lower_triangular_f16(
const arm_matrix_instance_f16 * lt,
const arm_matrix_instance_f16 * a,
arm_matrix_instance_f16 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float16_t *pX = dst->pData;
float16_t *pLT = lt->pData;
float16_t *pA = a->pData;
float16_t *lt_row;
float16_t *a_col;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=0; i < n ; i++)
{
lt_row = &pLT[n*i];
float16_t tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= (_Float16)lt_row[k] * (_Float16)pX[cols*k+j];
}
if ((_Float16)lt_row[i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
tmp = (_Float16)tmp / (_Float16)lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,333 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_lower_triangular_f32.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve LT . X = A where LT is a lower triangular matrix
* @param[in] lt The lower triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_lower_triangular_f32(
const arm_matrix_instance_f32 * lt,
const arm_matrix_instance_f32 * a,
arm_matrix_instance_f32 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float32_t *pX = dst->pData;
float32_t *pLT = lt->pData;
float32_t *pA = a->pData;
float32_t *lt_row;
float32_t *a_col;
float32_t invLT;
f32x4_t vecA;
f32x4_t vecX;
for(i=0; i < n ; i++)
{
for(j=0; j+3 < cols; j += 4)
{
vecA = vld1q_f32(&pA[i * cols + j]);
for(k=0; k < i; k++)
{
vecX = vld1q_f32(&pX[cols*k+j]);
vecA = vfmsq(vecA,vdupq_n_f32(pLT[n*i + k]),vecX);
}
if (pLT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invLT = 1.0f / pLT[n*i + i];
vecA = vmulq(vecA,vdupq_n_f32(invLT));
vst1q(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
lt_row = &pLT[n*i];
float32_t tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_solve_lower_triangular_f32(
const arm_matrix_instance_f32 * lt,
const arm_matrix_instance_f32 * a,
arm_matrix_instance_f32 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float32_t *pX = dst->pData;
float32_t *pLT = lt->pData;
float32_t *pA = a->pData;
float32_t *lt_row;
float32_t *a_col;
float32_t invLT;
f32x4_t vecA;
f32x4_t vecX;
for(i=0; i < n ; i++)
{
for(j=0; j+3 < cols; j += 4)
{
vecA = vld1q_f32(&pA[i * cols + j]);
for(k=0; k < i; k++)
{
vecX = vld1q_f32(&pX[cols*k+j]);
vecA = vfmsq_f32(vecA,vdupq_n_f32(pLT[n*i + k]),vecX);
}
if (pLT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invLT = 1.0f / pLT[n*i + i];
vecA = vmulq_f32(vecA,vdupq_n_f32(invLT));
vst1q_f32(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
lt_row = &pLT[n*i];
float32_t tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_solve_lower_triangular_f32(
const arm_matrix_instance_f32 * lt,
const arm_matrix_instance_f32 * a,
arm_matrix_instance_f32 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
float32_t *pX = dst->pData;
float32_t *pLT = lt->pData;
float32_t *pA = a->pData;
float32_t *lt_row;
float32_t *a_col;
n = dst->numRows;
cols = dst -> numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=0; i < n ; i++)
{
float32_t tmp=a_col[i * cols];
lt_row = &pLT[n*i];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_lower_triangular_f64.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve LT . X = A where LT is a lower triangular matrix
* @param[in] lt The lower triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
float64_t *pX = dst->pData;
float64_t *pLT = lt->pData;
float64_t *pA = a->pData;
float64_t *lt_row;
float64_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=0; i < n ; i++)
{
float64_t tmp=a_col[i * cols];
lt_row = &pLT[n*i];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,226 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_upper_triangular_f16.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve UT . X = A where UT is an upper triangular matrix
* @param[in] ut The upper triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_upper_triangular_f16(
const arm_matrix_instance_f16 * ut,
const arm_matrix_instance_f16 * a,
arm_matrix_instance_f16 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float16_t *pX = dst->pData;
float16_t *pUT = ut->pData;
float16_t *pA = a->pData;
float16_t *ut_row;
float16_t *a_col;
_Float16 invUT;
f16x8_t vecA;
f16x8_t vecX;
for(i=n-1; i >= 0 ; i--)
{
for(j=0; j+7 < cols; j +=8)
{
vecA = vld1q_f16(&pA[i * cols + j]);
for(k=n-1; k > i; k--)
{
vecX = vld1q_f16(&pX[cols*k+j]);
vecA = vfmsq(vecA,vdupq_n_f16(pUT[n*i + k]),vecX);
}
if ((_Float16)pUT[n*i + i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
invUT = 1.0f16 / (_Float16)pUT[n*i + i];
vecA = vmulq(vecA,vdupq_n_f16(invUT));
vst1q(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
ut_row = &pUT[n*i];
_Float16 tmp=a_col[i * cols];
for(k=n-1; k > i; k--)
{
tmp -= (_Float16)ut_row[k] * (_Float16)pX[cols*k+j];
}
if ((_Float16)ut_row[i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / (_Float16)ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_solve_upper_triangular_f16(
const arm_matrix_instance_f16 * ut,
const arm_matrix_instance_f16 * a,
arm_matrix_instance_f16 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float16_t *pX = dst->pData;
float16_t *pUT = ut->pData;
float16_t *pA = a->pData;
float16_t *ut_row;
float16_t *a_col;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=n-1; i >= 0 ; i--)
{
ut_row = &pUT[n*i];
float16_t tmp=a_col[i * cols];
for(k=n-1; k > i; k--)
{
tmp -= (_Float16)ut_row[k] * (_Float16)pX[cols*k+j];
}
if ((_Float16)ut_row[i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
tmp = (_Float16)tmp / (_Float16)ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,319 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_upper_triangular_f32.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve UT . X = A where UT is an upper triangular matrix
* @param[in] ut The upper triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_upper_triangular_f32(
const arm_matrix_instance_f32 * ut,
const arm_matrix_instance_f32 * a,
arm_matrix_instance_f32 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float32_t *pX = dst->pData;
float32_t *pUT = ut->pData;
float32_t *pA = a->pData;
float32_t *ut_row;
float32_t *a_col;
float32_t invUT;
f32x4_t vecA;
f32x4_t vecX;
for(i=n-1; i >= 0 ; i--)
{
for(j=0; j+3 < cols; j +=4)
{
vecA = vld1q_f32(&pA[i * cols + j]);
for(k=n-1; k > i; k--)
{
vecX = vld1q_f32(&pX[cols*k+j]);
vecA = vfmsq(vecA,vdupq_n_f32(pUT[n*i + k]),vecX);
}
if (pUT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invUT = 1.0f / pUT[n*i + i];
vecA = vmulq(vecA,vdupq_n_f32(invUT));
vst1q(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
ut_row = &pUT[n*i];
float32_t tmp=a_col[i * cols];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_solve_upper_triangular_f32(
const arm_matrix_instance_f32 * ut,
const arm_matrix_instance_f32 * a,
arm_matrix_instance_f32 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float32_t *pX = dst->pData;
float32_t *pUT = ut->pData;
float32_t *pA = a->pData;
float32_t *ut_row;
float32_t *a_col;
float32_t invUT;
f32x4_t vecA;
f32x4_t vecX;
for(i=n-1; i >= 0 ; i--)
{
for(j=0; j+3 < cols; j +=4)
{
vecA = vld1q_f32(&pA[i * cols + j]);
for(k=n-1; k > i; k--)
{
vecX = vld1q_f32(&pX[cols*k+j]);
vecA = vfmsq_f32(vecA,vdupq_n_f32(pUT[n*i + k]),vecX);
}
if (pUT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invUT = 1.0f / pUT[n*i + i];
vecA = vmulq_f32(vecA,vdupq_n_f32(invUT));
vst1q_f32(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
ut_row = &pUT[n*i];
float32_t tmp=a_col[i * cols];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_solve_upper_triangular_f32(
const arm_matrix_instance_f32 * ut,
const arm_matrix_instance_f32 * a,
arm_matrix_instance_f32 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
float32_t *pX = dst->pData;
float32_t *pUT = ut->pData;
float32_t *pA = a->pData;
float32_t *ut_row;
float32_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=n-1; i >= 0 ; i--)
{
float32_t tmp=a_col[i * cols];
ut_row = &pUT[n*i];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,120 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_upper_triangular_f64.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve UT . X = A where UT is an upper triangular matrix
* @param[in] ut The upper triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
float64_t *pX = dst->pData;
float64_t *pUT = ut->pData;
float64_t *pA = a->pData;
float64_t *ut_row;
float64_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=n-1; i >= 0 ; i--)
{
float64_t tmp=a_col[i * cols];
ut_row = &pUT[n*i];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,215 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f16.c
* Description: Floating-point matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of matrix subtraction */
uint32_t numSamples; /* total number of elements in the matrix */
float16_t *pDataA, *pDataB, *pDataDst;
f16x8_t vecA, vecB, vecDst;
float16_t const *pSrcAVec;
float16_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float16_t const *) pDataA;
pSrcBVec = (float16_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
vecDst = vsubq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vsubq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
*pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
*pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
*pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixSub group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,298 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f32.c
* Description: Floating-point matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix subtraction */
uint32_t numSamples; /* total number of elements in the matrix */
float32_t *pDataA, *pDataB, *pDataDst;
f32x4_t vecA, vecB, vecDst;
float32_t const *pSrcAVec;
float32_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float32_t const *) pDataA;
pSrcBVec = (float32_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vsubq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vsubq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
/* Read values from source A */
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (*pIn1++) - (*pIn2++);
/* Decrement the loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixSub group
*/

View File

@@ -0,0 +1,143 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f64.c
* Description: Floating-point matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_sub_f64(
const arm_matrix_instance_f64 * pSrcA,
const arm_matrix_instance_f64 * pSrcB,
arm_matrix_instance_f64 * pDst)
{
float64_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float64_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
uint64_t numSamples; /* total number of elements in the matrix */
uint64_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint64_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixSub group
*/

View File

@@ -0,0 +1,219 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_q15.c
* Description: Q15 Matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Q15 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
uint32_t numSamples; /* total number of elements in the matrix */
q15_t *pDataA, *pDataB, *pDataDst;
q15x8_t vecA, vecB, vecDst;
q15_t const *pSrcAVec;
q15_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q15_t const *) pDataA;
pSrcBVec = (q15_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqsubq(vecA, vecB);
vst1q(pDataDst, vecDst); pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, Saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixSub group
*/

View File

@@ -0,0 +1,218 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_q31.c
* Description: Q31 matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Q31 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
uint32_t numSamples; /* total number of elements in the matrix */
q31_t *pDataA, *pDataB, *pDataDst;
q31x4_t vecA, vecB, vecDst;
q31_t const *pSrcAVec;
q31_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q31_t const *) pDataA;
pSrcBVec = (q31_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqsubq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and then store the results in the destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and store result in destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixSub group
*/

View File

@@ -0,0 +1,202 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_f16.c
* Description: Floating-point matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 1)
{
pDst->pData[0] = pSrc->pData[0];
return(ARM_MATH_SUCCESS);
}
if (pDst->numCols == 2)
return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
}
arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixTrans group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,325 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_f32.c
* Description: Floating-point matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixTrans Matrix Transpose
Tranposes a matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 2)
return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
}
arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* Row loop */
rowCnt = row >> 2;
while (rowCnt > 0U)
{
float32x4_t row0V,row1V,row2V,row3V;
float32x4x2_t ra0,ra1,rb0,rb1;
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */
{
row0V = vld1q_f32(pIn);
row1V = vld1q_f32(pIn + 1 * nColumns);
row2V = vld1q_f32(pIn + 2 * nColumns);
row3V = vld1q_f32(pIn + 3 * nColumns);
pIn += 4;
ra0 = vzipq_f32(row0V,row2V);
ra1 = vzipq_f32(row1V,row3V);
rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
vst1q_f32(px,rb0.val[0]);
px += nRows;
vst1q_f32(px,rb0.val[1]);
px += nRows;
vst1q_f32(px,rb1.val[0]);
px += nRows;
vst1q_f32(px,rb1.val[1]);
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
/* Perform matrix transpose for last 3 samples here. */
blkCnt = nColumns % 0x4U;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px++ = *pIn;
*px++ = *(pIn + 1 * nColumns);
*px++ = *(pIn + 2 * nColumns);
*px++ = *(pIn + 3 * nColumns);
px += (nRows - 4);
pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
i += 4;
pIn += 3 * nColumns;
/* Decrement the row loop counter */
rowCnt--;
} /* Row loop end */
rowCnt = row & 3;
while (rowCnt > 0U)
{
blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
i++;
rowCnt -- ;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixTrans group
*/

View File

@@ -0,0 +1,155 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_f64.c
* Description: Floating-point matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixTrans Matrix Transpose
Tranposes a matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint64_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
* @} end of MatrixTrans group
*/

View File

@@ -0,0 +1,233 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q15.c
* Description: Q15 matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q15 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 1)
{
pDst->pData[0] = pSrc->pData[0];
return(ARM_MATH_SUCCESS);
}
if (pDst->numCols == 2)
return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
}
arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in; /* variable to hold temporary output */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer pOut is set to starting address of column being processed */
pOut = pDst->pData + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read two elements from row */
in = read_q15x2_ia (&pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Read two elements from row */
in = read_q15x2_ia (&pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*pOut = *pIn++;
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixTrans group
*/

View File

@@ -0,0 +1,191 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q31.c
* Description: Q31 matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q31 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 2)
return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
}
arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixTrans group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q7.c
* Description: Q7 matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q7 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
{
uint16x8_t vecOffs;
uint32_t i;
uint32_t blkCnt;
uint8_t const *pDataC;
uint8_t *pDataDestR;
uint16x8_t vecIn;
const uint8_t * pDataSrc=(const uint8_t *)pSrc->pData;
uint8_t * pDataDst=(uint8_t *)pDst->pData;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
return ARM_MATH_SIZE_MISMATCH;
}
#endif
vecOffs = vidupq_u16((uint32_t)0, 1);
vecOffs = vecOffs * pSrc->numCols;
i = pSrc->numCols;
do
{
pDataC = (uint8_t const *) pDataSrc;
pDataDestR = (uint8_t*)pDataDst;
blkCnt = pSrc->numRows >> 3;
while (blkCnt > 0U)
{
/* widened loads */
vecIn = vldrbq_gather_offset_u16(pDataC, vecOffs);
vstrbq_u16(pDataDestR, vecIn);
pDataDestR += 8;
pDataC = pDataC + pSrc->numCols * 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = pSrc->numRows & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vldrbq_gather_offset_u16(pDataC, vecOffs);
vstrbq_p_u16(pDataDestR, vecIn, p0);
}
pDataSrc += 1;
pDataDst += pSrc->numRows;
}
while (--i);
return (ARM_MATH_SUCCESS);
}
#else
arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
{
q7_t *pSrcA = pSrc->pData; /* input data matrix pointer */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of nRows */
uint16_t nColumns = pSrc->numCols; /* number of nColumns */
uint16_t col, row = nRows, i = 0U; /* row and column loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) {
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
} else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do {
/* The pointer pOut is set to starting address of the column being processed */
pOut = pDst->pData + i;
/* Initialize column loop counter */
col = nColumns;
while (col > 0U) {
/* Read and store the input element in the destination */
*pOut = *pSrcA++;
/* Update the pointer pOut to point to the next row of the transposed matrix */
pOut += nRows;
/* Decrement the column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixTrans group
*/

View File

@@ -0,0 +1,396 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_vec_mult_f16.c
* Description: Floating-point matrix and vector multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixVectMult
* @{
*/
/**
* @brief Floating-point matrix and vector multiplication.
* @param[in] *pSrcMat points to the input matrix structure
* @param[in] *pVec points to input vector
* @param[out] *pDst points to output vector
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mat_vec_mult_f16(
const arm_matrix_instance_f16 *pSrcMat,
const float16_t *pSrcVec,
float16_t *pDstVec)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const float16_t *pSrcA = pSrcMat->pData;
const float16_t *pInA0;
const float16_t *pInA1;
float16_t *px;
int32_t row;
uint32_t blkCnt; /* loop counters */
row = numRows;
px = pDstVec;
/*
* compute 4 rows in parallel
*/
while (row >= 4)
{
const float16_t *pInA2, *pInA3;
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
f16x8_t vecIn, acc0, acc1, acc2, acc3;
float16_t const *pSrcVecPtr = pSrcVec;
/*
* Initialize the pointers to 4 consecutive MatrixA rows
*/
pInA0 = pSrcA;
pInA1 = pInA0 + numCols;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
/*
* Initialize the vector pointer
*/
pInVec = pSrcVecPtr;
/*
* reset accumulators
*/
acc0 = vdupq_n_f16(0.0f);
acc1 = vdupq_n_f16(0.0f);
acc2 = vdupq_n_f16(0.0f);
acc3 = vdupq_n_f16(0.0f);
pSrcA0Vec = pInA0;
pSrcA1Vec = pInA1;
pSrcA2Vec = pInA2;
pSrcA3Vec = pInA3;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
f16x8_t vecA;
vecIn = vld1q(pInVec);
pInVec += 8;
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 8;
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
pSrcA1Vec += 8;
acc1 = vfmaq(acc1, vecIn, vecA);
vecA = vld1q(pSrcA2Vec);
pSrcA2Vec += 8;
acc2 = vfmaq(acc2, vecIn, vecA);
vecA = vld1q(pSrcA3Vec);
pSrcA3Vec += 8;
acc3 = vfmaq(acc3, vecIn, vecA);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecA;
vecIn = vldrhq_z_f16(pInVec, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
acc1 = vfmaq(acc1, vecIn, vecA);
vecA = vld1q(pSrcA2Vec);
acc2 = vfmaq(acc2, vecIn, vecA);
vecA = vld1q(pSrcA3Vec);
acc3 = vfmaq(acc3, vecIn, vecA);
}
/*
* Sum the partial parts
*/
*px++ = vecAddAcrossF16Mve(acc0);
*px++ = vecAddAcrossF16Mve(acc1);
*px++ = vecAddAcrossF16Mve(acc2);
*px++ = vecAddAcrossF16Mve(acc3);
pSrcA += numCols * 4;
/*
* Decrement the row loop counter
*/
row -= 4;
}
/*
* compute 2 rows in parrallel
*/
if (row >= 2)
{
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
f16x8_t vecIn, acc0, acc1;
float16_t const *pSrcVecPtr = pSrcVec;
/*
* Initialize the pointers to 2 consecutive MatrixA rows
*/
pInA0 = pSrcA;
pInA1 = pInA0 + numCols;
/*
* Initialize the vector pointer
*/
pInVec = pSrcVecPtr;
/*
* reset accumulators
*/
acc0 = vdupq_n_f16(0.0f);
acc1 = vdupq_n_f16(0.0f);
pSrcA0Vec = pInA0;
pSrcA1Vec = pInA1;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
f16x8_t vecA;
vecIn = vld1q(pInVec);
pInVec += 8;
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 8;
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
pSrcA1Vec += 8;
acc1 = vfmaq(acc1, vecIn, vecA);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecA;
vecIn = vldrhq_z_f16(pInVec, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
acc1 = vfmaq(acc1, vecIn, vecA);
}
/*
* Sum the partial parts
*/
*px++ = vecAddAcrossF16Mve(acc0);
*px++ = vecAddAcrossF16Mve(acc1);
pSrcA += numCols * 2;
row -= 2;
}
if (row >= 1)
{
f16x8_t vecIn, acc0;
float16_t const *pSrcA0Vec, *pInVec;
float16_t const *pSrcVecPtr = pSrcVec;
/*
* Initialize the pointers to last MatrixA row
*/
pInA0 = pSrcA;
/*
* Initialize the vector pointer
*/
pInVec = pSrcVecPtr;
/*
* reset accumulators
*/
acc0 = vdupq_n_f16(0.0f);
pSrcA0Vec = pInA0;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
f16x8_t vecA;
vecIn = vld1q(pInVec);
pInVec += 8;
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 8;
acc0 = vfmaq(acc0, vecIn, vecA);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecA;
vecIn = vldrhq_z_f16(pInVec, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vfmaq(acc0, vecIn, vecA);
}
/*
* Sum the partial parts
*/
*px++ = vecAddAcrossF16Mve(acc0);
}
}
#else
void arm_mat_vec_mult_f16(const arm_matrix_instance_f16 *pSrcMat, const float16_t *pVec, float16_t *pDst)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const float16_t *pSrcA = pSrcMat->pData;
const float16_t *pInA1; /* input data matrix pointer A of Q31 type */
const float16_t *pInA2; /* input data matrix pointer A of Q31 type */
const float16_t *pInA3; /* input data matrix pointer A of Q31 type */
const float16_t *pInA4; /* input data matrix pointer A of Q31 type */
const float16_t *pInVec; /* input data matrix pointer B of Q31 type */
float16_t *px; /* Temporary output data matrix pointer */
uint16_t i, row, colCnt; /* loop counters */
float16_t matData, matData2, vecData, vecData2;
/* Process 4 rows at a time */
row = numRows >> 2;
i = 0u;
px = pDst;
/* The following loop performs the dot-product of each row in pSrcA with the vector */
/* row loop */
while (row > 0) {
/* For every row wise process, the pInVec pointer is set
** to the starting address of the vector */
pInVec = pVec;
/* Initialize accumulators */
float16_t sum1 = 0.0f16;
float16_t sum2 = 0.0f16;
float16_t sum3 = 0.0f16;
float16_t sum4 = 0.0f16;
/* Loop unrolling: process 2 columns per iteration */
colCnt = numCols;
/* Initialize pointers to the starting address of the column being processed */
pInA1 = pSrcA + i;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
pInA4 = pInA3 + numCols;
// Main loop: matrix-vector multiplication
while (colCnt > 0u) {
// Read 2 values from vector
vecData = *(pInVec)++;
// Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
matData = *(pInA1)++;
sum1 += (_Float16)matData * (_Float16)vecData;
matData = *(pInA2)++;
sum2 += (_Float16)matData * (_Float16)vecData;
matData = *(pInA3)++;
sum3 += (_Float16)matData * (_Float16)vecData;
matData = *(pInA4)++;
sum4 += (_Float16)matData * (_Float16)vecData;
// Decrement the loop counter
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum1;
*px++ = sum2;
*px++ = sum3;
*px++ = sum4;
i = i + numCols * 4;
/* Decrement the row loop counter */
row--;
}
/* process any remaining rows */
row = numRows & 3u;
while (row > 0) {
float16_t sum = 0.0f16;
pInVec = pVec;
pInA1 = pSrcA + i;
colCnt = numCols >> 1;
while (colCnt > 0) {
vecData = *(pInVec)++;
vecData2 = *(pInVec)++;
matData = *(pInA1)++;
matData2 = *(pInA1)++;
sum += (_Float16)matData * (_Float16)vecData;
sum += (_Float16)matData2 * (_Float16)vecData2;
colCnt--;
}
// process remainder of row
colCnt = numCols & 1u;
while (colCnt > 0) {
sum += (_Float16)*pInA1++ * (_Float16)*pInVec++;
colCnt--;
}
*px++ = sum;
i = i + numCols;
row--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,399 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_vec_mult_f32.c
* Description: Floating-point matrix and vector multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
*/
/**
* @defgroup MatrixVectMult Matrix Vector Multiplication
*
* Multiplies a matrix and a vector.
*
*/
/**
* @addtogroup MatrixVectMult
* @{
*/
/**
* @brief Floating-point matrix and vector multiplication.
* @param[in] *pSrcMat points to the input matrix structure
* @param[in] *pVec points to input vector
* @param[out] *pDst points to output vector
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mat_vec_mult_f32(
const arm_matrix_instance_f32 *pSrcMat,
const float32_t *pSrcVec,
float32_t *pDstVec)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const float32_t *pSrcA = pSrcMat->pData;
const float32_t *pInA0;
const float32_t *pInA1;
float32_t *px;
int32_t row;
uint32_t blkCnt; /* loop counters */
row = numRows;
px = pDstVec;
/*
* compute 4 rows in parallel
*/
while (row >= 4)
{
const float32_t *pInA2, *pInA3;
float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
f32x4_t vecIn, acc0, acc1, acc2, acc3;
float32_t const *pSrcVecPtr = pSrcVec;
/*
* Initialize the pointers to 4 consecutive MatrixA rows
*/
pInA0 = pSrcA;
pInA1 = pInA0 + numCols;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
/*
* Initialize the vector pointer
*/
pInVec = pSrcVecPtr;
/*
* reset accumulators
*/
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
acc2 = vdupq_n_f32(0.0f);
acc3 = vdupq_n_f32(0.0f);
pSrcA0Vec = pInA0;
pSrcA1Vec = pInA1;
pSrcA2Vec = pInA2;
pSrcA3Vec = pInA3;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
f32x4_t vecA;
vecIn = vld1q(pInVec);
pInVec += 4;
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
pSrcA1Vec += 4;
acc1 = vfmaq(acc1, vecIn, vecA);
vecA = vld1q(pSrcA2Vec);
pSrcA2Vec += 4;
acc2 = vfmaq(acc2, vecIn, vecA);
vecA = vld1q(pSrcA3Vec);
pSrcA3Vec += 4;
acc3 = vfmaq(acc3, vecIn, vecA);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
f32x4_t vecA;
vecIn = vldrwq_z_f32(pInVec, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
acc1 = vfmaq(acc1, vecIn, vecA);
vecA = vld1q(pSrcA2Vec);
acc2 = vfmaq(acc2, vecIn, vecA);
vecA = vld1q(pSrcA3Vec);
acc3 = vfmaq(acc3, vecIn, vecA);
}
/*
* Sum the partial parts
*/
*px++ = vecAddAcrossF32Mve(acc0);
*px++ = vecAddAcrossF32Mve(acc1);
*px++ = vecAddAcrossF32Mve(acc2);
*px++ = vecAddAcrossF32Mve(acc3);
pSrcA += numCols * 4;
/*
* Decrement the row loop counter
*/
row -= 4;
}
/*
* compute 2 rows in parrallel
*/
if (row >= 2)
{
float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
f32x4_t vecIn, acc0, acc1;
float32_t const *pSrcVecPtr = pSrcVec;
/*
* Initialize the pointers to 2 consecutive MatrixA rows
*/
pInA0 = pSrcA;
pInA1 = pInA0 + numCols;
/*
* Initialize the vector pointer
*/
pInVec = pSrcVecPtr;
/*
* reset accumulators
*/
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
pSrcA0Vec = pInA0;
pSrcA1Vec = pInA1;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
f32x4_t vecA;
vecIn = vld1q(pInVec);
pInVec += 4;
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
pSrcA1Vec += 4;
acc1 = vfmaq(acc1, vecIn, vecA);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
f32x4_t vecA;
vecIn = vldrwq_z_f32(pInVec, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vfmaq(acc0, vecIn, vecA);
vecA = vld1q(pSrcA1Vec);
acc1 = vfmaq(acc1, vecIn, vecA);
}
/*
* Sum the partial parts
*/
*px++ = vecAddAcrossF32Mve(acc0);
*px++ = vecAddAcrossF32Mve(acc1);
pSrcA += numCols * 2;
row -= 2;
}
if (row >= 1)
{
f32x4_t vecIn, acc0;
float32_t const *pSrcA0Vec, *pInVec;
float32_t const *pSrcVecPtr = pSrcVec;
/*
* Initialize the pointers to last MatrixA row
*/
pInA0 = pSrcA;
/*
* Initialize the vector pointer
*/
pInVec = pSrcVecPtr;
/*
* reset accumulators
*/
acc0 = vdupq_n_f32(0.0f);
pSrcA0Vec = pInA0;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
f32x4_t vecA;
vecIn = vld1q(pInVec);
pInVec += 4;
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vfmaq(acc0, vecIn, vecA);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
f32x4_t vecA;
vecIn = vldrwq_z_f32(pInVec, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vfmaq(acc0, vecIn, vecA);
}
/*
* Sum the partial parts
*/
*px++ = vecAddAcrossF32Mve(acc0);
}
}
#else
void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_t *pVec, float32_t *pDst)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const float32_t *pSrcA = pSrcMat->pData;
const float32_t *pInA1; /* input data matrix pointer A of Q31 type */
const float32_t *pInA2; /* input data matrix pointer A of Q31 type */
const float32_t *pInA3; /* input data matrix pointer A of Q31 type */
const float32_t *pInA4; /* input data matrix pointer A of Q31 type */
const float32_t *pInVec; /* input data matrix pointer B of Q31 type */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t i, row, colCnt; /* loop counters */
float32_t matData, matData2, vecData, vecData2;
/* Process 4 rows at a time */
row = numRows >> 2;
i = 0u;
px = pDst;
/* The following loop performs the dot-product of each row in pSrcA with the vector */
/* row loop */
while (row > 0) {
/* Initialize accumulators */
float32_t sum1 = 0.0f;
float32_t sum2 = 0.0f;
float32_t sum3 = 0.0f;
float32_t sum4 = 0.0f;
/* For every row wise process, the pInVec pointer is set
** to the starting address of the vector */
pInVec = pVec;
/* Loop unrolling: process 2 columns per iteration */
colCnt = numCols;
/* Initialize pointers to the starting address of the column being processed */
pInA1 = pSrcA + i;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
pInA4 = pInA3 + numCols;
// Main loop: matrix-vector multiplication
while (colCnt > 0u) {
// Read 2 values from vector
vecData = *(pInVec)++;
// Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
matData = *(pInA1)++;
sum1 += matData * vecData;
matData = *(pInA2)++;
sum2 += matData * vecData;
matData = *(pInA3)++;
sum3 += matData * vecData;
matData = *(pInA4)++;
sum4 += matData * vecData;
// Decrement the loop counter
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum1;
*px++ = sum2;
*px++ = sum3;
*px++ = sum4;
i = i + numCols * 4;
/* Decrement the row loop counter */
row--;
}
/* process any remaining rows */
row = numRows & 3u;
while (row > 0) {
float32_t sum = 0.0f;
pInVec = pVec;
pInA1 = pSrcA + i;
colCnt = numCols >> 1;
while (colCnt > 0) {
vecData = *(pInVec)++;
vecData2 = *(pInVec)++;
matData = *(pInA1)++;
matData2 = *(pInA1)++;
sum += matData * vecData;
sum += matData2 * vecData2;
colCnt--;
}
// process remainder of row
colCnt = numCols & 1u;
while (colCnt > 0) {
sum += *pInA1++ * *pInVec++;
colCnt--;
}
*px++ = sum;
i = i + numCols;
row--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/

View File

@@ -0,0 +1,388 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_vec_mult_q15.c
* Description: Q15 matrix and vector multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixVectMult
* @{
*/
/**
* @brief Q15 matrix and vector multiplication.
* @param[in] *pSrcMat points to the input matrix structure
* @param[in] *pVec points to input vector
* @param[out] *pDst points to output vector
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mat_vec_mult_q15(
const arm_matrix_instance_q15 * pSrcMat,
const q15_t *pSrcVec,
q15_t *pDstVec)
{
const q15_t *pMatSrc = pSrcMat->pData;
const q15_t *pMat0, *pMat1;
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
q15_t *px;
int32_t row;
uint16_t blkCnt; /* loop counters */
row = numRows;
px = pDstVec;
/*
* compute 3x64-bit accumulators per loop
*/
while (row >= 3)
{
q15_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pVec;
const q15_t *pMat2;
q15_t const *pSrcVecPtr = pSrcVec;
q63_t acc0, acc1, acc2;
q15x8_t vecMatA0, vecMatA1, vecMatA2, vecIn;
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
pMat1 = pMat0 + numCols;
pMat2 = pMat1 + numCols;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
pMat0Vec = pMat0;
pMat1Vec = pMat1;
pMat2Vec = pMat2;
pVec = pSrcVecPtr;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 8;
vecMatA1 = vld1q(pMat1Vec);
pMat1Vec += 8;
vecMatA2 = vld1q(pMat2Vec);
pMat2Vec += 8;
vecIn = vld1q(pVec);
pVec += 8;
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecMatA1 = vld1q(pMat1Vec);
vecMatA2 = vld1q(pMat2Vec);
vecIn = vldrhq_z_s16(pVec, p0);
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
}
*px++ = MVE_ASRL_SAT16(acc0, 15);
*px++ = MVE_ASRL_SAT16(acc1, 15);
*px++ = MVE_ASRL_SAT16(acc2, 15);
pMatSrc += numCols * 3;
/*
* Decrement the row loop counter
*/
row -= 3;
}
/*
* process any remaining rows pair
*/
if (row >= 2)
{
q15_t const *pMat0Vec, *pMat1Vec, *pVec;
q15_t const *pSrcVecPtr = pSrcVec;
q63_t acc0, acc1;
q15x8_t vecMatA0, vecMatA1, vecIn;
/*
* For every row wise process, the pInVec pointer is set
* to the starting address of the vector
*/
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
pMat1 = pMat0 + numCols;
acc0 = 0LL;
acc1 = 0LL;
pMat0Vec = pMat0;
pMat1Vec = pMat1;
pVec = pSrcVecPtr;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 8;
vecMatA1 = vld1q(pMat1Vec);
pMat1Vec += 8;
vecIn = vld1q(pVec);
pVec += 8;
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecMatA1 = vld1q(pMat1Vec);
vecIn = vldrhq_z_s16(pVec, p0);
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
}
*px++ = MVE_ASRL_SAT16(acc0, 15);
*px++ = MVE_ASRL_SAT16(acc1, 15);
pMatSrc += numCols * 2;
/*
* Decrement the row loop counter
*/
row -= 2;
}
if (row >= 1)
{
q15_t const *pMat0Vec, *pVec;
q15_t const *pSrcVecPtr = pSrcVec;
q63_t acc0;
q15x8_t vecMatA0, vecIn;
/*
* For every row wise process, the pInVec pointer is set
* to the starting address of the vector
*/
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
acc0 = 0LL;
pMat0Vec = pMat0;
pVec = pSrcVecPtr;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 8;
vecIn = vld1q(pVec);
pVec += 8;
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecIn = vldrhq_z_s16(pVec, p0);
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
}
*px++ = MVE_ASRL_SAT16(acc0, 15);
}
}
#else
void arm_mat_vec_mult_q15(const arm_matrix_instance_q15 *pSrcMat, const q15_t *pVec, q15_t *pDst)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const q15_t *pSrcA = pSrcMat->pData;
const q15_t *pInA1; /* input data matrix pointer A of Q15 type */
const q15_t *pInA2; /* input data matrix pointer A of Q15 type */
const q15_t *pInA3; /* input data matrix pointer A of Q15 type */
const q15_t *pInA4; /* input data matrix pointer A of Q15 type */
const q15_t *pInVec; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t i, row, colCnt; /* loop counters */
q31_t matData, matData2, vecData, vecData2;
/* Process 4 rows at a time */
row = numRows >> 2;
i = 0u;
px = pDst;
/* The following loop performs the dot-product of each row in pSrcA with the vector */
/* row loop */
while (row > 0) {
/* Initialize accumulators */
q63_t sum1 = 0;
q63_t sum2 = 0;
q63_t sum3 = 0;
q63_t sum4 = 0;
/* For every row wise process, the pInVec pointer is set
** to the starting address of the vector */
pInVec = pVec;
/* Loop unrolling: process 2 columns per iteration */
colCnt = numCols >> 1;
/* Initialize pointers to the starting address of the column being processed */
pInA1 = pSrcA + i;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
pInA4 = pInA3 + numCols;
// Main loop: matrix-vector multiplication
while (colCnt > 0u) {
// Read 2 values from vector
vecData = read_q15x2_ia (&pInVec);
// Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
matData = read_q15x2_ia (&pInA1);
sum1 = __SMLALD(matData, vecData, sum1);
matData = read_q15x2_ia (&pInA2);
sum2 = __SMLALD(matData, vecData, sum2);
matData = read_q15x2_ia (&pInA3);
sum3 = __SMLALD(matData, vecData, sum3);
matData = read_q15x2_ia (&pInA4);
sum4 = __SMLALD(matData, vecData, sum4);
// Decrement the loop counter
colCnt--;
}
/* process any remaining columns */
colCnt = numCols & 1u;
if (numCols & 1u) {
vecData = *pInVec++;
sum1 += (q63_t)*pInA1++ * vecData;
sum2 += (q63_t)*pInA2++ * vecData;
sum3 += (q63_t)*pInA3++ * vecData;
sum4 += (q63_t)*pInA4++ * vecData;
}
/* Saturate and store the result in the destination buffer */
*px++ = (q15_t)(__SSAT((sum1 >> 15), 16));
*px++ = (q15_t)(__SSAT((sum2 >> 15), 16));
*px++ = (q15_t)(__SSAT((sum3 >> 15), 16));
*px++ = (q15_t)(__SSAT((sum4 >> 15), 16));
i = i + numCols * 4;
/* Decrement the row loop counter */
row--;
}
/* process any remaining rows */
row = numRows & 3u;
while (row > 0) {
q63_t sum = 0;
pInVec = pVec;
pInA1 = pSrcA + i;
// loop unrolling - process 4 elements at a time
colCnt = numCols >> 2;
while (colCnt > 0) {
vecData = read_q15x2_ia (&pInVec);
vecData2 = read_q15x2_ia (&pInVec);
matData = read_q15x2_ia (&pInA1);
matData2 = read_q15x2_ia (&pInA1);
sum = __SMLALD(matData, vecData, sum);
sum = __SMLALD(matData2, vecData2, sum);
colCnt--;
}
// process remainder of row
colCnt = numCols & 3u;
while (colCnt > 0) {
sum += (q63_t)*pInA1++ * *pInVec++;
colCnt--;
}
*px++ = (q15_t)(__SSAT((sum >> 15), 16));
i = i + numCols;
row--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixMult group
*/

View File

@@ -0,0 +1,376 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_vec_mult_q31.c
* Description: Q31 matrix and vector multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixVectMult
* @{
*/
/**
* @brief Q31 matrix and vector multiplication.
* @param[in] *pSrcMat points to the input matrix structure
* @param[in] *pVec points to the input vector
* @param[out] *pDst points to the output vector
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_mat_vec_mult_q31(
const arm_matrix_instance_q31 * pSrcMat,
const q31_t *pSrcVec,
q31_t *pDstVec)
{
const q31_t *pMatSrc = pSrcMat->pData;
const q31_t *pMat0, *pMat1;
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
q31_t *px;
int32_t row;
uint16_t blkCnt; /* loop counters */
row = numRows;
px = pDstVec;
/*
* compute 3x64-bit accumulators per loop
*/
while (row >= 3)
{
q31_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pVec;
const q31_t *pMat2;
q31_t const *pSrcVecPtr = pSrcVec;
q63_t acc0, acc1, acc2;
q31x4_t vecMatA0, vecMatA1, vecMatA2, vecIn;
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
pMat1 = pMat0 + numCols;
pMat2 = pMat1 + numCols;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
pMat0Vec = pMat0;
pMat1Vec = pMat1;
pMat2Vec = pMat2;
pVec = pSrcVecPtr;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 4;
vecMatA1 = vld1q(pMat1Vec);
pMat1Vec += 4;
vecMatA2 = vld1q(pMat2Vec);
pMat2Vec += 4;
vecIn = vld1q(pVec);
pVec += 4;
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecMatA1 = vld1q(pMat1Vec);
vecMatA2 = vld1q(pMat2Vec);
vecIn = vldrwq_z_s32(pVec, p0);
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
}
*px++ = asrl(acc0, 31);
*px++ = asrl(acc1, 31);
*px++ = asrl(acc2, 31);
pMatSrc += numCols * 3;
/*
* Decrement the row loop counter
*/
row -= 3;
}
/*
* process any remaining rows pair
*/
if (row >= 2)
{
q31_t const *pMat0Vec, *pMat1Vec, *pVec;
q31_t const *pSrcVecPtr = pSrcVec;
q63_t acc0, acc1;
q31x4_t vecMatA0, vecMatA1, vecIn;
/*
* For every row wise process, the pInVec pointer is set
* to the starting address of the vector
*/
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
pMat1 = pMat0 + numCols;
acc0 = 0LL;
acc1 = 0LL;
pMat0Vec = pMat0;
pMat1Vec = pMat1;
pVec = pSrcVecPtr;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 4;
vecMatA1 = vld1q(pMat1Vec);
pMat1Vec += 4;
vecIn = vld1q(pVec);
pVec += 4;
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecMatA1 = vld1q(pMat1Vec);
vecIn = vldrwq_z_s32(pVec, p0);
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
}
*px++ = asrl(acc0, 31);
*px++ = asrl(acc1, 31);
pMatSrc += numCols * 2;
/*
* Decrement the row loop counter
*/
row -= 2;
}
if (row >= 1)
{
q31_t const *pMat0Vec, *pVec;
q31_t const *pSrcVecPtr = pSrcVec;
q63_t acc0;
q31x4_t vecMatA0, vecIn;
/*
* For every row wise process, the pInVec pointer is set
* to the starting address of the vector
*/
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
acc0 = 0LL;
pMat0Vec = pMat0;
pVec = pSrcVecPtr;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 4;
vecIn = vld1q(pVec);
pVec += 4;
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecIn = vldrwq_z_s32(pVec, p0);
acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
}
*px++ = asrl(acc0, 31);
}
}
#else
void arm_mat_vec_mult_q31(const arm_matrix_instance_q31 *pSrcMat, const q31_t *pVec, q31_t *pDst)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const q31_t *pSrcA = pSrcMat->pData;
const q31_t *pInA1; /* input data matrix pointer A of Q31 type */
const q31_t *pInA2; /* input data matrix pointer A of Q31 type */
const q31_t *pInA3; /* input data matrix pointer A of Q31 type */
const q31_t *pInA4; /* input data matrix pointer A of Q31 type */
const q31_t *pInVec; /* input data matrix pointer B of Q31 type */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t i, row, colCnt; /* loop counters */
q31_t matData, matData2, vecData, vecData2;
/* Process 4 rows at a time */
row = numRows >> 2;
i = 0u;
px = pDst;
/* The following loop performs the dot-product of each row in pSrcA with the vector */
/* row loop */
while (row > 0) {
/* Initialize accumulators */
q63_t sum1 = 0;
q63_t sum2 = 0;
q63_t sum3 = 0;
q63_t sum4 = 0;
/* For every row wise process, the pInVec pointer is set
** to the starting address of the vector */
pInVec = pVec;
/* Loop unrolling: process 2 columns per iteration */
colCnt = numCols;
/* Initialize pointers to the starting address of the column being processed */
pInA1 = pSrcA + i;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
pInA4 = pInA3 + numCols;
// Main loop: matrix-vector multiplication
while (colCnt > 0u) {
// Read 2 values from vector
vecData = *(pInVec)++;
// Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
matData = *(pInA1)++;
sum1 += (q63_t)matData * vecData;
matData = *(pInA2)++;
sum2 += (q63_t)matData * vecData;
matData = *(pInA3)++;
sum3 += (q63_t)matData * vecData;
matData = *(pInA4)++;
sum4 += (q63_t)matData * vecData;
// Decrement the loop counter
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = (q31_t)(sum1 >> 31);
*px++ = (q31_t)(sum2 >> 31);
*px++ = (q31_t)(sum3 >> 31);
*px++ = (q31_t)(sum4 >> 31);
i = i + numCols * 4;
/* Decrement the row loop counter */
row--;
}
/* process any remaining rows */
row = numRows & 3u;
while (row > 0) {
q63_t sum = 0;
pInVec = pVec;
pInA1 = pSrcA + i;
colCnt = numCols >> 1;
while (colCnt > 0) {
vecData = *(pInVec)++;
vecData2 = *(pInVec)++;
matData = *(pInA1)++;
matData2 = *(pInA1)++;
sum += (q63_t)matData * vecData;
sum += (q63_t)matData2 * vecData2;
colCnt--;
}
// process remainder of row
colCnt = numCols & 1u;
while (colCnt > 0) {
sum += (q63_t)*pInA1++ * *pInVec++;
colCnt--;
}
*px++ = (q31_t)(sum >> 31);
i = i + numCols;
row--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixMult group
*/

View File

@@ -0,0 +1,420 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_vec_mult_q7.c
* Description: Q7 matrix and vector multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixVectMult
* @{
*/
/**
* @brief Q7 matrix and vector multiplication.
* @param[in] *pSrcMat points to the input matrix structure
* @param[in] *pVec points to the input vector
* @param[out] *pDst points to the output vector
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mat_vec_mult_q7(
const arm_matrix_instance_q7 * pSrcMat,
const q7_t *pSrcVec,
q7_t *pDstVec)
{
const q7_t *pMatSrc = pSrcMat->pData;
const q7_t *pMat0, *pMat1;
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
q7_t *px;
int32_t row;
uint16_t blkCnt; /* loop counters */
row = numRows;
px = pDstVec;
/*
* compute 4x64-bit accumulators per loop
*/
while (row >= 4)
{
q7_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pMat3Vec, *pVec;
const q7_t *pMat2, *pMat3;
q7_t const *pSrcVecPtr = pSrcVec;
q31_t acc0, acc1, acc2, acc3;
q7x16_t vecMatA0, vecMatA1, vecMatA2, vecMatA3, vecIn;
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
pMat1 = pMat0 + numCols;
pMat2 = pMat1 + numCols;
pMat3 = pMat2 + numCols;
acc0 = 0L;
acc1 = 0L;
acc2 = 0L;
acc3 = 0L;
pMat0Vec = pMat0;
pMat1Vec = pMat1;
pMat2Vec = pMat2;
pMat3Vec = pMat3;
pVec = pSrcVecPtr;
blkCnt = numCols >> 4;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 16;
vecMatA1 = vld1q(pMat1Vec);
pMat1Vec += 16;
vecMatA2 = vld1q(pMat2Vec);
pMat2Vec += 16;
vecMatA3 = vld1q(pMat3Vec);
pMat3Vec += 16;
vecIn = vld1q(pVec);
pVec += 16;
acc0 = vmladavaq(acc0, vecIn, vecMatA0);
acc1 = vmladavaq(acc1, vecIn, vecMatA1);
acc2 = vmladavaq(acc2, vecIn, vecMatA2);
acc3 = vmladavaq(acc3, vecIn, vecMatA3);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecMatA1 = vld1q(pMat1Vec);
vecMatA2 = vld1q(pMat2Vec);
vecMatA3 = vld1q(pMat3Vec);
vecIn = vldrbq_z_s8(pVec, p0);
acc0 = vmladavaq(acc0, vecIn, vecMatA0);
acc1 = vmladavaq(acc1, vecIn, vecMatA1);
acc2 = vmladavaq(acc2, vecIn, vecMatA2);
acc3 = vmladavaq(acc3, vecIn, vecMatA3);
}
*px++ = __SSAT(acc0 >> 7, 8);
*px++ = __SSAT(acc1 >> 7, 8);
*px++ = __SSAT(acc2 >> 7, 8);
*px++ = __SSAT(acc3 >> 7, 8);
pMatSrc += numCols * 4;
/*
* Decrement the row loop counter
*/
row -= 4;
}
/*
* process any remaining rows pair
*/
if (row >= 2)
{
q7_t const *pMat0Vec, *pMat1Vec, *pVec;
q7_t const *pSrcVecPtr = pSrcVec;
q31_t acc0, acc1;
q7x16_t vecMatA0, vecMatA1, vecIn;
/*
* For every row wise process, the pInVec pointer is set
* to the starting address of the vector
*/
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
pMat1 = pMat0 + numCols;
acc0 = 0;
acc1 = 0;
pMat0Vec = pMat0;
pMat1Vec = pMat1;
pVec = pSrcVecPtr;
blkCnt = numCols >> 4;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 16;
vecMatA1 = vld1q(pMat1Vec);
pMat1Vec += 16;
vecIn = vld1q(pVec);
pVec += 16;
acc0 = vmladavaq(acc0, vecIn, vecMatA0);
acc1 = vmladavaq(acc1, vecIn, vecMatA1);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecMatA1 = vld1q(pMat1Vec);
vecIn = vldrbq_z_s8(pVec, p0);
acc0 = vmladavaq(acc0, vecIn, vecMatA0);
acc1 = vmladavaq(acc1, vecIn, vecMatA1);
}
*px++ = __SSAT(acc0 >> 7, 8);
*px++ = __SSAT(acc1 >> 7, 8);
pMatSrc += numCols * 2;
/*
* Decrement the row loop counter
*/
row -= 2;
}
if (row >= 1)
{
q7_t const *pMat0Vec, *pVec;
q7_t const *pSrcVecPtr = pSrcVec;
q31_t acc0;
q7x16_t vecMatA0, vecIn;
/*
* For every row wise process, the pInVec pointer is set
* to the starting address of the vector
*/
pVec = pSrcVec;
/*
* Initialize the pointer pIn1 to point to the starting address of the column being processed
*/
pMat0 = pMatSrc;
acc0 = 0LL;
pMat0Vec = pMat0;
pVec = pSrcVecPtr;
blkCnt = numCols >> 4;
while (blkCnt > 0U)
{
vecMatA0 = vld1q(pMat0Vec);
pMat0Vec += 16;
vecIn = vld1q(pVec);
pVec += 16;
acc0 = vmladavaq(acc0, vecIn, vecMatA0);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecMatA0 = vld1q(pMat0Vec);
vecIn = vldrbq_z_s8(pVec, p0);
acc0 = vmladavaq(acc0, vecIn, vecMatA0);
}
*px++ = __SSAT(acc0 >> 7, 8);
}
}
#else
void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec, q7_t *pDst)
{
uint32_t numRows = pSrcMat->numRows;
uint32_t numCols = pSrcMat->numCols;
const q7_t *pSrcA = pSrcMat->pData;
const q7_t *pInA1; /* input data matrix pointer of Q7 type */
const q7_t *pInA2; /* input data matrix pointer of Q7 type */
const q7_t *pInA3; /* input data matrix pointer of Q7 type */
const q7_t *pInA4; /* input data matrix pointer of Q7 type */
const q7_t *pInVec; /* input data vector pointer of Q7 type */
q7_t *px; /* output data pointer */
uint32_t i, row, colCnt; /* loop counters */
q31_t matData, matData2, vecData, vecData2;
/* Process 4 rows at a time */
row = numRows >> 2;
i = 0u;
px = pDst;
/* The following loop performs the dot-product of each row in pSrcA with the vector */
while (row > 0) {
/* Initialize accumulators */
q31_t sum1 = 0;
q31_t sum2 = 0;
q31_t sum3 = 0;
q31_t sum4 = 0;
/* For every row wise process, the pInVec pointer is set
** to the starting address of the vector */
pInVec = pVec;
/* Loop unrolling: process 4 columns per iteration */
colCnt = numCols >> 2;
/* Initialize row pointers so we can track 4 rows at once */
pInA1 = pSrcA + i;
pInA2 = pInA1 + numCols;
pInA3 = pInA2 + numCols;
pInA4 = pInA3 + numCols;
// Inner loop: matrix-vector multiplication
while (colCnt > 0u) {
// Read 4 values from vector
vecData = read_q7x4_ia (&pInVec);
vecData2 = __SXTB16(__ROR(vecData, 8));
vecData = __SXTB16(vecData);
// Read 16 values from the matrix - 4 values from each of 4 rows, and do multiply accumulate
matData = read_q7x4_ia (&pInA1);
matData2 = __SXTB16(__ROR(matData, 8));
matData = __SXTB16(matData);
sum1 = __SMLAD(matData, vecData, sum1);
sum1 = __SMLAD(matData2, vecData2, sum1);
matData = read_q7x4_ia (&pInA2);
matData2 = __SXTB16(__ROR(matData, 8));
matData = __SXTB16(matData);
sum2 = __SMLAD(matData, vecData, sum2);
sum2 = __SMLAD(matData2, vecData2, sum2);
matData = read_q7x4_ia (&pInA3);
matData2 = __SXTB16(__ROR(matData, 8));
matData = __SXTB16(matData);
sum3 = __SMLAD(matData, vecData, sum3);
sum3 = __SMLAD(matData2, vecData2, sum3);
matData = read_q7x4_ia (&pInA4);
matData2 = __SXTB16(__ROR(matData, 8));
matData = __SXTB16(matData);
sum4 = __SMLAD(matData, vecData, sum4);
sum4 = __SMLAD(matData2, vecData2, sum4);
// Decrement the loop counter
colCnt--;
}
/* process any remaining columns */
colCnt = numCols & 3u;
while (colCnt > 0) {
vecData = *pInVec++;
sum1 += *pInA1++ * vecData;
sum2 += *pInA2++ * vecData;
sum3 += *pInA3++ * vecData;
sum4 += *pInA4++ * vecData;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = (q7_t)(__SSAT((sum1 >> 7), 8));
*px++ = (q7_t)(__SSAT((sum2 >> 7), 8));
*px++ = (q7_t)(__SSAT((sum3 >> 7), 8));
*px++ = (q7_t)(__SSAT((sum4 >> 7), 8));
i = i + numCols * 4;
/* Decrement the row loop counter */
row--;
}
/* process any remaining rows */
row = numRows & 3u;
while (row > 0) {
q31_t sum = 0;
pInVec = pVec;
pInA1 = pSrcA + i;
// loop unrolling - process 4 elements at a time
colCnt = numCols >> 2;
while (colCnt > 0) {
vecData = read_q7x4_ia (&pInVec);
vecData2 = __SXTB16(__ROR(vecData, 8));
vecData = __SXTB16(vecData);
matData = read_q7x4_ia (&pInA1);
matData2 = __SXTB16(__ROR(matData, 8));
matData = __SXTB16(matData);
sum = __SMLAD(matData, vecData, sum);
sum = __SMLAD(matData2, vecData2, sum);
colCnt--;
}
// process remainder of row
colCnt = numCols & 3u;
while (colCnt > 0) {
sum += *pInA1++ * *pInVec++;
colCnt--;
}
*px++ = (q7_t)(__SSAT((sum >> 7), 8));
i = i + numCols;
row--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixMult group
*/