FPU实验：BasicMathFunctions之求绝对值

stm32f4 · 发表于 2012-12-21 20:16:34

函数太多慢慢整理，不过同一种类型的都差不多
说明：每个基本的数学函数都对应 f32、q31、q15、q7 四个源文件。
         计算时以4个数据为一组做循环展开，余下的
         1-3个数据再单独计算。
         特别注意：其中定点格式（q31、q15、q7）的运算都是饱和运算，浮点（f32）版本直接调用fabsf；什么是饱和运算，可以看权威指南。

1.  求绝对值
   arm_abs_f32.c
   arm_abs_q31.c
   arm_abs_q15.c
   arm_abs_q7.c
a、arm_abs_f32.c
/**
 * @brief  Element-wise absolute value of a floating-point vector.
 * @param[in]   *pSrc      points to the input buffer
 * @param[out]  *pDst      points to the output buffer
 * @param[in]   blockSize  number of samples in each vector
 * @return none.
 *
 * Every output sample is fabsf() of the input sample at the same position.
 */

void arm_abs_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                         /* samples handled by the one-at-a-time loop */

#ifndef ARM_MATH_CM0

  /* Build for Cortex-M4 and Cortex-M3: unrolled main loop plus a tail loop. */
  uint32_t quads;                             /* complete groups of four samples */

  /* Each pass of this loop produces four outputs. */
  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    /* C = |A|, computed with the fabsf() library call */
    pDst[0] = fabsf(pSrc[0]);
    pDst[1] = fabsf(pSrc[1]);
    pDst[2] = fabsf(pSrc[2]);
    pDst[3] = fabsf(pSrc[3]);

    /* Step both buffers past the group that was just processed. */
    pSrc += 4;
    pDst += 4;
  }

  /* When blockSize is not a multiple of 4, one to three samples are still pending. */
  remaining = blockSize % 0x4u;

#else

  /* Build for Cortex-M0: no unrolling, every sample goes through the loop below. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  /* Process whatever is left, one sample per pass. */
  for(; remaining > 0u; remaining--)
  {
    /* C = |A| */
    *pDst++ = fabsf(*pSrc++);
  }
}

stm32f4 · 发表于 2012-12-21 20:39:48

指令学习：
（1） #define __SIMD32(addr)    (*(int32_t **) & (addr))    //这个不是汇编指令，而是一个C宏：把指针addr当作int32_t*来使用，
                                                                                          //这样一次就可以读/写32位数据。SIMD即单指令流多数据流，可以查一下权威指南。
                                                                                          //SIMD指令在CM3中没有实现，CM4的DSP扩展中实现了SIMD指令。
  //下面的宏用于将4个8位数据打包成一个32位数据。
（2）#define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) <<  0) & (int32_t)0x000000FF) | \\
                                                         (((int32_t)(v2) <<  8) & (int32_t)0x0000FF00) | \\
                                                         (((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \\
                                                         (((int32_t)(v0) << 24) & (int32_t)0xFF000000)  )
（3）有符号数的饱和运算（详细的饱和运算资料，看权威指南）
/**
   param [in]  value  Value to be saturated
   param [in]  sat    Bit position to saturate to (1..32)
   return             Saturated value
*/
#define __SSAT                         __ssat
注意： Q7格式的数据0x80经过饱和运算以后，输出0x7F
b、arm_abs_q7.c
/**
 * @brief  Element-wise absolute value of a Q7 vector, with saturation.
 * @param[in]   *pSrc      points to the input buffer
 * @param[out]  *pDst      points to the output buffer
 * @param[in]   blockSize  number of samples in each vector
 * @return none.
 *
 * Non-positive samples are negated and passed through __SSAT(..., 8), so the
 * input 0x80 yields 0x7F rather than overflowing.
 */
void arm_abs_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                         /* loop counter */

#ifndef ARM_MATH_CM0

  /* Build for Cortex-M4 and Cortex-M3 */
  q7_t s0;                                    /* 1st sample of the current group */
  q7_t s1;                                    /* 2nd sample of the current group */
  q7_t s2;                                    /* 3rd sample of the current group */
  q7_t s3;                                    /* 4th sample of the current group */

  /* Unrolled part: four outputs per pass. */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch four inputs before anything is written. */
    s0 = *pSrc++;
    s1 = *pSrc++;
    s2 = *pSrc++;
    s3 = *pSrc++;

    /* C = |A|: positive values pass through, the rest are negated and saturated to 8 bits. */
    if(s0 <= 0)
    {
      s0 = (q7_t) __SSAT(-s0, 8);
    }
    if(s1 <= 0)
    {
      s1 = (q7_t) __SSAT(-s1, 8);
    }
    if(s2 <= 0)
    {
      s2 = (q7_t) __SSAT(-s2, 8);
    }
    if(s3 <= 0)
    {
      s3 = (q7_t) __SSAT(-s3, 8);
    }

    /* Pack the four results into one 32-bit word and store it with a single word write. */
    *__SIMD32(pDst)++ = __PACKq7(s0, s1, s2, s3);
  }

  /* Tail: the one to three samples left when blockSize is not a multiple of 4. */
  for(remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    s0 = *pSrc++;

    if(s0 <= 0)
    {
      s0 = (q7_t) __SSAT(-s0, 8);
    }

    *pDst++ = s0;
  }

#else

  /* Build for Cortex-M0: plain one-sample-per-pass loop. */
  q7_t sample;                                /* current input sample */

  for(remaining = blockSize; remaining > 0u; remaining--)
  {
    sample = *pSrc++;

    /* C = |A| with 8-bit saturation */
    if(sample <= 0)
    {
      sample = (q7_t) __SSAT(-sample, 8);
    }

    *pDst++ = sample;
  }
#endif /* #ifndef ARM_MATH_CM0 */
}

stm32f4 · 发表于 2012-12-21 20:55:36

指令学习：
//此宏用于将2个16位数据打包成一个32位数据。
#define __PKHBT(ARG1,ARG2,ARG3)       ( ((((uint32_t)(ARG1))       ) & 0x0000FFFFUL) |  \\
                                                               ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
注意： Q15格式的数据0x8000经过饱和运算以后，输出0x7FFF
c、arm_abs_q15.c
/**
 * @brief  Element-wise absolute value of a Q15 vector, with saturation.
 * @param[in]   *pSrc      points to the input buffer
 * @param[out]  *pDst      points to the output buffer
 * @param[in]   blockSize  number of samples in each vector
 * @return none.
 *
 * Non-positive samples are negated and passed through __SSAT(..., 16), so the
 * input 0x8000 yields 0x7FFF rather than overflowing.
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                         /* loop counter */

#ifndef ARM_MATH_CM0

  /* Build for Cortex-M4 and Cortex-M3 */
  q15_t first;                                /* 1st sample of the current pair */
  q15_t second;                               /* 2nd sample of the current pair */

  /* Unrolled part: four outputs per pass, handled as two pairs. */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* ---- pair 1: read, take |A| with 16-bit saturation, store as one word ---- */
    first = *pSrc++;
    second = *pSrc++;

    if(first <= 0)
    {
      first = (q15_t) __SSAT(-first, 16);
    }
    if(second <= 0)
    {
      second = (q15_t) __SSAT(-second, 16);
    }

    /* The operand order handed to __PKHBT depends on the configured endianness. */
#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(first, second, 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(second, first, 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN */

    /* ---- pair 2: same steps for the next two samples ---- */
    first = *pSrc++;
    second = *pSrc++;

    if(first <= 0)
    {
      first = (q15_t) __SSAT(-first, 16);
    }
    if(second <= 0)
    {
      second = (q15_t) __SSAT(-second, 16);
    }

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(first, second, 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(second, first, 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
  }

  /* Tail: the one to three samples left when blockSize is not a multiple of 4. */
  for(remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    first = *pSrc++;

    if(first <= 0)
    {
      first = (q15_t) __SSAT(-first, 16);
    }

    *pDst++ = first;
  }

#else

  /* Build for Cortex-M0: plain one-sample-per-pass loop. */
  q15_t sample;                               /* current input sample */

  for(remaining = blockSize; remaining > 0u; remaining--)
  {
    sample = *pSrc++;

    /* C = |A| with 16-bit saturation */
    if(sample <= 0)
    {
      sample = (q15_t) __SSAT(-sample, 16);
    }

    *pDst++ = sample;
  }

#endif /* #ifndef ARM_MATH_CM0 */

}

stm32f4 · 发表于 2012-12-21 21:00:02

指令学习：
注意： Q31格式的数据0x80000000经过饱和运算以后，输出0x7FFFFFFF
d、arm_abs_q31.c
/**
 * @brief  Element-wise absolute value of a Q31 vector, with saturation.
 * @param[in]   *pSrc      points to the input buffer
 * @param[out]  *pDst      points to the output buffer
 * @param[in]   blockSize  number of samples in each vector
 * @return none.
 *
 * The input 0x80000000 cannot be negated within 32 bits, so it is mapped
 * explicitly to 0x7fffffff; every other non-positive input is negated.
 */
void arm_abs_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                         /* samples handled by the one-at-a-time loop */
  q31_t value;                                /* current sample, overwritten with its result */

#ifndef ARM_MATH_CM0

  /* Build for Cortex-M4 and Cortex-M3: unrolled main loop plus a tail loop. */
  uint32_t quads;                             /* complete groups of four samples */

  /* Each pass of this loop produces four outputs. */
  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    /* C = |A|; 0x80000000 is checked first so it is never negated. */
    value = *pSrc++;
    if(value <= 0)
    {
      value = (value == 0x80000000) ? 0x7fffffff : -value;
    }
    *pDst++ = value;

    value = *pSrc++;
    if(value <= 0)
    {
      value = (value == 0x80000000) ? 0x7fffffff : -value;
    }
    *pDst++ = value;

    value = *pSrc++;
    if(value <= 0)
    {
      value = (value == 0x80000000) ? 0x7fffffff : -value;
    }
    *pDst++ = value;

    value = *pSrc++;
    if(value <= 0)
    {
      value = (value == 0x80000000) ? 0x7fffffff : -value;
    }
    *pDst++ = value;
  }

  /* When blockSize is not a multiple of 4, one to three samples are still pending. */
  remaining = blockSize % 0x4u;

#else

  /* Build for Cortex-M0: no unrolling, every sample goes through the loop below. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  /* Process whatever is left, one sample per pass. */
  for(; remaining > 0u; remaining--)
  {
    /* C = |A|, with the same special case for 0x80000000 */
    value = *pSrc++;
    if(value <= 0)
    {
      value = (value == 0x80000000) ? 0x7fffffff : -value;
    }
    *pDst++ = value;
  }

}

		自动登录	找回密码
密码			立即注册

[客户分享] FPU实验：BasicMathFunctions之求绝对值