Commit a7de0aa5 authored by Frank Bösing's avatar Frank Bösing

Fast clip + abs

parent 8150e9ee
......@@ -6,6 +6,22 @@
* Updated on: 27.11.2021
*/
#include "mp3_decoder.h"
/*
 * CLIP_2N(y, n) - saturate y in place to the range [-2^n, 2^n - 1].
 *
 *   y  modifiable int lvalue; it is evaluated more than once, so it must not
 *      have side effects (no CLIP_2N(*p++, n)).
 *   n  number of magnitude bits, 0..31; evaluated exactly once.
 *
 * Both variants are wrapped in do { } while (0) so the macro behaves like a
 * single statement (safe in an unbraced if/else) and must be followed by ';'.
 * The temporaries carry a clip2n_ prefix so that they cannot shadow the
 * caller's own variable passed as y. The limit is built with unsigned
 * arithmetic so that n == 31 does not overflow a signed shift.
 */
#if 0 /* Fast on ARM: compare the sign word against the bits above position n. */
#define CLIP_2N(y, n) do { \
    const int clip2n_bits_ = (n); \
    const int clip2n_sign_ = (y) >> 31; \
    if (clip2n_sign_ != (y) >> clip2n_bits_) { \
        (y) = clip2n_sign_ ^ (int)((1u << clip2n_bits_) - 1u); \
    } \
} while (0)
#else /* On xtensa this is faster, due to asm min/max instructions. */
#define CLIP_2N(y, n) do { \
    const int clip2n_max_ = (int)((1u << (n)) - 1u); \
    const int clip2n_min_ = -clip2n_max_ - 1; \
    if ((y) < clip2n_min_) { (y) = clip2n_min_; } \
    if ((y) > clip2n_max_) { (y) = clip2n_max_; } \
} while (0)
#endif
/* Sync-word byte constants; judging by the names these are the high and low
 * bytes matched when searching for a frame header - TODO confirm at the use sites. */
const uint8_t m_SYNCWORDH = 0xFF;
const uint8_t m_SYNCWORDL = 0xF0;
......@@ -2849,100 +2865,38 @@ void WinPrevious(int *xPrev, int *xPrevWin, int btPrev){
* Return: updated mOut (from new outputs y)
**********************************************************************************************************************/
int FreqInvertRescale(int *y, int *xPrev, int blockIdx, int es){
int i, d, mOut;
int y0, y1, y2, y3, y4, y5, y6, y7, y8;
if (es == 0) {
/* fast case - frequency invert only (no rescaling) - can fuse into overlap-add for speed, if desired */
if (blockIdx & 0x01) {
y += m_NBANDS;
y0 = *y;
y += 2 * m_NBANDS;
y1 = *y;
y += 2 * m_NBANDS;
y2 = *y;
y += 2 * m_NBANDS;
y3 = *y;
y += 2 * m_NBANDS;
y4 = *y;
y += 2 * m_NBANDS;
y5 = *y;
y += 2 * m_NBANDS;
y6 = *y;
y += 2 * m_NBANDS;
y7 = *y;
y += 2 * m_NBANDS;
y8 = *y;
y += 2 * m_NBANDS;
y -= 18 * m_NBANDS;
*y = -y0;
y += 2 * m_NBANDS;
*y = -y1;
y += 2 * m_NBANDS;
*y = -y2;
y += 2 * m_NBANDS;
*y = -y3;
y += 2 * m_NBANDS;
*y = -y4;
y += 2 * m_NBANDS;
*y = -y5;
y += 2 * m_NBANDS;
*y = -y6;
y += 2 * m_NBANDS;
*y = -y7;
y += 2 * m_NBANDS;
*y = -y8;
y += 2 * m_NBANDS;
int FreqInvertRescale(int *y, int *xPrev, int blockIdx, int es) {
if (es == 0) {
/* fast case - frequency invert only (no rescaling) */
if (blockIdx & 0x01) {
y += m_NBANDS;
for (int i = 0; i < 9; i++) {
*y = - *y; y += 2 * m_NBANDS;
}
}
return 0;
}
int d, mOut;
/* undo pre-IMDCT scaling, clipping if necessary */
mOut = 0;
if (blockIdx & 0x01) {
/* frequency invert */
for (int i = 0; i < 9; i++) {
d = *y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += m_NBANDS;
d = -*y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += m_NBANDS;
d = *xPrev; CLIP_2N(d, 31 - es); *xPrev++ = d << es;
}
return 0;
} else {
/* undo pre-IMDCT scaling, clipping if necessary */
mOut = 0;
int sign=0;
if (blockIdx & 0x01) {
/* frequency invert */
for (i = 0; i < 18; i += 2) {
d = *y;
sign = (d) >> 31;
if (sign != (d) >> (31 - es)){(d) = sign ^ ((1 << (31 - es)) - 1);}
*y = d << es;
mOut |= FASTABS(*y);
y += m_NBANDS;
d = -*y;
sign = (d) >> 31;
if (sign != (d) >> (31 - es)){(d) = sign ^ ((1 << (31 - es)) - 1);}
*y = d << es;
mOut |= FASTABS(*y);
y += m_NBANDS;
d = *xPrev;
sign = (d) >> 31;
if (sign != (d) >> (31 - es)){(d) = sign ^ ((1 << (31 - es)) - 1);}
*xPrev++ = d << es;
}
} else {
for (i = 0; i < 18; i += 2) {
d = *y;
sign = (d) >> 31;
if (sign != (d) >> (31 - es)){(d) = sign ^ ((1 << (31 - es)) - 1);}
*y = d << es;
mOut |= FASTABS(*y);
y += m_NBANDS;
d = *y;
sign = (d) >> 31;
if (sign != (d) >> (31 - es)){(d) = sign ^ ((1 << (31 - es)) - 1);}
*y = d << es;
mOut |= FASTABS(*y);
y += m_NBANDS;
d = *xPrev;
sign = (d) >> 31;
if (sign != (d) >> (31 - es)){(d) = sign ^ ((1 << (31 - es)) - 1);}
*xPrev++ = d << es;
}
for (int i = 0; i < 9; i++) {
d = *y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += m_NBANDS;
d = *y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += m_NBANDS;
d = *xPrev; CLIP_2N(d, 31 - es); *xPrev++ = d << es;
}
return mOut;
}
return mOut;
}
......
// based on the Helix MP3 decoder
#pragma once
#pragma GCC optimize ("O3")
#include "Arduino.h"
#include "assert.h"
......@@ -172,7 +171,7 @@ typedef struct ScaleFactorJS { /* used in MPEG 2, 2.5 intensity (joint) stereo o
/* NOTE - could get by with a smaller vbuf if memory is more important than speed
 * (in Subband, instead of replicating each block in FDCT32 you would do a memmove on the
 * last 15 blocks to shift them down one, like a hardware-style FIFO)
 */
typedef struct SubbandInfo {
int vbuf[m_MAX_NCHAN * m_VBUF_LENGTH]; /* vbuf for fast DCT-based synthesis PQMF - double size for speed (no modulo indexing) */
int vindex; /* internal index for tracking position in vbuf */
......@@ -509,9 +508,6 @@ int HybridTransform(int *xCurr, int *xPrev, int y[m_BLOCK_SIZE][m_NBANDS], SideI
/*
 * Fixed-point arithmetic helpers.
 *
 * They are declared static inline so that every translation unit including this
 * header gets its own definition; this is valid in both C and C++ and avoids
 * the missing external definition that plain `inline` causes in C99/C11.
 */

/* Shift x right by n bits. x is unsigned, so zero bits are shifted in from the top. */
static inline uint64_t SAR64(uint64_t x, int n) {
    return x >> n;
}

/*
 * Upper 32 bits of the 64-bit product x * y.
 * Converting the int operands to uint64_t sign-extends them, so the low 64 bits
 * of the unsigned product match the two's-complement signed product. Narrowing
 * bits 32..63 back to int relies on the compiler's wrapping conversion (as GCC does).
 */
static inline int MULSHIFT32(int x, int y) {
    const uint64_t product = (uint64_t) x * (uint64_t) y;
    return (int) (product >> 32);
}

/* Multiply-accumulate: returns sum64 + x * y, computed modulo 2^64. */
static inline uint64_t MADD64(uint64_t sum64, int x, int y) {
    return sum64 + (uint64_t) x * (uint64_t) y;
}

/* Same operation as SAR64: shift x right by n bits, shifting in zeros. */
static inline uint64_t xSAR64(uint64_t x, int n) {
    return x >> n;
}

/* Absolute value of x; xtensa has a fast abs instruction that the builtin maps to. */
static inline int FASTABS(int x) {
    return __builtin_abs(x);
}

/*
 * Number of leading zero bits in x, viewed as an unsigned int.
 * __builtin_clz is undefined for 0, so that case is handled explicitly and
 * yields the full bit width of int, as the earlier hand-written loop did.
 */
static inline int CLZ(int x) {
    if (x == 0) {
        return (int) (sizeof(int) * 8);
    }
    return __builtin_clz((unsigned int) x);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment