From 1ddcfcaad365734f6723db62b061a3800e28b6f5 Mon Sep 17 00:00:00 2001 From: cnlohr <lohr85@gmail.com> Date: Thu, 27 Apr 2023 12:19:51 -0400 Subject: [PATCH] Use fast multiply. --- examples/ws2812bdemo/color_utilities.h | 65 ++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/examples/ws2812bdemo/color_utilities.h b/examples/ws2812bdemo/color_utilities.h index 3b6f1b7..15e615c 100644 --- a/examples/ws2812bdemo/color_utilities.h +++ b/examples/ws2812bdemo/color_utilities.h @@ -136,6 +136,65 @@ static const unsigned char sintable[] = { 0x26, 0x28, 0x2a, 0x2d, 0x2f, 0x31, 0x34, 0x36, 0x39, 0x3c, 0x3e, 0x41, 0x44, 0x47, 0x49, 0x4c, 0x4f, 0x52, 0x55, 0x58, 0x5b, 0x5e, 0x61, 0x64, 0x67, 0x6a, 0x6d, 0x70, 0x73, 0x76, 0x79, 0x7d, }; +static inline uint32_t FastMultiply( uint32_t big_num, uint32_t small_num ) __attribute__((section(".data"))); +static inline uint32_t FastMultiply( uint32_t big_num, uint32_t small_num ) +{ + // The CH32V003 is an EC core, so no hardware multiply. GCC's way to multiply + // is slow, so I wrote this. + // + // This basically does this: + // return small_num * big_num; + // using shift-and-add, one loop pass per bit of small_num up to its top set bit. + // Note: This does NOT check for zero to begin with. It still produces + // the correct result, but it is a little weird that even if + // small_num is zero the loop body executes once. + // + // Additionally note, instead of the if( multiplicand & 1 ) you can do the following: + // ret += mutliplicant & neg(multiplicand & 1). + // + // BUT! Shockingly! That is slower than an extra branch! The CH32V003 + // can branch unbelievably fast. + // + // This is functionally equivalent and much faster. + // + // Perf numbers, with small_num set to 180V. 
+ // No multiply: 21.3% CPU Usage + // Assembly below: 42.4% CPU Usage (1608 bytes for whole program) + // C version: 41.4% CPU Usage (1600 bytes for whole program) + // Using GCC (__mulsi3) 65.4% CPU Usage (1652 bytes for whole program) + // + // The multiply can be done manually: + uint32_t ret = 0; + uint32_t multiplicand = small_num; + uint32_t mutliplicant = big_num; + do + { + if( multiplicand & 1 ) + ret += mutliplicant; + mutliplicant<<=1; + multiplicand>>=1; + } while( multiplicand ); + return ret; + + // Which is equivalent to the following assembly, if you were curious (ret is "+&r" because the add reads it as an accumulator) +/* + uint32_t ret = 0; + asm volatile( "\n\ + .option rvc;\n\ + 1: andi t0, %[small], 1\n\ + beqz t0, 2f\n\ + add %[ret], %[ret], %[big]\n\ + 2: srli %[small], %[small], 1\n\ + slli %[big], %[big], 1\n\ + bnez %[small], 1b\n\ + " : + [ret]"+&r"(ret), [big]"+&r"(big_num), [small]"+&r"(small_num) : : + "t0" ); + return ret; +*/ +} + +static uint32_t TweenHexColors( uint32_t hexa, uint32_t hexb, int tween ) __attribute__((section(".data"))); static uint32_t TweenHexColors( uint32_t hexa, uint32_t hexb, int tween ) { if( tween <= 0 ) return hexa; @@ -148,9 +207,9 @@ static uint32_t TweenHexColors( uint32_t hexa, uint32_t hexb, int tween ) int32_t hbb = hexb & 0xff; int32_t hbr = (hexb>>8) & 0xff; int32_t hbg = (hexb>>16) & 0xff; - int32_t b = (hab * aamt + hbb * bamt + 128) >> 8; - int32_t r = (har * aamt + hbr * bamt + 128) >> 8; - int32_t g = (hag * aamt + hbg * bamt + 128) >> 8; + int32_t b = (FastMultiply( hab, aamt ) + FastMultiply( hbb, bamt ) + 128) >> 8; + int32_t r = (FastMultiply( har, aamt ) + FastMultiply( hbr, bamt ) + 128) >> 8; + int32_t g = (FastMultiply( hag, aamt ) + FastMultiply( hbg, bamt ) + 128) >> 8; return b | (r<<8) | (g<<16); } -- GitLab