#pragma once // // FILE: IEEE754tools.h // AUTHOR: Rob Tillaart // VERSION: 0.2.1 // PURPOSE: manipulate IEEE754 float numbers fast // URL: https://github.com/RobTillaart/IEEE754tools.git // // EXPERIMENTAL ==> USE WITH CARE // not tested extensively, // // 0.1.00 2013-09-08 initial version // 0.1.01 2013-09-08 added IEEE_NAN, IEEE_INF tests + version string // 0.1.02 2013-09-08 added SHIFT_POW2 // 0.1.03 2013-09-10 renamed IEEE_Sign IEEE_Exponent // 0.2.0 2020-06-30 own repo + some refactor... // 0.2.1 2020-12-30 arduino-CI #include "Arduino.h" #define IEEE754_VERSION "0.2.1" // (un)comment lines to configure functionality / size //#define IEEE754_ENABLE_MSB // +78 bytes // IEEE754 float layout; struct IEEEfloat { uint32_t m:23; uint8_t e:8; uint8_t s:1; }; // IEEE754 double layout; struct IEEEdouble { uint64_t m:52; uint16_t e:11; uint8_t s:1; }; // Arduino UNO double layout: // the UNO has no 64 bit double, it is only able to map 23 bits of the mantisse // a filler is added for the remaining bits. These might be useful in future? struct _DBL { uint32_t filler:29; uint32_t m:23; uint16_t e:11; uint8_t s:1; }; // for packing and unpacking a float union _FLOATCONV { IEEEfloat p; float f; byte b[4]; }; // for packing and unpacking a double union _DBLCONV { // IEEEdouble p; _DBL p; double d; // !! is a 32bit float for UNO. byte b[8]; }; // // DEBUG FUNCTIONS // // print float components void dumpFloat(float number) { IEEEfloat* x = (IEEEfloat*) ((void*)&number); Serial.print(x->s, HEX); Serial.print("\t"); Serial.print(x->e, HEX); Serial.print("\t"); Serial.println(x->m, HEX); // Serial.print(" sign: "); Serial.print(x->s); // Serial.print(" exp: "); Serial.print(x->e); // Serial.print(" mant: "); Serial.println(x->m); } // print "double" components void dumpDBL(struct _DBL dbl) { Serial.print(dbl.s, HEX); Serial.print("\t"); Serial.print(dbl.e, HEX); Serial.print("\t"); Serial.println(dbl.m, HEX); } // // mapping to/from 64bit double - best effort // // converts a float to a packed array of 8 bytes representing a 64 bit double // restriction exponent and mantisse. // float; array of 8 bytes; LSBFIRST; MSBFIRST void float2DoublePacked(float number, byte* bar, int byteOrder = LSBFIRST) { _FLOATCONV fl; fl.f = number; _DBLCONV dbl; dbl.p.filler = 0; dbl.p.s = fl.p.s; dbl.p.e = fl.p.e - 127 + 1023; // exponent adjust dbl.p.m = fl.p.m; #ifdef IEEE754_ENABLE_MSB if (byteOrder == LSBFIRST) { #endif for (int i = 0; i < 8; i++) { bar[i] = dbl.b[i]; } #ifdef IEEE754_ENABLE_MSB } else { for (int i = 0; i < 8; i++) { bar[i] = dbl.b[7-i]; } } #endif } // converts a packed array of bytes into a 32bit float. // there can be an exponent overflow // the mantisse is truncated to 23 bits. float doublePacked2Float(byte* bar, int byteOrder = LSBFIRST) { _FLOATCONV fl; _DBLCONV dbl; #ifdef IEEE754_ENABLE_MSB if (byteOrder == LSBFIRST) { #endif for (int i = 0; i < 8; i++) { dbl.b[i] = bar[i]; } #ifdef IEEE754_ENABLE_MSB } else { for (int i = 0; i < 8; i++) { dbl.b[i] = bar[7-i]; } } #endif int e = dbl.p.e - 1023 + 127; // exponent adjust // TODO check exponent overflow. if (e >=0 || e <= 255) { fl.p.s = dbl.p.s; fl.p.e = e; fl.p.m = dbl.p.m; // note this one clips the mantisse return fl.f; } return NAN; // OR +-INF? // return (fl.p.s) ? -INFINITY : INFINITY; } // // TEST FUNCTIONS // // ~1.7x faster int IEEE_NAN(float number) { uint16_t* x = ((uint16_t*) &number + 1); return ((*x) == 0x7FC0); } // ~3.4x faster int IEEE_INF(float number) { uint8_t* x = ((uint8_t*) &number); if (*(x+2) != 0x80) return 0; if (*(x+3) == 0x7F) return 1; if (*(x+3) == 0xFF) return -1; return 0; } // for the real speed freaks, the next two boolean IEEE_PosINF(float number) { return (* ((uint16_t*) &number + 1) ) == 0x7F80; } boolean IEEE_NegINF(float number) { return (* ((uint16_t*) &number + 1) ) == 0xFF80; } // // PROPERTIES // uint8_t IEEE_Sign(float number) { IEEEfloat* x = (IEEEfloat*) ((void*)&number); return x->s; } int IEEE_Exponent(float number) { IEEEfloat* x = (IEEEfloat*) ((void*)&number); return x->e - 127; } uint32_t IEEE_Mantisse(float number) { IEEEfloat* x = (IEEEfloat*) ((void*)&number); return x->m; } // // MATH FUNCTIONS // // factor ~2.7; (tested with *16) more correct than the faster one float IEEE_POW2(float number, int n) { _FLOATCONV fl; fl.f = number; int e = fl.p.e + n; if (e >= 0 && e < 256) { fl.p.e = e; return fl.f; } return (fl.p.s) ? -INFINITY : INFINITY; } // WARNING no overflow detection in the SHIFT (factor ~3.5) float IEEE_POW2fast(float number, int n) { _FLOATCONV fl; fl.f = number; fl.p.e += n; return fl.f; } // // NOT TESTED FUNCTIONS // // // get truncated part as separate float. // void doublePacked2Float2(byte* bar, int byteOrder, float* value, float* error) { _FLOATCONV fl; _DBLCONV dbl; #ifdef IEEE754_ENABLE_MSB if (byteOrder == LSBFIRST) { #endif for (int i = 0; i < 8; i++) { dbl.b[i] = bar[i]; } #ifdef IEEE754_ENABLE_MSB } else { for (int i = 0; i < 8; i++) { dbl.b[i] = bar[7 - i]; } } #endif int e = dbl.p.e - 1023 + 127; // exponent adjust // TODO check exponent overflow. if (e >=0 || e <= 255) { fl.p.s = dbl.p.s; fl.p.e = e; fl.p.m = dbl.p.m; // note this one clips the mantisse *value = fl.f; fl.p.s = dbl.p.s; fl.p.e = e-23; fl.p.m = dbl.p.filler; // note this one clips the mantisse *error = fl.f; } *value = (dbl.p.s) ? -INFINITY : INFINITY; *error = 0; } // what is this??? float IEEE_FLIP(float number) { _FLOATCONV fl; fl.f = number; fl.p.e = -fl.p.e; fl.p.m = (0x007FFFFF - fl.p.m); return fl.f; } /* // ONELINERS to speed up some specific 32 bit float math // *(((byte*) &number)+3) &= 0x7F; // number == fabs(number); // x = *(((byte*) &number)+3) & 0x7F; // x = fabs(number); // GAIN = factor 2 // *(((byte*) &number)+3) |= 0x80; // number == -fabs(number); // x = *(((byte*) &number)+3) | 0x80; // x == -fabs(number); // GAIN = factor 2 // *(((byte*) &number)+3) ^= 0x80; // number = -number; // x = *(((byte*) &number)+3) ^ 0x80; // x = -number; // GAIN = factor 2 // s = *(((uint8_t*) &number)+3) & 0x80; // s = sign(number); // if ( *(((byte*) &number)+3) & 0x80) x=2; // if (number < 0) x=2; // GAIN = factor 5 // no speed optimize found for boolean IEEE_ZERO(float number) { return (* ((uint32_t*) &number) ) & 0x7FFFFFFF; } float IEEE_DIV2(float number) { IEEEfloat* x = (IEEEfloat*) ((void*)&number); x->e--; return number; } bool IEEE_LESS(float f, float g) { IEEEfloat* x = (IEEEfloat*) ((void*)&f); IEEEfloat* y = (IEEEfloat*) ((void*)&g); if (x->s > y->s) return 1; if (x->s < y->s) return 0; if (x->e < y->e) return 1; if (x->e > y->e) return 0; if (x->m < y->m) return 1; return 0; } bool IEEE_EQ(float f, float g) { IEEEfloat* x = (IEEEfloat*) ((void*)&f); IEEEfloat* y = (IEEEfloat*) ((void*)&g); return (x->m == y->m) && (x->e == y->e) && (x->s != y->s); } */ // -- END OF FILE --